所以我需要建立一个从pdf文件(简历)(OCR)中提取数据的模型,所以我收集了一堆pdf文件,我需要将它们转换为适合OCR的形式,我迷失了方向。我试着用Regex把它们转换成csv文件,但没有用。csv文件是空的,如果你们能帮我做什么,那就太好了。提前谢谢!
这是我的代码:
import pdfplumber
import os
import re
import pandas as pd
def clean_text(text):
cleaned_text = re.sub(r"[^a-zA-Z0-9\s@.-]", "", text) # Remove special characters except alphanumeric, @, and -
cleaned_text = re.sub(r"\s+", " ", cleaned_text) # Remove extra spaces
return cleaned_text.strip()
def extract_data_from_resume(pdf_path):
with pdfplumber.open(pdf_path) as pdf:
text = ""
for page in pdf.pages:
text += page.extract_text()
cleaned_text = clean_text(text)
name_pattern = r"Name:\s*(.*)"
name_match = re.search(name_pattern, cleaned_text, re.IGNORECASE)
name = name_match.group(1) if name_match else ""
email_pattern = r"Email:\s*([^\s@]+@[^\s@]+\.[^\s@]+)"
email_match = re.search(email_pattern, cleaned_text, re.IGNORECASE)
email = email_match.group(1) if email_match else ""
phone_pattern = r"Phone:\s*(.*)"
phone_match = re.search(phone_pattern, cleaned_text, re.IGNORECASE)
phone = phone_match.group(1) if phone_match else ""
return name, email, phone
def convert_pdf_to_csv(directory, output_csv_path):
data = []
for filename in os.listdir(directory):
if filename.endswith(".pdf"):
pdf_path = os.path.join(directory, filename)
name, email, phone = extract_data_from_resume(pdf_path)
data.append({"Name": name, "Email": email, "Phone": phone})
df = pd.DataFrame(data)
df.to_csv(output_csv_path, index=False)
# Example usage
directory = 'D:/data/test'
output_csv_path = 'D:/data/test/test.csv'
convert_pdf_to_csv(directory, output_csv_path)