For a work project, I am trying to extract the following information from every PDF file in a folder (the PDF files are CVs): email address, first name, and last name.
I have successfully managed to extract Email Addresses using this code:
from io import StringIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
import subprocess
from subprocess import call
import os
import re
working_directory = './folder'

# First e-mail-like token: local part, "@", domain part.  Both parts need the
# "+" quantifier (one or more characters).  IGNORECASE lets addresses whose
# domain contains capitals (e.g. "Example.COM") match as well.
EMAIL_PATTERN = re.compile(r'[\w\.-]+@[a-z0-9\.-]+', re.IGNORECASE)


def list_pdfs(directory):
    """Return the PDF files found under *directory*, searching recursively.

    Each entry is a path relative to *directory*, so a file sitting directly
    in the folder is listed by its bare name while a file in a subfolder
    keeps the subfolder prefix needed to open it later.  A missing directory
    yields an empty list.
    """
    found = []
    for subdir, _dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                full_path = os.path.join(subdir, file)
                found.append(os.path.relpath(full_path, directory))
    return found


def extract_text(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    The file, the converter and the output buffer are released even when
    parsing raises, so one broken PDF does not leak handles.
    """
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(pdf_path, 'rb') as infile:
                # An empty page-number set applies no page filter.
                for page in PDFPage.get_pages(infile, set()):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()


def find_email(text):
    """Return the first e-mail address found in *text*, or None if absent."""
    match = EMAIL_PATTERN.search(text)
    return match.group(0) if match else None


file_list = list_pdfs(working_directory)  # PDF paths relative to the folder
email_list = {}  # maps each PDF path to the e-mail address found in it

for input_file in file_list:
    text = extract_text(os.path.join(working_directory, input_file))
    email = find_email(text)
    if email is not None:
        email_list[input_file] = email
        print(email)

# Bare expression: echoes the collected mapping in a notebook or REPL.
email_list
However, I am having trouble extracting the first and last names. Any help would be appreciated!
CodePudding user response:
You can find the email addresses because they follow a recognizable pattern that a regular expression can match:
# "+" after each character class means "one or more", so the whole local part
# and the whole domain are captured rather than a single character.
match = re.search(r'[\w\.-]+@[a-z0-9\.-]+', text)
For first and last names, however, you also have to work out a rule that identifies them in the text of your PDF files.
Maybe the name appears in a specific field, for example right after "Dear,".