I have written a code to convert multiple pdf files to .txt file. The code works pretty fine but the major issue I am having is that while having an extension I am getting double extension meaning "companyA.pdf" to "companyA.pdf.txt". I am not really sure where I am making a mistake. Following is the code :
'''
import os
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
pdf_folder_path = os.getcwd() #Get the path of the current folder
text_folder_path = os.getcwd() '/' 'text_folder' #Notation of path is mac specification. For windows'/'To'\'Correct to.
os.makedirs(text_folder_path, exist_ok=True)
pdf_file_name = os.listdir(pdf_folder_path)
#name is a PDF file (ends.pdf) returns TRUE, otherwise FALSE is returned.
def pdf_checker(name):
pdf_regex = re.compile(r'. \.pdf')
if pdf_regex.search(str(name)):
return True
else:
return False
#Convert PDF to text file
def convert_pdf_to_txt(path, txtname, buf=True):
rsrcmgr = PDFResourceManager()
if buf:
outfp = StringIO()
else:
outfp = file(txtname, 'w')
codec = 'utf-8'
laparams = LAParams()
laparams.detect_vertical = True
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(fp):
interpreter.process_page(page)
fp.close()
device.close()
if buf:
text = outfp.getvalue()
make_new_text_file = open(text_folder_path '/' path '.txt', 'w')
make_new_text_file.write(text)
make_new_text_file.close()
outfp.close()
#Get the pdf file name in the folder and list it
for name in pdf_file_name:
if pdf_checker(name):
convert_pdf_to_txt(name, name '.txt') # pdf_Use checker and TRUE (end is.For pdf) proceed to conversion)
else:
pass #Pass if not a PDF file
'''
CodePudding user response:
What I recommend is running a regex to remove .pdf
when the name string ends with it, like so:
if pdf_checker(name):
newName = re.sub(r'\.pdf$', '.txt', name)
convert_pdf_to_txt(name, newName)
Then replace this line:
make_new_text_file = open(text_folder_path '/' path '.txt', 'w')
With the following:
make_new_text_file = open(text_folder_path '/' txtname, 'w')