Hi i used to following code to exctract the text(as a string) from the following insurance contract:
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
def extract_text_by_page(pdf_path):
with open(pdf_path, 'rb') as fh:
for page in PDFPage.get_pages(fh,
caching=True,
check_extractable=False):
resource_manager = PDFResourceManager()
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager,
fake_file_handle)
page_interpreter = PDFPageInterpreter(resource_manager,
converter)
page_interpreter.process_page(page)
text = fake_file_handle.getvalue()
yield text
# close open handles
converter.close()
fake_file_handle.close()
def extract_text(pdf_path):
text = ""
for page in extract_text_by_page(pdf_path):
#print(page)
text= page " " text
return text
# Driver code
if __name__ == '__main__':
text=extract_text('document.pdf')
print(text)
i would like to extract the following values highlithed in red and the corralated value(the ones highlighted in blue)
some exmaple outputs:
print(oneri_di_registrazione_atti_giudiziari)
"Valore in lite minimo 300 Limite di indennizzo 500" (as string)
print(tutela_dati_personali)
"Massimale per evento 25000" (as string)
The red part will not change if i change file but the blue part could so would like to link the numeric values to their red counterpart does anyone know how? I also share the raw string i exctracted if can help
Valoreinliteminimoeuro300,00OneridiregistrazionediattigiudiziariLimitediindennizzoeuro500,00PerlagaranziaATTIVITA'AZIENDALECOMPLETAMassimalepereventoeuro50.000,00SpeseperunsecondolegaledomiciliatarioLimitediindennizzoeuro2.000,00ControversiecontrattualiconifornitoriValoreinlitemassimoEuro50.000,00ControversiecontrattualiconifornitoriScoperto20%PerlagaranziaSALUTEESICUREZZASULLAVOROMassimalepereventoeuro25.000,00PerlagaranziaTUTELADEIDATIPERSONALIMassimalepereventoeuro25.000,00
comment if you need any more information thanks in advance to anyone who is able to solve it
CodePudding user response:
disclaimer: I am the author of borb
, the library used in this answer
I describe a scenario similar to yours in the examples repository of the library. For the sake of completeness I'll repeat the answer here.
#!chapter_005/src/snippet_008.py
import typing
from decimal import Decimal
from borb.pdf.canvas.geometry.rectangle import Rectangle
from borb.pdf import Document
from borb.pdf import PDF
from borb.toolkit import LocationFilter
from borb.toolkit import RegularExpressionTextExtraction
from borb.toolkit import PDFMatch
from borb.toolkit import SimpleTextExtraction
def main():
# set up RegularExpressionTextExtraction
# fmt: off
l0: RegularExpressionTextExtraction = RegularExpressionTextExtraction("[nN]isi .* aliquip")
# fmt: on
# process Document
doc: typing.Optional[Document] = None
with open("output.pdf", "rb") as in_file_handle:
doc = PDF.loads(in_file_handle, [l0])
assert doc is not None
# find match
m: typing.Optional[PDFMatch] = next(iter(l0.get_matches_for_page(0)), None)
assert m is not None
# get page width
w: Decimal = doc.get_page(0).get_page_info().get_width()
# change rectangle to get more text
r0: Rectangle = m.get_bounding_boxes()[0]
r1: Rectangle = Rectangle(
r0.get_x() r0.get_width(), r0.get_y(), w - r0.get_x(), r0.get_height()
)
# process document (again) filtering by rectangle
l1: LocationFilter = LocationFilter(r1)
l2: SimpleTextExtraction = SimpleTextExtraction()
l1.add_listener(l2)
doc: typing.Optional[Document] = None
with open("output.pdf", "rb") as in_file_handle:
doc = PDF.loads(in_file_handle, [l1])
assert doc is not None
# get text
print(l2.get_text_for_page(0))
if __name__ == "__main__":
main()
The idea is that you use RegularExpressionTextExtraction
to find text inside the PDF. This class can then return a list of PDFMatch
objects, which contain the bounding boxes of the matching text.
You can then do something with those Rectangle
objects (in your case, move them to the rightmost side of the Page
) and extract the text from the PDF on those given Rectangle
(s).