How to exctract text from a PDF and manipulate it (python)-CodePudding

Hi i used to following code to exctract the text(as a string) from the following insurance contract:

    import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage



def extract_text_by_page(pdf_path):
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=False):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()

            converter = TextConverter(resource_manager,
                                      fake_file_handle)

            page_interpreter = PDFPageInterpreter(resource_manager,
                                                  converter)

            page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()

            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()


def extract_text(pdf_path):
    text = ""
    for page in extract_text_by_page(pdf_path):
        #print(page)
            text= page " " text
        return text
        # Driver code
    
    
if __name__ == '__main__':
    text=extract_text('document.pdf')
    print(text)

i would like to extract the following values highlithed in red and the corralated value(the ones highlighted in blue)

some exmaple outputs:

print(oneri_di_registrazione_atti_giudiziari)

"Valore in lite minimo 300 Limite di indennizzo 500" (as string)

print(tutela_dati_personali)

"Massimale per evento 25000" (as string)

The red part will not change if i change file but the blue part could so would like to link the numeric values to their red counterpart does anyone know how? I also share the raw string i exctracted if can help

Valoreinliteminimoeuro300,00OneridiregistrazionediattigiudiziariLimitediindennizzoeuro500,00PerlagaranziaATTIVITA'AZIENDALECOMPLETAMassimalepereventoeuro50.000,00SpeseperunsecondolegaledomiciliatarioLimitediindennizzoeuro2.000,00ControversiecontrattualiconifornitoriValoreinlitemassimoEuro50.000,00ControversiecontrattualiconifornitoriScoperto20%PerlagaranziaSALUTEESICUREZZASULLAVOROMassimalepereventoeuro25.000,00PerlagaranziaTUTELADEIDATIPERSONALIMassimalepereventoeuro25.000,00

comment if you need any more information thanks in advance to anyone who is able to solve it

CodePudding user response：

disclaimer: I am the author of borb, the library used in this answer

I describe a scenario similar to yours in the examples repository of the library. For the sake of completeness I'll repeat the answer here.

#!chapter_005/src/snippet_008.py
import typing
from decimal import Decimal

from borb.pdf.canvas.geometry.rectangle import Rectangle
from borb.pdf import Document
from borb.pdf import PDF
from borb.toolkit import LocationFilter
from borb.toolkit import RegularExpressionTextExtraction
from borb.toolkit import PDFMatch
from borb.toolkit import SimpleTextExtraction


def main():

    # set up RegularExpressionTextExtraction
    # fmt: off
    l0: RegularExpressionTextExtraction = RegularExpressionTextExtraction("[nN]isi .* aliquip")
    # fmt: on

    # process Document
    doc: typing.Optional[Document] = None
    with open("output.pdf", "rb") as in_file_handle:
        doc = PDF.loads(in_file_handle, [l0])
    assert doc is not None

    # find match
    m: typing.Optional[PDFMatch] = next(iter(l0.get_matches_for_page(0)), None)
    assert m is not None

    # get page width
    w: Decimal = doc.get_page(0).get_page_info().get_width()

    # change rectangle to get more text
    r0: Rectangle = m.get_bounding_boxes()[0]
    r1: Rectangle = Rectangle(
        r0.get_x()   r0.get_width(), r0.get_y(), w - r0.get_x(), r0.get_height()
    )

    # process document (again) filtering by rectangle
    l1: LocationFilter = LocationFilter(r1)
    l2: SimpleTextExtraction = SimpleTextExtraction()
    l1.add_listener(l2)
    doc: typing.Optional[Document] = None
    with open("output.pdf", "rb") as in_file_handle:
        doc = PDF.loads(in_file_handle, [l1])
    assert doc is not None

    # get text
    print(l2.get_text_for_page(0))


if __name__ == "__main__":
    main()

The idea is that you use RegularExpressionTextExtraction to find text inside the PDF. This class can then return a list of PDFMatch objects, which contain the bounding boxes of the matching text.

You can then do something with those Rectangle objects (in your case, move them to the rightmost side of the Page) and extract the text from the PDF on those given Rectangle(s).