PDF range split-CodePudding

I am trying to split a PDF file by finding a key word in the text, then taking the page the key word is on plus the following 4 pages (5 pages in total) and writing them to their own PDF, so the new PDF contains only those 5 pages. The loop should then continue and find the key word again, because it is repeated further down the original PDF an unknown number of times, each time taking that page plus the 4 after it and writing them to their own PDF.

Example: on the first loop the key word is found on page 7, so I need page 7 and also pages 8-11, and pages 7-11 are put into one PDF file. On the next loop the key word is found on page 12, so I need page 12 and pages 13-16, and pages 12-16 are split into their own PDF. At this point 2 separate PDFs have been created.

The code below finds the key word and writes that page to its own PDF file, but it only handles that one page; I am not sure how to include the rest of the range.

import os

from PyPDF2 import PdfFileReader, PdfFileWriter

# Write every page that contains the key phrase to its own single-page PDF.
path = "example.pdf"
fname = os.path.basename(path)
reader = PdfFileReader(path)
for page_number in range(reader.getNumPages()):
    # Fetch the page once and reuse it for text extraction and for output.
    page = reader.getPage(page_number)
    # Newlines are removed from the extracted text before searching.
    text_stripped = page.extractText().replace("\n", "")

    print(text_stripped)

    if "Disregarded Branch" in text_stripped:
        # Only build a writer for pages that are actually written out.
        writer = PdfFileWriter()
        writer.addPage(page)

        # page_number is 0-based; the file name uses a 1-based page number.
        output_filename = f"{fname}_page_{page_number + 1}.pdf"

        with open(output_filename, "wb") as out:
            writer.write(out)

        print(f"Created: {output_filename}")

CodePudding user response：

disclaimer: I am the author of borb, the library used in this answer.

I think your question comes down to 2 common functionalities:

find the location of a given piece of text
merge/split/extract pages from a PDF

For the first part, there is a good tutorial in the examples repo. You can find it here. I'll repeat one of the examples here for completeness.

import typing
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction


def main():
    """Load ``output.pdf`` and print the text extracted from its first page."""
    # The extractor is passed to PDF.loads together with the file handle.
    extractor: SimpleTextExtraction = SimpleTextExtraction()
    document: typing.Optional[Document] = None
    with open("output.pdf", "rb") as pdf_handle:
        document = PDF.loads(pdf_handle, [extractor])

    # Stop here if loading did not produce a Document.
    assert document is not None

    # Index 0 is the first page.
    print(extractor.get_text_for_page(0))


if __name__ == "__main__":
    main()

This example extracts all the text from page 0 of the PDF. Of course, you could simply iterate over all pages and check whether a given page contains the keyword you're looking for.

For the second part, you can find a good example in the examples repository. This is the link. This example (and subsequent example) takes you through the basics of frankensteining a PDF from various sources.

The example I copy/paste here will show you how to build a PDF by alternately picking a page from input document 1 and input document 2.

import typing
from borb.pdf.document.document import Document
from borb.pdf.pdf import PDF

import typing
from decimal import Decimal

from borb.pdf.document.document import Document
from borb.pdf.page.page import Page
from borb.pdf.pdf import PDF


def main():
    """Build ``output_003.pdf`` by alternating pages from two input PDFs.

    For page indices 0 through 9, even indices are taken from
    ``output_001.pdf`` and odd indices from ``output_002.pdf``.
    """
    # Load both inputs. No placeholder Document is created beforehand,
    # because the value returned by PDF.loads is the one that gets used.
    with open("output_001.pdf", "rb") as pdf_file_handle:
        doc_001 = PDF.loads(pdf_file_handle)

    with open("output_002.pdf", "rb") as pdf_file_handle:
        doc_002 = PDF.loads(pdf_file_handle)

    # create new document
    d: Document = Document()
    # NOTE(review): the fixed range presumes both inputs have at least
    # 10 pages - confirm against the actual input files.
    for i in range(10):
        source = doc_001 if i % 2 == 0 else doc_002
        d.append_page(source.get_page(i))

    # write
    with open("output_003.pdf", "wb") as pdf_file_handle:
        PDF.dumps(pdf_file_handle, d)


if __name__ == "__main__":
    main()

CodePudding user response：

You've almost got it!

import os

from PyPDF2 import PdfFileReader, PdfFileWriter


def create_4page_pdf(base_pdf_path, start, page_count=5):
    """Copy a run of consecutive pages from a PDF into a new PDF file.

    The output file is named ``<basename>_page_<start + 1>.pdf`` and is
    opened relative to the current working directory.

    Args:
        base_pdf_path: Path of the PDF to copy pages from.
        start: Zero-based index of the first page to copy (the page on
            which the key word was found).
        page_count: Total number of pages to copy, counting ``start``
            itself. Defaults to 5: the matching page plus the 4 pages
            that follow it.
    """
    reader = PdfFileReader(base_pdf_path)
    writer = PdfFileWriter()

    # Clamp the end so a match near the end of the document copies only
    # the pages that exist instead of indexing past the last page.
    stop = min(start + page_count, len(reader.pages))
    for index in range(start, stop):
        writer.addPage(reader.pages[index])

    fname = os.path.basename(base_pdf_path)
    # ``start`` is 0-based; the file name uses a 1-based page number.
    output_filename = f"{fname}_page_{start + 1}.pdf"
    with open(output_filename, "wb") as out:
        writer.write(out)
    print(f"Created: {output_filename}")


def main(base_pdf_path="example.pdf"):
    """Scan a PDF and split out a new PDF at each page containing the key phrase.

    Args:
        base_pdf_path: Path of the PDF to scan. Defaults to ``"example.pdf"``.
    """
    # The argument is used as given; it is no longer overwritten by a
    # hard-coded path, so callers can point this at any file.
    reader = PdfFileReader(base_pdf_path)
    for page_number, page in enumerate(reader.pages):
        # Newlines are removed from the extracted text before searching.
        text_stripped = page.extractText().replace("\n", "")
        print(text_stripped)
        if "Disregarded Branch" in text_stripped:
            create_4page_pdf(base_pdf_path, page_number)


if __name__ == "__main__":
    main()