Split pdf from A4 into A6 quarters and don't save empty quarters-CodePudding

Do not judge strictly, I'm a self-taught beginner)))

Please help me figure out how to split the pages. I have tried both PyPDF2 and PyMuPDF (fitz). When splitting, it often happens that only one quarter contains text, yet all four quarters are written to the new file — the one with text and the three empty ones. I need the empty quarters not to be saved. I wanted to add a check for that, but could not make it work for lack of knowledge. I also tried reading the newly written file and deleting the empty pages, but text is reported on every page, even the empty ones; when I open the file in Acrobat Reader those pages are blank, and I don't understand how that is possible.

Here is my code just in case how and what I do: https://paste.aiogram.dev/opiquhehus.py

This is my first time posting here and I don't know how to attach files. Example PDF files are available in the Telegram channel: https://t.me/ Tq7WpP1ImcjQXSZF.

import copy
import logging
import random
from pathlib import Path

import PyPDF2
import fitz
from PyPDF2.filters import decodeStreamData, ASCII85Decode
from PyPDF2.generic import EncodedStreamObject, DecodedStreamObject


def from_a4_to_a6_not_sync(input_file, output_file):
    """Split the first page of a PDF into four quarter pages using PyPDF2.

    The first page of ``input_file`` is deep-copied four times and the media
    box of each copy is shrunk to one quadrant of the original page. The four
    copies are written to ``output_file`` in the order upper-left,
    upper-right, lower-left, lower-right.

    Only page 0 of the input is processed. Only the media box of each copy
    is changed; the page content itself is left untouched.

    Args:
        input_file: Path-like object providing ``absolute()``; the source PDF.
        output_file: Destination passed to ``open(..., 'wb')``.
    """
    pdf_reader = PyPDF2.PdfFileReader(str(input_file.absolute()))
    first_page = pdf_reader.getPage(0)

    # Centre of the page, derived from the upper-right corner alone.
    # NOTE(review): this is the true centre only when the media box starts at
    # (0, 0) - confirm for inputs with a shifted media box.
    page_width = first_page.mediaBox.upperRight[0]
    page_height = first_page.mediaBox.upperRight[1]
    centre = (page_width / 2, page_height / 2)

    # Each quarter is an independent copy so that shrinking one media box does
    # not affect the others (or the page held by the reader).
    left_up_side = copy.deepcopy(first_page)
    right_up_side = copy.deepcopy(first_page)
    left_down_side = copy.deepcopy(first_page)
    right_down_side = copy.deepcopy(first_page)

    # Move the corner diagonally opposite to the kept quadrant onto the centre.
    left_up_side.mediaBox.lowerRight = centre  # upper-left quarter
    right_up_side.mediaBox.lowerLeft = centre  # upper-right quarter
    left_down_side.mediaBox.upperRight = centre  # lower-left quarter
    right_down_side.mediaBox.upperLeft = centre  # lower-right quarter

    pdf_writer = PyPDF2.PdfFileWriter()
    # Previously only the two upper quarters were added, although all four
    # were prepared; emit every quarter.
    for quarter in (left_up_side, right_up_side,
                    left_down_side, right_down_side):
        pdf_writer.addPage(quarter)

    # The context manager closes the file; no explicit close() is needed.
    with open(output_file, 'wb') as file:
        pdf_writer.write(file)


def fitz_four_piaces(input_file, output_file):
    """Cut every page of a PDF into 2 x 2 quarters with PyMuPDF.

    For each input page four new pages are appended to a fresh document, in
    the order top-left, top-right, bottom-left, bottom-right. Each new page
    has the size of one quarter and shows the matching clip of the source
    page. Every quarter is written, whether or not it shows anything.

    Args:
        input_file: Path-like object providing ``absolute()``; the source PDF.
        output_file: Destination handed to ``Document.save``.
    """
    src = fitz.open(str(input_file.absolute()))
    doc = fitz.open()  # empty output PDF
    try:
        for spage in src:  # for each page in input
            r = spage.rect  # input page rectangle
            # CropBox displacement, needed when the CropBox does not start
            # at (0, 0); expressed as a rect so it can be added to a clip.
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            # Cut the input page into 2 x 2 parts. The "+" operators had been
            # lost, which turned the shifts into calls on a Rect.
            r1 = r / 2  # top left rect
            r2 = r1 + (r1.width, 0, r1.width, 0)  # top right rect
            r3 = r1 + (0, r1.height, 0, r1.height)  # bottom left rect
            r4 = fitz.Rect(r1.br, r.br)  # bottom right rect

            for rx in (r1, r2, r3, r4):
                rx += d  # add the CropBox displacement
                page = doc.new_page(-1,  # new output page with rx dimensions
                                    width=rx.width,
                                    height=rx.height)
                page.show_pdf_page(
                    page.rect,  # fill all of the new page with the clip
                    src,  # input document
                    spage.number,  # input page number
                    clip=rx,  # which part of the input page to use
                )

        doc.save(output_file,
                 garbage=3,  # eliminate duplicate objects
                 deflate=True,  # compress stuff where possible
                 )
    finally:
        # Release both documents even if splitting or saving fails.
        doc.close()
        src.close()


def fitz_four_piaces_read(input_file):
    """Print the page count and the plain text of every page of a PDF.

    Args:
        input_file: Path-like object providing ``absolute()``; the PDF to
            inspect.
    """
    # The f-strings below use the "=" specifier, which echoes the expression
    # text itself, so the names ``src`` and ``page`` are part of the output.
    src = fitz.open(str(input_file.absolute()))
    print(f'{src.page_count=}')
    for page in src:
        # One line of debug output per page: the extracted "text" layer.
        print(f'{page.get_text("text")=}')


# Working folder for the sample input and the generated output; created on
# demand so the script can run from a fresh checkout.
destination = Path("MAKETS")
destination.mkdir(parents=True, exist_ok=True)

# Sample input file (original author's note: up_lef_up_rig_low_lef_low_rig).
destination_input = destination / f'up_lef.pdf'

# Output name carries two random numbers so repeated runs rarely overwrite
# each other's results.
destination_output = destination / (
    f'output_a6_{random.randint(1, 100)}_{random.randint(1, 200)}.pdf')

# from_a4_to_a6_not_sync(destination_input, destination_output)

# Split the sample with PyMuPDF, then dump what ended up in the result.
fitz_four_piaces(destination_input, destination_output)
fitz_four_piaces_read(destination_output)

CodePudding user response：

Solution found! After dividing the page into 4 parts, convert each resulting page into an image and compare the size of the image file with that of a blank page. I'm sharing the code in case it is useful to someone.

import os

import fitz


def get_size(filename):
    """Return the size of ``filename`` on disk, in bytes."""
    return os.path.getsize(filename)


async def from_a4_to_a6(input_file, output_file, blank_png_size=1300):
    """Cut every page of a PDF into 2 x 2 quarters, dropping blank quarters.

    Each quarter is appended to a new document as its own page. It is then
    rendered to a temporary PNG in the current working directory, and the
    quarter is removed again when that PNG is smaller than
    ``blank_png_size`` bytes.

    The coroutine contains no ``await``; all work runs synchronously once it
    is awaited.

    Args:
        input_file: Path-like object providing ``absolute()``; the source PDF.
        output_file: Destination handed to ``Document.save``.
        blank_png_size: PNG size in bytes below which a quarter counts as
            blank. The original author measured roughly 1209 bytes for a
            blank A6 quarter; other page sizes may need another value, so
            measure first.
    """
    src = fitz.open(str(input_file.absolute()))
    doc = fitz.open()  # empty output PDF
    try:
        for spage in src:  # for each page in input
            r = spage.rect  # input page rectangle
            # CropBox displacement, needed when the CropBox does not start
            # at (0, 0); expressed as a rect so it can be added to a clip.
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            # Cut the input page into 2 x 2 parts. The "+" operators had been
            # lost, which turned the shifts into calls on a Rect.
            r1 = r / 2  # top left rect
            r2 = r1 + (r1.width, 0, r1.width, 0)  # top right rect
            r3 = r1 + (0, r1.height, 0, r1.height)  # bottom left rect
            r4 = fitz.Rect(r1.br, r.br)  # bottom right rect

            for rx in (r1, r2, r3, r4):
                rx += d  # add the CropBox displacement
                page = doc.new_page(-1,  # new output page with rx dimensions
                                    width=rx.width,
                                    height=rx.height)
                page.show_pdf_page(
                    page.rect,  # fill all of the new page with the clip
                    src,  # input document
                    spage.number,  # input page number
                    clip=rx,  # which part of the input page to use
                )

                # Render the quarter and use the PNG file size as a
                # "is there anything on it" heuristic.
                pix = page.get_pixmap()
                name_png = f"page-{page.number}.png"
                pix.save(name_png)
                try:
                    imgsize = get_size(name_png)
                finally:
                    os.remove(name_png)  # never leave the scratch image behind

                if imgsize < blank_png_size:
                    # Drop the page just appended. Keep going with the other
                    # quarters: a ``break`` here used to discard every
                    # remaining quarter of the page, blank or not.
                    doc.delete_page(pno=-1)

        # NOTE(review): if every quarter is judged blank the document has no
        # pages at this point - confirm how save() should behave then.
        doc.save(output_file,
                 garbage=4,  # eliminate duplicate objects
                 clean=True,
                 deflate=True,  # compress stuff where possible
                 )
    finally:
        # Release both documents even if splitting or saving fails.
        doc.close()
        src.close()