Please don't judge too harshly, I'm a self-taught beginner)))
Please help me figure out how to split a PDF page into four quarters. I have learned how to do the split both with PyPDF2 and with PyMuPDF (fitz). However, it often happens that the source page has text in only one quarter, yet all 4 quarters are written to the new file: the one with text and the three empty ones. I need the empty quarters not to be saved. I wanted to add some kind of check, but I could not get it to work for lack of knowledge. I also tried reading the newly written file and deleting the empty pages, but text extraction reports text on every page, even on the empty ones; when I open the file in Acrobat Reader those pages are blank, and I don't understand how that is possible.
Here is my code, in case it helps to see how and what I am doing: https://paste.aiogram.dev/opiquhehus.py
This is my first time posting here and I don't know how to attach files. Example PDF files are available in the Telegram channel: https://t.me/ Tq7WpP1ImcjQXSZF.
import copy
import logging
import random
from pathlib import Path
import PyPDF2
import fitz
from PyPDF2.filters import decodeStreamData, ASCII85Decode
from PyPDF2.generic import EncodedStreamObject, DecodedStreamObject
def from_a4_to_a6_not_sync(input_file, output_file):
    """Write the two upper quarters of a PDF's first page to a new PDF.

    Each output page is a deep copy of page 0 of ``input_file`` whose
    MediaBox has been shrunk to one quadrant (PyPDF2 legacy API).  Only the
    upper-left and upper-right quarters are written, in that order.

    Shrinking the MediaBox changes the visible region only; the copied page
    still carries the complete content stream of the original page, so text
    extraction on a "blank-looking" quarter can still return text.

    Args:
        input_file: ``pathlib.Path`` of the source PDF (``.absolute()`` is
            called on it).
        output_file: Path of the PDF to create; opened with ``open(..., 'wb')``.
    """
    pdf_reader = PyPDF2.PdfFileReader(str(input_file.absolute()))
    first_page = pdf_reader.getPage(0)

    # Independent copies, so that editing one MediaBox does not affect the
    # source page or the other quarter.
    left_up_side = copy.deepcopy(first_page)
    right_up_side = copy.deepcopy(first_page)

    # Half of the upper-right corner coordinates.
    # NOTE(review): this is the page centre only when the MediaBox starts at
    # (0, 0) - confirm for the input files in use.
    first_coord = first_page.mediaBox.upperRight[0]
    second_coord = first_page.mediaBox.upperRight[1]
    centre = (first_coord / 2, second_coord / 2)

    left_up_side.mediaBox.lowerRight = centre  # upper-left quarter
    right_up_side.mediaBox.lowerLeft = centre  # upper-right quarter

    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_writer.addPage(left_up_side)
    pdf_writer.addPage(right_up_side)

    # The context manager closes the file; no explicit close() is needed.
    with open(output_file, 'wb') as file:
        pdf_writer.write(file)
def fitz_four_piaces(input_file, output_file):
    """Split every page of a PDF into a 2 x 2 grid and save all quarters.

    For each input page four output pages are created, in the order
    top-left, top-right, bottom-left, bottom-right.  Every quarter is
    written, including quarters that show no content.

    Args:
        input_file: ``pathlib.Path`` of the source PDF (``.absolute()`` is
            called on it).
        output_file: Destination passed unchanged to ``Document.save``.
    """
    src = fitz.open(str(input_file.absolute()))
    doc = fitz.open()  # empty output PDF
    try:
        for spage in src:  # for each page in input
            r = spage.rect  # input page rectangle
            # CropBox displacement, non-zero when the CropBox does not
            # start at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2  # top left rect
            r2 = r1 + (r1.width, 0, r1.width, 0)  # top right rect
            r3 = r1 + (0, r1.height, 0, r1.height)  # bottom left rect
            r4 = fitz.Rect(r1.br, r.br)  # bottom right rect

            for quarter in (r1, r2, r3, r4):
                # Shift the quarter by the CropBox displacement; replacing
                # it with `d` would yield a zero-size clip.
                clip = quarter + d
                page = doc.new_page(
                    -1,  # append a new output page with the clip's size
                    width=clip.width,
                    height=clip.height,
                )
                page.show_pdf_page(
                    page.rect,  # fill the whole new page
                    src,  # input document
                    spage.number,  # input page number
                    clip=clip,  # which part of the input page to use
                )

        doc.save(
            output_file,
            garbage=3,  # eliminate duplicate objects
            deflate=True,  # compress stuff where possible
        )
    finally:
        doc.close()
        src.close()
def fitz_four_piaces_read(input_file):
    """Print the page count of a PDF, then the plain text of each page.

    Args:
        input_file: ``pathlib.Path`` of the PDF to inspect.
    """
    # The local names `src` and `page` are part of the printed output
    # because of the f-string `=` specifier, so they must not be renamed.
    src = fitz.open(str(input_file.absolute()))
    print(f'{src.page_count=}')
    for page in src:
        print(f'{page.get_text("text")=}')
# Driver script: split MAKETS/up_lef.pdf into quarters, then dump the text
# of the resulting file.
destination = Path("MAKETS")
destination.mkdir(parents=True, exist_ok=True)

# Input layout; other samples follow the pattern
# up_lef_up_rig_low_lef_low_rig.
destination_input = destination / 'up_lef.pdf'

# Two random numbers in the name keep successive runs from overwriting
# each other's output (collisions remain possible).
destination_output = destination / 'output_a6_{}_{}.pdf'.format(
    random.randint(1, 100), random.randint(1, 200))

# from_a4_to_a6_not_sync(destination_input, destination_output)
fitz_four_piaces(destination_input, destination_output)
fitz_four_piaces_read(destination_output)
CodePudding user response:
Solution found! After dividing the page into 4 parts, render each resulting page to an image and check the image's file size: a blank page produces a much smaller PNG than a page with content. I will share the code, maybe it will be useful to someone)
import os
import fitz
def get_size(filename):
    """Return the size in bytes of the file at ``filename``."""
    return os.stat(filename).st_size
async def from_a4_to_a6(input_file, output_file, blank_png_max_bytes=1300):
    """Split each PDF page into a 2 x 2 grid, keeping only non-blank quarters.

    Quarters are processed in the order top-left, top-right, bottom-left,
    bottom-right.  Each quarter is placed on a new output page, rendered to
    a PNG in memory, and the page is dropped again when the encoded PNG is
    smaller than ``blank_png_max_bytes``.  All four quarters of every page
    are examined; a blank quarter does not stop processing of the others.

    The coroutine contains no ``await``; it runs to completion in one step.

    Args:
        input_file: ``pathlib.Path`` of the source PDF (``.absolute()`` is
            called on it).
        output_file: Destination passed unchanged to ``Document.save``.
        blank_png_max_bytes: PNG size in bytes below which a quarter is
            treated as blank.  The default of 1300 comes from an observed
            blank A6 render of roughly 1209 bytes; other page sizes may
            need a different value, so measure first.

    NOTE(review): if every quarter is dropped the output document has no
    pages and ``Document.save`` is expected to raise - confirm the desired
    handling with the caller.
    """
    src = fitz.open(str(input_file.absolute()))
    doc = fitz.open()  # empty output PDF
    try:
        for spage in src:  # for each page in input
            r = spage.rect  # input page rectangle
            # CropBox displacement, non-zero when the CropBox does not
            # start at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2  # top left rect
            r2 = r1 + (r1.width, 0, r1.width, 0)  # top right rect
            r3 = r1 + (0, r1.height, 0, r1.height)  # bottom left rect
            r4 = fitz.Rect(r1.br, r.br)  # bottom right rect

            for quarter in (r1, r2, r3, r4):
                # Shift the quarter by the CropBox displacement; replacing
                # it with `d` would yield a zero-size clip.
                clip = quarter + d
                page = doc.new_page(
                    -1,  # append a new output page with the clip's size
                    width=clip.width,
                    height=clip.height,
                )
                page.show_pdf_page(
                    page.rect,  # fill the whole new page
                    src,  # input document
                    spage.number,  # input page number
                    clip=clip,  # which part of the input page to use
                )

                # Encode the render in memory rather than writing a
                # fixed-name PNG to the working directory, which could
                # collide between concurrent calls.
                png_size = len(page.get_pixmap().tobytes("png"))
                if png_size < blank_png_max_bytes:
                    # Drop the page just appended and carry on with the
                    # remaining quarters (they may still hold content).
                    doc.delete_page(pno=-1)

        doc.save(
            output_file,
            garbage=4,  # eliminate duplicate objects
            clean=True,
            deflate=True,  # compress stuff where possible
        )
    finally:
        doc.close()
        src.close()