Python UnicodeError Issue with PYPDF2-CodePudding

I wrote a script to crop PDFs if they were over 1300 pts in height. It works perfectly. However, I want to run it on multiple files in multiple directories. It runs fine for a while and then I get an error. However, if I run it on just the folder containing the file that throws the UnicodeError, the script runs fine. If I run it on all of the directories, I get an error. I'm not sure what to do, as I've tried everything I can think of.

The error is:

Traceback (most recent call last):
  File "L:\py_test\!pdferizer_recursive.py", line 17, in <module>
    input1 = PdfFileReader(in_f)
  File "C:\Users\xxx\AppData\Roaming\Python\Python310\site-packages\PyPDF2\_reader.py", line 1901, in __init__
    super().__init__(*args, **kwargs)
  File "C:\Users\xxx\AppData\Roaming\Python\Python310\site-packages\PyPDF2\_reader.py", line 274, in __init__
    self.read(stream)
  File "C:\Users\xxx\AppData\Roaming\Python\Python310\site-packages\PyPDF2\_reader.py", line 1331, in read
    self._basic_validation(stream)
  File "C:\Users\xxx\AppData\Roaming\Python\Python310\site-packages\PyPDF2\_reader.py", line 1378, in _basic_validation
    f"PDF starts with '{header_byte.decode('utf8')}', "
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte

My code looks like this:

from pathlib import Path
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import shutil
import fnmatch

# Root folder that is walked for PDFs to crop.
source_folder = r'L:\\py_test' + '\\'

for path, dirs, files in os.walk(source_folder):
  if dirs:
    for dir_names in dirs:
      folders = str(dir_names)
      file_names = os.listdir(os.path.join(path, folders))
      # Every entry in the folder is opened, whatever its extension, so any
      # non-PDF file is also passed to PdfFileReader.
      for files in file_names:
        with open(os.path.join(source_folder + folders, files), 'rb') as in_f:
          input1 = PdfFileReader(in_f)
          output = PdfFileWriter()

          numPages = input1.getNumPages()
          print ("document has %s pages." % numPages)

          for i in range(numPages):
            page = input1.getPage(i)
            print (page.mediaBox.getUpperRight_x(), page.mediaBox.getUpperRight_y(), page.mediaBox.getUpperLeft_x(), page.mediaBox.getLowerLeft_y())
            if page.mediaBox.getUpperRight_y() > 1300:
              # Pages taller than 1300 pts get a crop box whose y extent is
              # 1220 pts shorter than the media box.
              page.cropBox.upperRight = (page.mediaBox.getUpperRight_x(), (page.mediaBox.getUpperRight_y() - 1220))
              page.cropBox.lowerLeft = (0, page.mediaBox.getUpperRight_y())
              output.addPage(page)
              # The cropped copy is saved next to the original as "<name>_new.pdf".
              with open(os.path.join(source_folder + folders,files[:-4]) + "_new.pdf", "wb") as out_f:
                output.write(out_f)
                in_f.close()
            else:
              print (dir_names + "\\" + files + ", this document is under 1300 pts")
              in_f.close()

CodePudding user response：

Option 1:

It could be that a file in a particular folder doesn't actually contain UTF-8 encoded data but uses some other encoding. Figure out what that encoding is and pass it in the `open` call.

For example:

with open(os.path.join(source_folder + folders, files), 'rb', encoding='ISO 8859-1') as in_f:

Option 2:

If the error persists, you could set the `errors` keyword argument to `'ignore'` so that characters that cannot be decoded are skipped.

Note that ignoring characters that cannot be decoded can lead to data loss.

# set errors to ignore
with open(os.path.join(source_folder + folders, files), 'rb', encoding='ISO 8859-1', errors='ignore') as in_f:

CodePudding user response：

There were a few thumb.db files hidden in different directories. Added a check for just .pdf and it's golden now.

# Import Libraries
from pathlib import Path
from PyPDF4 import PdfFileReader, PdfFileWriter
import os
import shutil
import fnmatch

# Root folder that is walked for PDFs to crop.
source_folder = r'L:\\py_test' + '\\'


def crop_amount(width, height):
  """Return how many points to trim from a page's height, or None.

  Pages taller than 1300 pts are trimmed by 1220 pts. Pages taller than
  950 pts and narrower than 570 pts are trimmed by 850 pts. Any other page
  is left alone and None is returned.
  """
  if height > 1300:
    return 1220
  if height > 950 and width < 570:
    return 850
  return None


def crop_pdf(pdf_path, label):
  """Crop the oversized pages of one PDF and save them as "<name>_new.pdf".

  pdf_path: full path of the PDF to read; the output is written next to it.
  label: text identifying the document in the "under 1300 pts" message.

  Only pages that needed cropping are copied to the output, and no output
  file is created when no page needed cropping.
  """
  output = PdfFileWriter()
  cropped_pages = 0

  with open(pdf_path, 'rb') as in_f:
    input1 = PdfFileReader(in_f)
    numPages = input1.getNumPages()
    print("document has %s pages." % numPages)

    for i in range(numPages):
      page = input1.getPage(i)
      box = page.mediaBox
      width = box.getUpperRight_x()
      height = box.getUpperRight_y()
      print(width, height, box.getUpperLeft_x(), box.getLowerLeft_y())

      trim = crop_amount(width, height)
      if trim is None:
        print(label + ", this document is under 1300 pts")
        continue

      page.cropBox.upperRight = (width, height - trim)
      page.cropBox.lowerLeft = (0, height)
      output.addPage(page)
      cropped_pages += 1

    # Write once, after all pages are collected, and while the input file is
    # still open: the writer may still need to read page data from it.
    if cropped_pages:
      with open(pdf_path[:-4] + "_new.pdf", "wb") as out_f:
        output.write(out_f)


def main():
  """Walk source_folder and crop every PDF found in the folders below it."""
  for path, dirs, _ in os.walk(source_folder):
    for dir_name in dirs:
      # Join with the directory os.walk is currently in, so folders nested
      # more than one level deep resolve to the right location.
      folder_path = os.path.join(path, dir_name)
      for file_name in os.listdir(folder_path):
        # Only PDFs are opened; other files in the folders are not valid
        # input for PdfFileReader.
        if file_name.lower().endswith('.pdf'):
          crop_pdf(os.path.join(folder_path, file_name), dir_name + "\\" + file_name)


if __name__ == "__main__":
  main()