Python shutil pack a zip file and unzip it back EOF error


I am using shutil to pack and unpack a TensorFlow model folder (I think this issue is more related to shutil than to TensorFlow).

import logging
import os
import shutil
import tempfile

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense

def load_model_as_bytes(model):
    def file_chunk_generate(file_path):
        CHUNK_SIZE = 4 * 1024 * 1024
        with open(file_path, 'rb') as f:
            while True:
                piece = f.read(CHUNK_SIZE)
                if len(piece) == 0:
                    return
                yield ByteChunk(buffer=piece)

    tmpdir = tempfile.mkdtemp()
    tf.saved_model.save(model, tmpdir)
    zip_path = os.path.join(tmpdir, "tf_model")
    shutil.make_archive(zip_path, 'zip', tmpdir)
    size = os.path.getsize(f'{zip_path}.zip')
    logging.info(f"send model file zip, length: {size}") #-------output 4621
    file_chunk_generator = file_chunk_generate(f'{zip_path}.zip')
    return file_chunk_generator

class NeuralNetworkPart(Model):
    def __init__(self):
        super().__init__()
        self.d1 = Dense(128, activation='relu')
        self.d2 = Dense(10)

    def call(self, x):
        x = x[0]
        x = self.d1(x)
        return self.d2(x)

model = NeuralNetworkPart()
it = load_model_as_bytes(model)
tmpdir = tempfile.mkdtemp()
zip_path = os.path.join(tmpdir, "tf_model.zip")
with open(zip_path, 'wb') as f:
    for byte_chunk in it:
        f.write(byte_chunk.buffer)
        logging.info(f"receive model file zip, length: {os.path.getsize(zip_path)}") #-------output 4621

shutil.unpack_archive(zip_path, tmpdir)      

Basically this program takes a folder and uses make_archive to zip it. It then reads the zip file as bytes through a generator, uses that generator to write a second zip file, and tries to unzip it with unpack_archive.

The sizes match on both sides, both when the original zip is read into the generator and after the new zip file has been written (both log lines show 4621). However, calling unpack_archive raises an EOFError:

    shutil.unpack_archive(zip_path, tmpdir)
  File "/lib/python3.6/shutil.py", line 983, in unpack_archive
    func(filename, extract_dir, **kwargs)
  File "/lib/python3.6/shutil.py", line 901, in _unpack_zipfile
    data = zip.read(info.filename)
  File "/lib/python3.6/zipfile.py", line 1338, in read
    return fp.read()
  File "/lib/python3.6/zipfile.py", line 858, in read
    buf  = self._read1(self.MAX_N)
  File "/lib/python3.6/zipfile.py", line 940, in _read1
    data  = self._read2(n - len(data))
  File "/lib/python3.6/zipfile.py", line 975, in _read2
    raise EOFError

CodePudding user response:

This slightly simplified version seems to work just fine. The key difference is that the archive is written into its own temporary directory rather than into the directory being archived; in your code, make_archive most likely ends up adding the partially written zip file to itself, which would explain why the file has the expected size yet raises EOFError when read back. Note that none of the temporary files are cleaned up by this; you might want to fix that before you have your tmpdir full of TensorFlow models.

import os
import shutil
import tempfile


def file_chunk_generate(file_path):
    CHUNK_SIZE = 4 * 1024 * 1024
    with open(file_path, "rb") as f:
        while True:
            piece = f.read(CHUNK_SIZE)
            if not piece:
                return
            yield piece


def get_zip_chunk_generator(source_dir):
    # Write the archive into its own temporary directory,
    # not into the directory that is being archived.
    arcname = shutil.make_archive(
        os.path.join(tempfile.mkdtemp("zip-"), "tf_model"), "zip", source_dir
    )
    return file_chunk_generate(arcname)


def make_source_dir():
    tmpdir = tempfile.mkdtemp("src-")
    for x in range(5):
        with open(os.path.join(tmpdir, f"test-{x}.txt"), "wb") as f:
            f.write(b"foo" * 1024)
    return tmpdir


source_dir = make_source_dir()
it = get_zip_chunk_generator(source_dir)
dest_dir = tempfile.mkdtemp(prefix="dest-")
print("1", os.listdir(dest_dir))
zip_path = os.path.join(dest_dir, "tf_model_dest.zip")
with open(zip_path, "wb") as f:
    for byte_chunk in it:
        f.write(byte_chunk)
print("2", os.listdir(dest_dir))
shutil.unpack_archive(zip_path, dest_dir)
print("3", os.listdir(dest_dir))

The output is

1 []
2 ['tf_model_dest.zip']
3 ['test-0.txt', 'test-1.txt', 'test-3.txt', 'test-2.txt', 'tf_model_dest.zip', 'test-4.txt']

as you might expect.
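
One way to handle the cleanup caveat mentioned above (just a sketch; stream_zip_chunks and the "zip-" prefix are made-up names, not anything from the question) is to fold the archiving and the chunking into a single generator and remove the scratch directory in a finally block once the generator is exhausted or closed:

import os
import shutil
import tempfile


def stream_zip_chunks(source_dir, chunk_size=4 * 1024 * 1024):
    # Hypothetical helper: archive source_dir in a scratch directory,
    # yield the zip bytes, and remove the scratch directory afterwards.
    scratch = tempfile.mkdtemp(prefix="zip-")
    try:
        arcname = shutil.make_archive(
            os.path.join(scratch, "tf_model"), "zip", source_dir
        )
        with open(arcname, "rb") as f:
            while True:
                piece = f.read(chunk_size)
                if not piece:
                    break
                yield piece
    finally:
        # Runs when the generator is exhausted or closed.
        shutil.rmtree(scratch, ignore_errors=True)

Because Python calls close() on abandoned generators, the finally block also runs if the receiving side stops iterating early, so the scratch directory shouldn't leak either way.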

I'd recommend using tarballs though if you're going to stream over the network (since you could indeed do that without any file on disk at all; ZIPs require seeking support to unpack, but TARs don't).
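
For what it's worth, here is a minimal sketch of that streaming-tar idea; the io.BytesIO buffer only stands in for the network connection, and the directory and file names are invented for the example:

import io
import os
import tarfile
import tempfile

# An in-memory buffer stands in for the network stream here; in real code
# fileobj could be any writable stream (e.g. a socket makefile object).
buf = io.BytesIO()

source_dir = tempfile.mkdtemp(prefix="src-")
with open(os.path.join(source_dir, "weights.txt"), "wb") as f:
    f.write(b"fake model data")

# "w|gz" writes a gzip-compressed tar as a forward-only stream (no seeking).
with tarfile.open(fileobj=buf, mode="w|gz") as tar:
    tar.add(source_dir, arcname="tf_model")

# Receiving side: "r|*" reads the stream sequentially, again without seeking.
buf.seek(0)
dest_dir = tempfile.mkdtemp(prefix="dest-")
with tarfile.open(fileobj=buf, mode="r|*") as tar:
    tar.extractall(dest_dir)

print(os.listdir(os.path.join(dest_dir, "tf_model")))

Because "w|gz" and "r|*" treat the file object as a forward-only stream, neither side ever needs to seek, which is what makes it possible to pipe the archive straight over a socket instead of spooling it to disk first.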
