Home > front end >  Loop through .zip files and print length of excels inside zip
Loop through .zip files and print length of excels inside zip

Time:12-19

I am having trouble looping through my .zip files (5 of them) in my directory , which have multiple sheets in them.

  1. I want to count the records in each .zip file (appending record count of each sheet)

right now im just getting the first row of my below output. I do think i have an issue at this line df = df[:1] as i think its just limiting to the one row. Any ideas?

Output should be something like -

enter image description here

Code:

# Import File Function
def read_excel_sheets(xls_path):
   """Read all sheets of an Excel workbook and return a single DataFrame"""
   xl = pd.ExcelFile(xls_path)
   df = pd.DataFrame()
   for idx, name in enumerate(xl.sheet_names):
        sheet = xl.parse(name, header=None, dtype=str, ignore_index=True)
        # Drop Empty Columns
        sheet.dropna(axis=1, how='all', inplace=True)
        # Add sheet name as column
        sheet['sheet'] = name.split(" ")[-1]
        # Appending Data
        if len(df.columns) >= len(sheet.columns):
            df = df.append(sheet, ignore_index=True, sort=True)
        else:
            df = sheet.append(df, ignore_index=True, sort=True)
        del sheet

   return df

# Process File Function
def process_files(list_of_files):
    df = pd.DataFrame()
    for file in list_of_files:
        # zip file handler
        zip = zipfile.ZipFile(file)
        # list available files in the container
        zfiles = zip.namelist()
        #extensions to process
        extensions = (".xls",".xlsx")
        # Importing to Dataframe
        for zfile in zfiles:
            if zfile.endswith(extensions):
                # For Row Count Check
                df_temp = read_excel_sheets(zip.open(zfile))
                df = df.append(df_temp, ignore_index=True, sort=True)
                df['Length'] = len(df)
                df['FileName'] = file
                df = df[['Length','FileName']]
                df = df[:1]
                
    return df         
                



input_location = r'O:\Stack\Over\Flow'
month_to_process = glob.glob(input_location   "\\2022 - 10\\*.zip")             
df = process_files(month_to_process)  
print(df)

CodePudding user response:

Glob all zip files in path with filenames containing "Test". Loop files and sheets creating a dictionary. Convert dictionary to pandas df.

You'll have to change to your file path and file names:

import os
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile

import pandas as pd


def process_files(files: list) -> pd.DataFrame:
    file_mapping = {}
    for file in files:
        with ZipFile(file, "r") as zipped:
            os.chdir(path)
            zipped.extractall()

        data_mapping = pd.read_excel(BytesIO(ZipFile(file).read(Path(file).stem)), sheet_name=None)

        row_counts = []
        for sheet in list(data_mapping.keys()):
            row_counts.append(len(data_mapping.get(sheet)))

        file_mapping.update({file: sum(row_counts)})

    frame = pd.DataFrame([file_mapping]).transpose().reset_index()
    frame.columns = ["file_name", "row_counts"]

    return frame

path = f"{Path().home()}/Documents/"
zip_files = (str(x) for x in Path(path).glob("Test*.zip"))
df = process_files(zip_files)
print(df)

Output:

                                    file_name  length
0  /Users/joeblow/Documents/Test2.xlsx.zip      16
1  /Users/joeblow/Documents/Test3.xlsx.zip      16
2  /Users/joeblow/Documents/Test4.xlsx.zip       4
3  /Users/joeblow/Documents/Test1.xlsx.zip      21
  • Related