I am having trouble looping through my .zip files (5 of them) in my directory , which have multiple sheets in them.
- I want to count the records in each .zip file (appending record count of each sheet)
right now im just getting the first row of my below output. I do think i have an issue at this line df = df[:1]
as i think its just limiting to the one row. Any ideas?
Output should be something like -
Code:
# Import File Function
def read_excel_sheets(xls_path):
"""Read all sheets of an Excel workbook and return a single DataFrame"""
xl = pd.ExcelFile(xls_path)
df = pd.DataFrame()
for idx, name in enumerate(xl.sheet_names):
sheet = xl.parse(name, header=None, dtype=str, ignore_index=True)
# Drop Empty Columns
sheet.dropna(axis=1, how='all', inplace=True)
# Add sheet name as column
sheet['sheet'] = name.split(" ")[-1]
# Appending Data
if len(df.columns) >= len(sheet.columns):
df = df.append(sheet, ignore_index=True, sort=True)
else:
df = sheet.append(df, ignore_index=True, sort=True)
del sheet
return df
# Process File Function
def process_files(list_of_files):
df = pd.DataFrame()
for file in list_of_files:
# zip file handler
zip = zipfile.ZipFile(file)
# list available files in the container
zfiles = zip.namelist()
#extensions to process
extensions = (".xls",".xlsx")
# Importing to Dataframe
for zfile in zfiles:
if zfile.endswith(extensions):
# For Row Count Check
df_temp = read_excel_sheets(zip.open(zfile))
df = df.append(df_temp, ignore_index=True, sort=True)
df['Length'] = len(df)
df['FileName'] = file
df = df[['Length','FileName']]
df = df[:1]
return df
input_location = r'O:\Stack\Over\Flow'
month_to_process = glob.glob(input_location "\\2022 - 10\\*.zip")
df = process_files(month_to_process)
print(df)
CodePudding user response:
Glob all zip files in path with filenames containing "Test". Loop files and sheets creating a dictionary. Convert dictionary to pandas df.
You'll have to change to your file path and file names:
import os
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
import pandas as pd
def process_files(files: list) -> pd.DataFrame:
file_mapping = {}
for file in files:
with ZipFile(file, "r") as zipped:
os.chdir(path)
zipped.extractall()
data_mapping = pd.read_excel(BytesIO(ZipFile(file).read(Path(file).stem)), sheet_name=None)
row_counts = []
for sheet in list(data_mapping.keys()):
row_counts.append(len(data_mapping.get(sheet)))
file_mapping.update({file: sum(row_counts)})
frame = pd.DataFrame([file_mapping]).transpose().reset_index()
frame.columns = ["file_name", "row_counts"]
return frame
path = f"{Path().home()}/Documents/"
zip_files = (str(x) for x in Path(path).glob("Test*.zip"))
df = process_files(zip_files)
print(df)
Output:
file_name length
0 /Users/joeblow/Documents/Test2.xlsx.zip 16
1 /Users/joeblow/Documents/Test3.xlsx.zip 16
2 /Users/joeblow/Documents/Test4.xlsx.zip 4
3 /Users/joeblow/Documents/Test1.xlsx.zip 21