I have 130 different Excel files, each containing the same 5 worksheets. For example, the first Excel file has worksheets named 'Personnele', 'Sales', 'Profit', 'Customer' and 'KPI'; all 130 files have the same worksheet names. I would like to work on the 5th worksheet, 'KPI', of all 130 files.
One by one I can read each file, manipulate, clean and change the format (unpivoting etc.), and write it to Excel again, but doing this for 130 files one at a time takes a lot of time.
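For a single file, my flow looks roughly like this (a minimal sketch; the file names and the unpivot columns are placeholders for illustration):
import pandas as pd

# Read only the 'KPI' sheet from one workbook
df = pd.read_excel("file_001.xlsx", sheet_name="KPI")

# Clean / reshape, e.g. unpivot the month columns (placeholder column names)
df = df.melt(id_vars=["Entity"], var_name="Month", value_name="Value")

# Write the result back out
df.to_excel("file_001_clean.xlsx", index=False)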
I want to read the KPI worksheets of all 130 Excel files into one pandas dataframe, manipulate, clean and change the format there, and write everything out as one Excel file.
I did try out some of the answers asked here 6 years ago, but none of them solved my problem.
How can I do this with pandas?
CodePudding user response:
I have this working for some (3) simple CSV files I created; it should be adaptable to the Excel format. It loads/processes the files in parallel in load_df(), then appends them all into the same file in a thread-safe way, since the queue will block. You could then load the "final" file into memory and do further processing.
You can definitely increase the default number of threads for loading the dataframes. It doesn't need to be equal to the number of files you are processing, but should not be more than that.
I have no idea if this is faster than your current method, but I hope so!
Edit: I tried this with 20 threads on 200 files that each had one thousand rows and 26 columns, and it took less than 13 seconds.
import os
from queue import Queue
from threading import Thread

import pandas as pd

OUTPUT_PATH = 'all_kpi.csv'

def load_df(file_name: str) -> pd.DataFrame:
    # Load the df from file however you would like; below is just an example.
    # For the Excel case: df = pd.read_excel(file_name, sheet_name='KPI')
    df = pd.read_csv(file_name, header=0)
    # Process/clean here if necessary before returning.
    return df

def reader(file_q: Queue, df_q: Queue):
    # Pull file names off one queue, load them, push the dataframes onto another.
    while True:
        file_name = file_q.get()
        df = load_df(file_name)
        df_q.put(df)
        file_q.task_done()

def append_df(df: pd.DataFrame):
    # Write the header only on the first append.
    file_exists = os.path.exists(OUTPUT_PATH)
    df.to_csv(OUTPUT_PATH, mode='a', header=not file_exists, index=False)

def writer(df_q: Queue):
    # A single writer thread, so appends to the output file never interleave.
    while True:
        df = df_q.get()
        append_df(df)
        df_q.task_done()

def get_files() -> list:
    # Load the file names somehow.
    return ['1.csv', '2.csv', '3.csv']

def main(n_reader_threads: int = 3):
    file_queue = Queue()
    df_queue = Queue()
    for _ in range(n_reader_threads):
        Thread(target=reader, args=(file_queue, df_queue), daemon=True).start()
    Thread(target=writer, args=(df_queue,), daemon=True).start()
    for f_name in get_files():
        file_queue.put(f_name)
    # Wait until every file has been loaded and every dataframe written.
    file_queue.join()
    df_queue.join()

if __name__ == '__main__':
    main()
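If the files all live in one folder, get_files() could collect them with glob instead of a hard-coded list (a sketch; the ./data directory and the .xlsx extension are assumptions):
from glob import glob

def get_files() -> list:
    # Collect every workbook in the data directory (assumed layout).
    return sorted(glob('./data/*.xlsx'))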
CodePudding user response:
I found a solution without queues and threads. Here is how I did it.
First I imported the libraries:
import pandas as pd
import numpy as np
import os
Afterwards I added my pandas code where I treat the dataframe:
def get_dataframe(file_name, sheet_name="CMO & KPI"):
    df = pd.read_excel(file_name, sheet_name=sheet_name)
    # Pull the entity name out of the header area of the sheet.
    entity = df[df["Unnamed: 6"] == "Entity:"]["Unnamed: 9"].values[0]
    df["Unnamed: 32"] = entity
    # Split 'Activity - Division Site' into its parts.
    data_entity = entity.split("-")
    first_data = second_data = ""
    if len(data_entity) > 1:
        first_data = data_entity[0].strip()
        second_data = data_entity[1].strip()
    data = second_data.split(" ")
    data1 = second_data
    data2 = ""
    if len(data) > 1:
        data1 = data[0].strip()
        data2 = data[1].strip()
    df["Activity"] = first_data
    df["Division"] = data1
    df["Site"] = data2
    # Pull the reporting month out of the header area.
    date = df[df["Unnamed: 6"] == "Month:"]["Unnamed: 8"].values[0]
    df["Unnamed: 36"] = date
    # Drop the header rows/columns and promote the first remaining row to column names.
    df = df.drop([0, 1, 2, 3, 4, 5], axis=0)
    df = df.drop(df.columns[[0, 1, 2, 3, 4, 5]], axis=1)
    df.columns = df.iloc[0]
    df = df.drop([6, 7, 8, 9, 10, 11, 12], axis=0)
    # Give the appended columns their real names.
    df.rename(columns={list(df)[26]: 'Entity'}, inplace=True)
    df.rename(columns={list(df)[27]: 'Activity'}, inplace=True)
    df.rename(columns={list(df)[28]: 'Division'}, inplace=True)
    df.rename(columns={list(df)[29]: 'Site'}, inplace=True)
    df.rename(columns={list(df)[30]: 'Date'}, inplace=True)
    df.columns.values[0] = 'Index'
    df = df.drop(labels=["colonne cachée", "July", "August"], axis=1)
    # Unpivot the month columns into long format.
    df = pd.melt(df, id_vars=df.columns[[0, 23, 24, 25, 26, 27]],
                 value_vars=df.columns[list(range(1, 22))])
    return df
Then I told Python where to find the files. In order to concatenate them and apply my function to each dataframe, I first created a list of the files:
files = []
dir_path = "./data"
for path in os.listdir(dir_path):
    data_path = path.split(".")
    # Keep only macro-enabled workbooks (.xlsm).
    if len(data_path) > 1 and data_path[-1].lower() == "xlsm":
        files.append(path)
        print(path)
Then I looped over the files so that the same pandas code treats every dataframe:
df_files = []
for name_file in files:
    path_file = os.path.join(dir_path, name_file)
    df = get_dataframe(path_file)
    df_files.append(df)
    print(name_file, list(df.columns), df.shape)
Finally, I concatenated all the dataframes into one and exported the result as a single Excel file.
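In code, that last step looks roughly like this (the output file name all_kpi.xlsx is just an example):
# Stack all per-file dataframes and write them out as one workbook
df_all = pd.concat(df_files, ignore_index=True)
df_all.to_excel("all_kpi.xlsx", index=False)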
On the first run I read 25 files, and it took approximately 24 seconds to finish. The Excel files are in xlsm format, i.e. macro-enabled workbooks containing Visual Basic macros.
I think it is the same logic, just a different path. Thank you.