I want to take the code I have built (or something similar) and run it on an entire directory of CSV files instead of on one file at a time. The problem is that I have 50 files, so it would be much simpler if I could just point the script at a directory and have it run on every file inside that directory.
Thanks in advance.
import pandas as pd
# Split one CSV export into per-product files based on its pool code.
#
# The code column is read as text on purpose: it mixes numeric-looking codes
# ("21") with alphanumeric ones ("M000"). If pandas is left to infer the dtype,
# numeric-looking values can come back as ints/floats, and those never match
# the string codes passed to isin() below.
df = pd.read_csv(
    r"C:\Users\Kris\OneDrive - kris\SW\12-21.csv",
    dtype={'Segmentation/Pool Code': str},
)
df = df.rename(columns={'Segmentation/Pool Code': 'Code'})

# One filtered frame per product group; rows whose code is in none of the
# lists are not written anywhere.
df_Auto = df.loc[df['Code'].isin(['21', '94', '103', '105', '22', '82', '97', '104', '1', '71', '100', '2', '35', '62', '72', '101'])]
df_Mortgage = df.loc[df['Code'].isin(["M000","M001", "M003", "M004", "M005", "M006", "M007", "M008","M010", "M011", "M013", "M014", "M015", "M016", "M024", "M025", "M027", "M028", "M029", "M031", "M033", "M035","M036","M037","M038","M039",'M040','M041','M042','M043','M044','M020','M021','M022','M023','M026','M032','M034', '18', '28', '34', '87'])]
df_HELOC = df.loc[df['Code'].isin(["17","83","88","19","31","84","85",])]
df_CC = df.loc[df['Code'].isin(["116","118","119","120","121","122","123","125",])]
df_Other = df.loc[df['Code'].isin(["33","41","51","52", "56","57","58","59","75","76","130","131","132","133","134","135","136","140","54", "55","60","77", "78","79","115","4","5","6","7","13","14","16", "32","44","45","46","47","67","106","107","109","110","160","3","10","11","12","25","69","95","102",])]

# Save files next to the input, one per product group
df_Auto.to_csv(r"C:\Users\Kris\OneDrive - kris\SW\12-21_auto.csv")
df_Mortgage.to_csv(r"C:\Users\Kris\OneDrive - kris\SW\12-21_mortgage.csv")
df_HELOC.to_csv(r"C:\Users\Kris\OneDrive - kris\SW\12-21_HELOC.csv")
df_CC.to_csv(r"C:\Users\Kris\OneDrive - kris\SW\12-21_CC.csv")
df_Other.to_csv(r"C:\Users\Kris\OneDrive - kris\SW\12-21_Other.csv")
CodePudding user response:
You should define a function that processes a single CSV file, and then loop over all the files in your directory, calling that function on each one. For example:
import os
def process_csv(path_name):
    """Process the CSV file located at *path_name*.

    This is a placeholder: it only reports which file it was handed. Put the
    real per-file work (read, filter, save) in this function.

    Args:
        path_name: Path of the CSV file to process.
    """
    print(f"Processing csv at {path_name}")
def loop_over_directory(directory_name, process=None):
    """Call a processing function on every entry of *directory_name*.

    Args:
        directory_name: Directory whose entries are passed on for processing.
        process: Callable taking one path argument. Defaults to the
            module-level ``process_csv``.
    """
    if process is None:
        process = process_csv

    for file_name in os.listdir(directory_name):
        # os.listdir() yields bare names, so prepend the directory; join()
        # also picks the right separator for the platform.
        process(os.path.join(directory_name, file_name))
CodePudding user response:
Use glob to find the CSV files in the directory, and a function to process each one:
import pandas as pd
import os
from glob import glob
# Maps each output group name to the pool codes that belong to it.
# Keeping this as data (rather than one variable per group) lets the
# processing code loop over the groups generically.
code_dict = {
    "Auto": [
        "21", "94", "103", "105", "22", "82", "97", "104",
        "1", "71", "100", "2", "35", "62", "72", "101",
    ],
    "Mortgage": [
        "M000", "M001", "M003", "M004", "M005", "M006", "M007", "M008",
        "M010", "M011", "M013", "M014", "M015", "M016",
        "M024", "M025", "M027", "M028", "M029",
        "M031", "M033", "M035", "M036", "M037", "M038", "M039",
        "M040", "M041", "M042", "M043", "M044",
        "M020", "M021", "M022", "M023", "M026", "M032", "M034",
        "18", "28", "34", "87",
    ],
    "HELOC": ["17", "83", "88", "19", "31", "84", "85"],
    "CC": ["116", "118", "119", "120", "121", "122", "123", "125"],
    "Other": [
        "33", "41", "51", "52", "56", "57", "58", "59", "75", "76",
        "130", "131", "132", "133", "134", "135", "136", "140",
        "54", "55", "60", "77", "78", "79", "115",
        "4", "5", "6", "7", "13", "14", "16", "32",
        "44", "45", "46", "47", "67",
        "106", "107", "109", "110", "160",
        "3", "10", "11", "12", "25", "69", "95", "102",
    ],
}
def process_csv(file, codes_by_group=None):
    """Split one CSV file into one output CSV per code group.

    The input's "Segmentation/Pool Code" column is renamed to "Code". For each
    group, the rows whose code is listed for that group are written to
    ``<input name>_<group name in lower case>.csv`` in the input's directory.

    Args:
        file: Path of the CSV file to split.
        codes_by_group: Mapping of group name to the list of codes (as
            strings) in that group. Defaults to the module-level ``code_dict``.
    """
    if codes_by_group is None:
        codes_by_group = code_dict

    # Split the path into the pieces needed to build the output names.
    save_dir, file_name = os.path.split(file)
    base_name = os.path.splitext(file_name)[0]
    print(f"Reading {base_name}")

    # Read the code column as text: the groups hold string codes, and letting
    # pandas infer the dtype can turn numeric-looking codes into ints/floats
    # that isin() would then never match.
    df = pd.read_csv(file, dtype={"Segmentation/Pool Code": str})
    df = df.rename(columns={"Segmentation/Pool Code": "Code"})

    for name, codes in codes_by_group.items():
        sub_df = df.loc[df["Code"].isin(codes)]
        # The output file name is derived from the group name.
        save_file = os.path.join(save_dir, f"{base_name}_{name.lower()}.csv")
        sub_df.to_csv(save_file)
# Search for files in this directory:
search_dir = r"C:\Users\Kris\OneDrive - kris\SW"

# process_csv() writes its results into the same directory as the input, named
# "<input>_<group>.csv". Without a filter, running this script a second time
# would pick those results up as inputs and split them again (producing
# "<input>_auto_auto.csv" and so on), so skip anything with an output suffix.
output_suffixes = tuple(f"_{name.lower()}.csv" for name in code_dict)

# glob is a nice way to search in a directory
files = [
    path
    for path in glob(os.path.join(search_dir, "*.csv"))
    if not path.lower().endswith(output_suffixes)
]

for file in files:
    # Process each file one at a time
    process_csv(file)