Run Python script on all files in the directory after a file has run, create a new folder and name t-CodePudding

I wrote a Python script, TestData.py, that uses Pandas and NumPy to test a CSV for data anomalies. It takes one CSV as input and outputs 4 new ones. For each input file that needs testing I do the following:

Copy the name of the unknown file. In this example: unknownfilename1.csv
Create a folder.
Rename the New Folder by pasting in unknownfilename1.csv, removing the .csv
Paste unknownfilename1.csv into data = pd.read_csv("unknownfilename0.csv")
Drag TestData.py into the folder unknownfilename1
Finally, run TestData.py

import pandas as pd
import numpy as np

# Import raw data. The input file name is hard-coded on this line.
data = pd.read_csv("unknownfilename1.csv", encoding='latin-1' )

#################################################
# Over 500 lines of code using Pandas and Numpy #
#################################################

# Rows that failed at least one test case; they need to be fixed before importing.
failed.to_csv("C:/users/path/Failed.csv", index = False)
# Rows that passed.
passed.to_csv("C:/users/path/Passed.csv", index = False)
# Ready to import.
newimpomatic.to_csv("C:/users/path/Import.csv", index = False)
# Duplicate IDs.
duplicated.to_csv("C:/users/path/duplicated.csv", index = False)

I would like each file to be tested in:

C:/users/path/unknownfilename1.csv
C:/users/path/unknownfilename2.csv
C:/users/path/unknownfilename3.csv

To output:

 C:/users/path/unknownfilename1/Failed.csv
 C:/users/path/unknownfilename1/Passed.csv
 C:/users/path/unknownfilename1/Import.csv
 C:/users/path/unknownfilename1/duplicated.csv

 C:/users/path/unknownfilename2/Failed.csv
 C:/users/path/unknownfilename2/Passed.csv
 C:/users/path/unknownfilename2/Import.csv
 C:/users/path/unknownfilename2/duplicated.csv

 C:/users/path/unknownfilename3/Failed.csv
 C:/users/path/unknownfilename3/Passed.csv
 C:/users/path/unknownfilename3/Import.csv
 C:/users/path/unknownfilename3/duplicated.csv

Suppose I have hundreds of different files in a folder. What is the easiest way to extend my script so that it tests every file and, after each file is tested, creates a new folder named after the file that was tested?

CodePudding user response：

The Path class in Python's standard-library module pathlib is great for this, and for working with file and folder locations in general. Path.glob(pattern) yields every entry in a directory that matches a file pattern, so you can iterate over those matches.

https://docs.python.org/3.9/library/pathlib.html#pathlib.Path.glob

You can also use Path to grab the name of the file and create a new directory to place your outputted csvs.

The file below assumes it is in the same directory as all of the original csvs, but that is changeable. I call that directory base_dir, equivalent to what you listed as C:/users/path/

/users/path/main.py:

from pathlib import Path
import pandas as pd
import numpy as np


# File names of the four CSVs written for every processed input file.
failed_csv: str = 'Failed.csv'      # written from the `failed` frame
passed_csv: str = 'Passed.csv'      # written from the `passed` frame
import_csv: str = 'Import.csv'      # written from the `newimpomatic` frame
dup_csv: str = 'duplicated.csv'     # written from the `duplicated` frame


def get_root() -> Path:
    """Return the absolute directory that contains this script file."""
    script_path = Path(__file__).resolve()
    return script_path.parent


def process(csv_file: Path, out_dir: Path) -> None:
    """Read one input CSV and write its four result CSVs into ``out_dir``.

    Args:
        csv_file: Path of the CSV to read (decoded as latin-1).
        out_dir: Directory that receives the output files; it is not
            created here.

    NOTE(review): ``failed``, ``passed``, ``newimpomatic`` and ``duplicated``
    are expected to be produced by the processing placeholder below; they are
    not defined in this snippet.
    """
    data = pd.read_csv(csv_file, encoding='latin-1')

    ###
    ### Do existing processing of data DataFrame
    ###

    # Resolve the destination of every output file once.
    failed_path = out_dir / failed_csv      # e.g. '/users/path/my_file/Failed.csv'
    passed_path = out_dir / passed_csv      # e.g. '/users/path/my_file/Passed.csv'
    import_path = out_dir / import_csv      # e.g. '/users/path/my_file/Import.csv'
    dup_path = out_dir / dup_csv            # e.g. '/users/path/my_file/duplicated.csv'

    # Show the final file path of each output csv before writing anything.
    for destination in (failed_path, passed_path, import_path, dup_path):
        print(destination)

    # Save files.
    failed.to_csv(failed_path, index=False)
    passed.to_csv(passed_path, index=False)
    newimpomatic.to_csv(import_path, index=False)
    duplicated.to_csv(dup_path, index=False)


def main(base_dir: Path) -> None:
    """Run ``process`` on every CSV file directly inside ``base_dir``.

    For each ``<name>.csv`` a directory ``base_dir/<name>`` is created (an
    existing one is reused) and handed to ``process`` as the output location.
    Progress and a final count of processed files are printed.

    Args:
        base_dir: Directory holding the input CSV files; the per-file output
            directories are created inside it.
    """
    print(f'Processing files in {base_dir}: \n')

    n_process = 0
    # Snapshot and sort the matches up front: the processing order becomes
    # reproducible and the directory is no longer being scanned while new
    # folders are created inside it.
    for csv_file in sorted(base_dir.glob('*.csv')):
        # ex. csv_file = "/users/path/my_file.csv" -> stem = "my_file"
        output_dir = base_dir / csv_file.stem  # output_dir = "/users/path/my_file"

        print(f'Creating directory "{output_dir}"')
        output_dir.mkdir(exist_ok=True)

        print(f'Processing "{csv_file}"')
        process(csv_file=csv_file, out_dir=output_dir)

        print('Completed processing\n')
        # Increment (not assign) so the summary reports the real total.
        n_process += 1

    print(f'\nProcessed {n_process} files')


if __name__ == '__main__':
    # Use the directory containing this script as the base directory,
    # e.g. "/users/path".
    main(base_dir=get_root())