I have a folder full of CSV's amounting to a total of 50 files that contain a lot of different data, I am trying to combine them into files that will contain around 500000 entries, as I'll be taking around 10000 rows from every file if possible and combine put them into the new file, like repeat this process for 50 files to make a single file
Right now, I have this code that I've written which I am still stuck with and can not figure out how to continue from here
import os
import pandas as pd
from tkinter import filedialog
from tkinter import *
class ExcelHandler:
def __init__(self):
self.path = ""
self.files = []
self.firstFile = True
def selectDirectory(self):
"""This function opens a file dialog to select a directory. It then sets the path to the directory and gets all the files in the directory."""
root = Tk()
root.withdraw()
self.path = filedialog.askdirectory(title="Select a directory")
self.files = os.listdir(os.path.abspath(self.path))
self.xlsx_files = [i for i in self.files if i.endswith(".xlsx")]
self.csv_files = [i for i in self.files if i.endswith(".csv")]
def createNewSamples(self, fromRows, toRows):
"""This function opens csv's one by one and takes 10000 entries from each and adds them to new dataframe. after it exports the combined sheet"""
if self.path and self.files:
try:
print(
"Creating new file containing 10000 entries from all the csv's..."
)
initial, final = fromRows, toRows
while True:
try:
df_total = pd.DataFrame(
columns=[
"Company Name",
"Email Address",
"Address",
"City",
"State",
"Zipcode",
"Phone",
"Fax",
"Sic Code",
"Sic Description",
"Website",
]
)
for file in self.csv_files:
if self.firstFile:
df = pd.read_csv(
file,
skiprows=initial,
# chunksize=final,
nrows=10000,
low_memory=False,
)
self.firstFile = False
else:
df = pd.read_csv(
file,
skiprows=initial,
# chunksize=final,
nrows=10000,
low_memory=False,
)
# df = df[initial:final]
df_total = pd.concat([df_total, df], ignore_index=True)
self.firstFile = True
df_total.to_csv(f"newData/Mixed-Data{final}.csv", index=False)
print(f"Mixed-Data{final}.csv created!")
initial, final = fromRows initial, toRows final
except:
print("Can not continue for some reason.")
break
except:
print("Error creating the csv.")
return
else:
print("Please select a directory first using selectDirectory().")
Is there any way I can take 10000 or as many as the files have remaining for the last few rows sake, from each file, combine them into a single file and export it, and repeat the process till I run out of rows in every file. I would also like to keep the Header column as the same for every output file as all of them have the same header so combined files should keep the same header too, hence there I still can't figure out the way to do this
To Add on, I am using pandas to handle xlsx and csv files, and tkinter to help the people using this library to select the folder they want form where all the csv's and xlsx's are selected and put in an array
ExcelHandler is just a name for the class, when i was initially working on it I just named it this for no specific or special reason
CodePudding user response:
My solution does open two CSV files, read them line by line, "glue" the lines together and write them to a third result file. It also checks for an expected header and takes care that this header, if it appears in each input file, is not repeated in the result file.
#!/usr/bin/env python3
import pathlib
def generate_sample_data():
"""This will generate two csv files."""
with pathlib.Path('one.csv').open('w') as f:
f.write('colA;colB;colC\n1;2;3\n4;5;6\n')
with pathlib.Path('two.csv').open('w') as f:
f.write('colA;colB;colC\n7;8;9\n10;11;12\n')
if __name__ == '__main__':
# Generate sample data in two csv files
generate_sample_data()
csv_header = 'colA;colB;colC\n'
result = []
# open each input csv file
for input_filename in ['one.csv', 'two.csv']:
with pathlib.Path(input_filename).open('r') as in_file:
# read the whole file and store them as a list line by line
content = in_file.readlines()
# remove the header if there is one
if content[0] == csv_header:
content = content[1:]
result.extend(content)
# insert the header in the first line
result.insert(0, csv_header)
# open the csv to where the result should be stored
with pathlib.Path('one_and_two.csv').open('w') as out_file:
out_file.writelines(result)
# check the result
with pathlib.Path('one_and_two.csv').open('r') as result_file:
print(result_file.read())
If you need to do more computation and checks on your file content I would suggest the csv
package in python itself. I think pandas
is a bit to much for your needs here.