I'm currently trying to split my CSV file into multiple files, with the beginning of each split overlapping the end of the previous one (for example: file 1 will be rows 1-4000, file 2 will be rows 3000-7000, file 3 will be rows 6000-10000, etc.).
# Number of data rows collected before a chunk is written to its own file.
chunk_size = 4000
def write_chunk(part, lines, header_line=None):
    """Write one chunk of rows to ``data_part_<part>.csv``.

    Args:
        part: Chunk number used to build the output file name.
        lines: Iterable of row strings; they are written as-is, so each one
            must already carry its own line ending.
        header_line: Header row written before ``lines``. When omitted, the
            module-level ``header`` assigned by the reading loop is used.
    """
    if header_line is None:
        # Backward-compatible fallback: existing callers pass only
        # (part, lines) and rely on the global set while reading the source.
        header_line = header
    with open('data_part_' + str(part) + '.csv', 'w') as f_out:
        f_out.write(header_line)
        f_out.writelines(lines)
# Split "8-0new2.csv" into consecutive files of `chunk_size` data rows each.
with open("8-0new2.csv", "r") as f:
    count = 0
    # write_chunk reads this module-level name, so it has to be assigned
    # before the first chunk is written.
    header = f.readline()
    lines = []
    for line in f:
        count += 1
        lines.append(line)
        if count % chunk_size == 0:
            # A full chunk has been collected: chunk k holds rows
            # (k - 1) * chunk_size + 1 .. k * chunk_size.
            write_chunk(count // chunk_size, lines)
            lines = []
    # write remainder
    if lines:
        write_chunk((count // chunk_size) + 1, lines)
This is my current code to split the CSV into 4 files. Any idea how to improve it so that it writes the files with overlapping rows?
CodePudding user response:
I don't have the data to test this thoroughly, but it should work:
# Maximum number of data rows written to each output file.
CHUNK = 4_000
# Rows each file shares with the previous one: windows start CHUNK - OVERLAP rows apart.
OVERLAP = 1_000
def write_csv(lines, filename, header):
    """Write ``header`` followed by ``lines`` to ``filename``.

    Args:
        lines: Iterable of row strings; they are written as-is, so each one
            must already carry its own line ending.
        filename: Path of the file to create (an existing file is overwritten).
        header: Header row written before the data rows.
    """
    # The handle is deliberately not called `csv`, which would shadow the
    # standard-library module of that name.
    with open(filename, 'w') as out_file:
        out_file.write(header)
        out_file.writelines(lines)
def get_csv_gen():
    """Yield output file names ``data_part_1.csv``, ``data_part_2.csv``, ... without end."""
    part = 1
    while True:
        yield f'data_part_{part}.csv'
        # Advance the counter; without the increment every chunk would be
        # written to the same file name.
        part += 1
get_csv_name = get_csv_gen()

# Read the whole source file up front so overlapping windows can be taken
# by slicing.
with open('8-0new2.csv') as src:
    header = src.readline()
    lines = src.readlines()

# Consecutive windows start CHUNK - OVERLAP rows apart, so each file begins
# with the last OVERLAP rows of the previous one.
for offset in range(0, len(lines), CHUNK - OVERLAP):
    write_csv(lines[offset:offset + CHUNK], next(get_csv_name), header)
    if offset + CHUNK >= len(lines):
        # This window already reached the last row; any further window would
        # contain only rows that were just written.
        break