I have one excel file with 38 000 rows. I would like an excel file to be created after every 1000 rows. At the end I should have 38 excel files. This is my very simple script in Python:
import pandas as pd
import os
import deepl
WD = r'C:\Users\Admin\XXX\\'
for file in os.listdir(WD):
if file.endswith('.xlsx'):
FILE = file
sheet_names = pd.ExcelFile(FILE).sheet_names
for sn in sheet_names:
OUTPUT_FILE = '{}_{}'
df = pd.read_excel(FILE)
print(FILE, sn)
for col in df.columns.to_list():
df[col] = df[col].map({True: '', False: ''}).fillna(df[col])
auth_key = 'XX'
translator = deepl.Translator(auth_key)
df['TRANSLATE'] = df['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x,
target_lang="CS") if type(x) == str else x)
cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
df = df.reindex(columns = cn)
df.to_excel(r'C:\Users\Admin\\FINAL_FILE.xlsx', index=False)
Have you any Idea? Thank you very much!!
CodePudding user response:
Use DataFrame.groupby
by helper array by numpy.arange
with integer division by N
:
N = 1000
for val, df1 in df.groupby(np.arange(len(df)) // N):
df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)
EDIT: For processing by 1000 rows use:
for file in os.listdir(WD):
if file.endswith('.xlsx'):
FILE = file
sheet_names = pd.ExcelFile(FILE).sheet_names
for sn in sheet_names:
OUTPUT_FILE = '{}_{}'
df = pd.read_excel(FILE)
print(FILE, sn)
for val, df1 in df.groupby(np.arange(len(df)) // N):
for col in df1.columns.to_list():
df1[col] = df1[col].map({True: '', False: ''}).fillna(df1[col])
auth_key = 'XX'
translator = deepl.Translator(auth_key)
df1['TRANSLATE'] = df1['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x,
target_lang="CS") if type(x) == str else x)
cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
df1 = df1.reindex(columns = cn)
df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)
CodePudding user response:
In the end, it's this script that solved my problem :-). But thank you @jezrael for your help:
auth_key = 'xxx'
translator = deepl.Translator(auth_key)
FILE = r"XXX\File.xlsx"
df = pd.read_excel(FILE)
nor = df.shape[0]
breakpoint = 0
chunk_size = 1000
iteration_number = int(round(nor/chunk_size))
for y in range(0,iteration_number 1):
df1 = df.iloc[breakpoint:breakpoint chunk_size,:]
if df1.empty == False:
# print(df1)
df1['TRANSLATE'] = df1['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x, target_lang="CS") if type(x) == str else x)
cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
df1 = df1.reindex(columns = cn)
df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)
breakpoint = breakpoint chunk_size