Home > database >  Python pandas generate excel file with 1000 rows
Python pandas generate excel file with 1000 rows

Time:02-01

I have one excel file with 38 000 rows. I would like an excel file to be created after every 1000 rows. At the end I should have 38 excel files. This is my very simple script in Python:

import pandas as pd
import os
import deepl

WD = r'C:\Users\Admin\XXX\\'

for file in os.listdir(WD):
    if file.endswith('.xlsx'):
        FILE = file

        sheet_names = pd.ExcelFile(FILE).sheet_names  

        for sn in sheet_names:
            OUTPUT_FILE = '{}_{}'
            df = pd.read_excel(FILE)
            print(FILE, sn)

            for col in df.columns.to_list():
                df[col] = df[col].map({True: '', False: ''}).fillna(df[col])

            auth_key = 'XX'
            translator = deepl.Translator(auth_key)
        

            df['TRANSLATE'] = df['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x,  
               target_lang="CS") if type(x) == str else x)


        cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
        df = df.reindex(columns = cn)

        df.to_excel(r'C:\Users\Admin\\FINAL_FILE.xlsx', index=False)

Have you any Idea? Thank you very much!!

CodePudding user response:

Use DataFrame.groupby by helper array by numpy.arange with integer division by N:

N = 1000
for val, df1 in df.groupby(np.arange(len(df)) // N):
     df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)

EDIT: For processing by 1000 rows use:

for file in os.listdir(WD):
    if file.endswith('.xlsx'):
        FILE = file

        sheet_names = pd.ExcelFile(FILE).sheet_names  

        for sn in sheet_names:
            OUTPUT_FILE = '{}_{}'
            df = pd.read_excel(FILE)
            print(FILE, sn)
            
            for val, df1 in df.groupby(np.arange(len(df)) // N):

                for col in df1.columns.to_list():
                    df1[col] = df1[col].map({True: '', False: ''}).fillna(df1[col])
    
                auth_key = 'XX'
                translator = deepl.Translator(auth_key)
            
    
                df1['TRANSLATE'] = df1['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x,  
                   target_lang="CS") if type(x) == str else x)
    
    
                cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
                df1 = df1.reindex(columns = cn)
        
                df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)

CodePudding user response:

In the end, it's this script that solved my problem :-). But thank you @jezrael for your help:

auth_key = 'xxx'
translator = deepl.Translator(auth_key)

FILE = r"XXX\File.xlsx"
df = pd.read_excel(FILE)
nor = df.shape[0]
breakpoint = 0
chunk_size = 1000

iteration_number = int(round(nor/chunk_size))
for y in range(0,iteration_number 1):
    
    df1 = df.iloc[breakpoint:breakpoint chunk_size,:]

    if df1.empty == False:
       # print(df1)

        df1['TRANSLATE'] = df1['COLUMN TO TRANSLATE'].apply(lambda x: translator.translate_text(x, target_lang="CS") if type(x) == str else x)

        cn = ['COLUMN TO TRANSLATE', 'TRANSLATE']
        df1 = df1.reindex(columns = cn)
        df1.to_excel(rf'C:\Users\Admin\\FINAL_FILE_{val}.xlsx', index=False)

    breakpoint = breakpoint chunk_size 
  • Related