Reading and writing every 100 rows of an xlsx file using Python3.x-CodePudding

There is an xlsx file with one column with url. The number of rows is undefined. We need to read from the file every 100 rows and send them to write in .txt file until the rows run out. The last iteration can be less than 100 rows. I only figured out to send the first 100 rows and I don't understand the rest. I use openpyxl.

Example sheet https://docs.google.com/spreadsheets/d/16qietXcZbGEKCyVxBMuN1PRG2Xlp7I_xIi7mYIuWSHg/edit#gid=0

I know there is isslice() and itertools.groupby() but I don't understand how to use them. Here is a part of code, which reads and writes first 100 rows.

import datetime
from openpyxl import load_workbook

a_file = load_workbook("urls.xlsx") # get file
a_sheet = a_file["Sheet1"] # get sheet 

def write_result(work_type, url, date): 
    if work_type == 'txt_file':
        with open('result.txt', 'a', encoding='utf-8') as result_file:
            string_write = f"{url};{date}\n"
            result_file.write(string_write)  # write url to text file

def send_urls():
    row_num = 2
    for rows in a_sheet.iter_rows(min_row=2, max_row=100, max_col=1):
        for url in rows:
            if url.value is not None:
                url_new = str(a_sheet.cell(column=1, row=row_num).value).strip()
                write_result('txt_file', url_new, datetime.date.today())
                row_num  = 1
    a_file.save("urls.xlsx")
    a_file.close()

def main():
    send_urls()

if __name__ == "__main__":
    main()

CodePudding user response：

I have updated the code to handle the multiple files it needs to create. The output will be multiple text files with names result0.txt, result1.txt, result2.txt (based on how many urls you have in the excel file). See if this satisfies your requirement...

import datetime
def write_result(work_type, fileNumber, url, date):
    if work_type == 'txt_file':
        with open('result'   str(fileNumber)   '.txt', 'a', encoding='utf-8') as result_file:
            string_write = f"{url};{date}\n"
            result_file.write(string_write)  
                  
def send_urls():
    a_file=openpyxl.load_workbook('urls.xlsx') ##Open excel file and sheet
    a_sheet=a_file.active
    row_num = 2
    fileNumber = 0  ##Add number so you add it to result.txt
    while True:
        ##Note it should be 101 to get 100 urls in file. Start & end rows should keep incrementing
        for rows in a_sheet.iter_rows(min_row=(2   100 * fileNumber), max_row=(101   100 * fileNumber), max_col=1):  
            for url in rows:
                if url.value is not None:
                    url_new = str(a_sheet.cell(column=1, row=row_num).value).strip()
                    write_result('txt_file', fileNumber, url_new, datetime.date.today())
                    row_num  = 1

        if (101   (100 * fileNumber)) >= a_sheet.max_row: ##Exit once you reach the end
            break
        fileNumber  = 1 ##Increment filenumber till you reach the end...

#    a_file.save("urls.xlsx") ## Not required
#    a_file.close()           ## Not required

def main():
    send_urls()
    
if __name__ == "__main__":
    main()

CodePudding user response：

This is my solution using pandas (you also need openpyxl)

import pandas as pd

skiprows = 1 #skip header line
nrows = 100
infile = 'urls.xlsx'
outfile = 'urls.txt'

with open(outfile, 'w') as f:
    while True:
        df = pd.read_excel(infile, skiprows=skiprows, nrows=nrows, header=None)
        if len(df) == 0: break #if there is no data left to read, break the loop

        skiprows  = nrows #update for the next loop cycle
        urls = df[0].values #if there is no header, the column is called 0 by default
        f.write('\n'.join(urls)   '\n')

It is not fully clear from your question if you want to close the output file after every chunks. I just opened the file in write mode before the loop and I keep it open until the end. In case, you can open it in append mode inside the loop body, but I don't see the advantage of doing that.