There is an xlsx file with one column containing URLs. The number of rows is unknown. I need to read the file 100 rows at a time and write each chunk to a .txt file until the rows run out; the last chunk can have fewer than 100 rows. I only figured out how to send the first 100 rows and don't understand how to do the rest. I am using openpyxl.
Example sheet https://docs.google.com/spreadsheets/d/16qietXcZbGEKCyVxBMuN1PRG2Xlp7I_xIi7mYIuWSHg/edit#gid=0
I know there are itertools.islice() and itertools.groupby(), but I don't understand how to use them. Here is the part of the code that reads and writes the first 100 rows.
import datetime
from openpyxl import load_workbook

a_file = load_workbook("urls.xlsx")  # get file
a_sheet = a_file["Sheet1"]  # get sheet

def write_result(work_type, url, date):
    if work_type == 'txt_file':
        with open('result.txt', 'a', encoding='utf-8') as result_file:
            string_write = f"{url};{date}\n"
            result_file.write(string_write)  # write url to text file

def send_urls():
    row_num = 2
    for rows in a_sheet.iter_rows(min_row=2, max_row=100, max_col=1):
        for url in rows:
            if url.value is not None:
                url_new = str(a_sheet.cell(column=1, row=row_num).value).strip()
                write_result('txt_file', url_new, datetime.date.today())
            row_num += 1
    a_file.save("urls.xlsx")
    a_file.close()

def main():
    send_urls()

if __name__ == "__main__":
    main()
CodePudding user response:
I have updated the code to handle the multiple files it needs to create. The output will be multiple text files named result0.txt, result1.txt, result2.txt, and so on (depending on how many URLs are in the Excel file). See if this satisfies your requirement...
import datetime
import openpyxl

def write_result(work_type, fileNumber, url, date):
    if work_type == 'txt_file':
        with open('result' + str(fileNumber) + '.txt', 'a', encoding='utf-8') as result_file:
            string_write = f"{url};{date}\n"
            result_file.write(string_write)

def send_urls():
    a_file = openpyxl.load_workbook('urls.xlsx')  ## Open excel file and sheet
    a_sheet = a_file.active
    row_num = 2
    fileNumber = 0  ## Number appended to the result<N>.txt file name
    while True:
        ## Note it should be 101 to get 100 urls per file. Start & end rows keep incrementing.
        for rows in a_sheet.iter_rows(min_row=(2 + 100 * fileNumber), max_row=(101 + 100 * fileNumber), max_col=1):
            for url in rows:
                if url.value is not None:
                    url_new = str(a_sheet.cell(column=1, row=row_num).value).strip()
                    write_result('txt_file', fileNumber, url_new, datetime.date.today())
                row_num += 1
        if (101 + (100 * fileNumber)) >= a_sheet.max_row:  ## Exit once you reach the end
            break
        fileNumber += 1  ## Increment file number till you reach the end...
    # a_file.save("urls.xlsx")  ## Not required
    # a_file.close()  ## Not required

def main():
    send_urls()

if __name__ == "__main__":
    main()
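For reference, the chunking the question asks about can also be done with itertools.islice, since iter_rows() returns a generator that islice() can consume 100 rows at a time until it is exhausted. A minimal sketch, assuming the same layout as above (URLs in column A of Sheet1 in urls.xlsx, header in row 1):

import datetime
from itertools import islice
from openpyxl import load_workbook

wb = load_workbook("urls.xlsx", read_only=True)  # read_only streams rows lazily
rows = wb["Sheet1"].iter_rows(min_row=2, max_col=1, values_only=True)  # generator of 1-tuples

with open("result.txt", "a", encoding="utf-8") as out:
    while True:
        chunk = list(islice(rows, 100))  # take up to 100 rows; the last chunk may be shorter
        if not chunk:
            break  # generator exhausted, nothing left to read
        for (url,) in chunk:
            if url is not None:
                out.write(f"{str(url).strip()};{datetime.date.today()}\n")
wb.close()

Because islice keeps pulling from the same generator, each pass picks up exactly where the previous one stopped, so no row arithmetic is needed.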
CodePudding user response:
This is my solution using pandas (you also need openpyxl installed, since pandas uses it under the hood to read .xlsx files):
import pandas as pd

skiprows = 1  # skip header line
nrows = 100
infile = 'urls.xlsx'
outfile = 'urls.txt'

with open(outfile, 'w') as f:
    while True:
        df = pd.read_excel(infile, skiprows=skiprows, nrows=nrows, header=None)
        if len(df) == 0:
            break  # if there is no data left to read, break the loop
        skiprows += nrows  # update for the next loop cycle
        urls = df[0].values  # if there is no header, the column is called 0 by default
        f.write('\n'.join(urls) + '\n')
It is not fully clear from your question whether you want to close the output file after every chunk. I open the file in write mode once before the loop and keep it open until the end. If you prefer, you can open it in append mode inside the loop body instead, but I don't see the advantage of doing that.
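If you did want the append-mode variant, a sketch of the same loop follows, reusing the names from above; note that the file has to be truncated once up front, since appending never clears an old run's output:

import pandas as pd

skiprows = 1  # skip header line
nrows = 100

open('urls.txt', 'w').close()  # truncate the output once before appending
while True:
    df = pd.read_excel('urls.xlsx', skiprows=skiprows, nrows=nrows, header=None)
    if len(df) == 0:
        break
    skiprows += nrows
    with open('urls.txt', 'a') as f:  # reopen in append mode for each chunk
        f.write('\n'.join(df[0].values) + '\n')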