I am trying to scrape a web page with Python and write the results to a CSV file, but the printed output does not match the CSV format I expect.
Output: (screenshot of the current output omitted)
How can I print the expected result? (screenshot of the expected output omitted)
Thanks
Below is my script
import urllib.request as req
import bs4
import csv
import pandas as pd
import re
from datetime import date, timedelta

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2021, 12, 10)
end_date = date(2021, 12, 15)
url = "https://hkgoldprice.com/history/"

with open('gprice.csv', 'w', newline="") as f1:
    for single_date in daterange(start_date, end_date):
        udate = single_date.strftime("%Y/%m/%d")
        urld = url + single_date.strftime("%Y/%m/%d")
        writer = csv.writer(f1, delimiter='\t', lineterminator='\n')
        writer.writerows(udate)
        print(udate)
        with req.urlopen(urld) as response:
            data = response.read().decode("utf-8")
        root = bs4.BeautifulSoup(data, "html.parser")
        prices = root.find_all("div", class_="gp")
        gshops = root.find_all("div", class_="gshop")
        gpdate = root.find_all("div", class_="gp_date")
        for price in prices:
            print(price.text)
            row = price
            writer.writerows(row)
CodePudding user response:
The first problem is that you use "writerows", which treats its argument as a sequence of rows and therefore splits your data into as many rows as it can. When your text is "2021/12/23", the writer sees it as ['2', '0', '2', '1', '/', '1', '2', '/', '2', '3'] and writes each character as its own row. The same thing happens with the prices. So use "writerow" instead and collect the row data in a list, to prevent csv from splitting your data across multiple rows.
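Here is a small standalone sketch of the difference (the price value is made up, and an in-memory buffer stands in for the real file, just for illustration):

import csv
import io

buf = io.StringIO()          # in-memory stand-in for gprice.csv
writer = csv.writer(buf)

# writerows treats the string as a sequence of rows: one row per character
writer.writerows("2021/12/23")

# writerow treats the list as a single row: one field per list element
writer.writerow(["2021/12/23", "16800"])

print(buf.getvalue())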
The second problem is that .text in BeautifulSoup returns all the text, including whitespace, which makes the csv output unpredictable. So I delete all whitespace and the '#' character first to prevent this.
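For example, assuming a scraped price div whose .text looks something like "  #16,800\n  " (a made-up value), the cleanup would be:

raw = "  #16,800\n  "                        # hypothetical text from a price div
clean = "".join(raw.replace('#', '').split())
print(clean)                                 # prints: 16,800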
Here is the modified code
with open('gprice.csv','w',newline="") as f1:
for single_date in daterange(start_date, end_date):
udate = single_date.strftime("%Y/%m/%d")
urld = url single_date.strftime("%Y/%m/%d")
#we will append row by row, so we just use default setting on csv write
writer=csv.writer(f1)
#define empty row list
row_list = []
#append datetime
row_list.append(udate)
with req.urlopen(urld) as response:
data=response.read().decode("utf-8")
root=bs4.BeautifulSoup(data, "html.parser")
prices=root.find_all("div",class_="gp")
gshops=root.find_all("div",class_="gshop")
gpdate=root.find_all("div",class_="gp_date")
for price in prices:
#get inner text and delete '#'
row = price.text.replace('#', '')
#delete all whitespaces and append price
row_list.append("".join(row.split()))
#we only append one row data, so use "writerow" instad of "writerows"
writer.writerow(row_list)