I am scraping links from multiple pages under multiple searches and want to output the scraped results into multiple .csv files. The table below shows the .csv file that lists both my source URLs and the desired output file names:
| url | outputfile |
| --- | --- |
| https://www.marketresearch.com/search/results.asp?categoryid=230&qtype=2&publisher=IDCs&datepub=0&submit2=Search | outputPS1xIDC.csv |
| https://www.marketresearch.com/search/results.asp?categoryid=90&qtype=2&publisher=IDC&datepub=0&submit2=Search | outputPS2xIDC.csv |
| https://www.marketresearch.com/search/results.asp?categoryid=233&qtype=2&publisher=IDC&datepub=0&submit2=Search | outputPS3xIDC.csv |
| https://www.marketresearch.com/search/results.asp?categoryid=169&qtype=2&publisher=IDC&datepub=0&submit2=Search | outputPS4xIDC.csv |
Now, with the code below, I managed to read the URLs in sequence, and the rest of the code also works well (when I specify the output filename directly). However, it only outputs the last of the 4 pages in the list, so it overwrites the result each time. What I actually want is for it to output the results from the first url to the first outputfile, the second to the second, and so on. (Of course, my actual list of source URLs is much longer than these 4.)
Please help, especially with the last line, as clearly just writing [outputs] there doesn't work.
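For reference, inputs.csv is just those two columns with a header row, something like:

url,outputfile
https://www.marketresearch.com/search/results.asp?categoryid=230&qtype=2&publisher=IDCs&datepub=0&submit2=Search,outputPS1xIDC.csv
https://www.marketresearch.com/search/results.asp?categoryid=90&qtype=2&publisher=IDC&datepub=0&submit2=Search,outputPS2xIDC.csv
(and so on, one row per search URL)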
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

with open('inputs.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    urls = [row["url"] for row in reader]
    outputs = [row["outputfile"] for row in reader]

data = []
for url in urls:
    def scrape_it(url):
        page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
        soup = BeautifulSoup(page.text, 'html.parser')
        nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
        stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
        reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
        for report in reports:
            data.append({
                'title': report.find('a', class_='linkTitle').text,
                'price': report.find('div', class_='resultPrice').text,
                'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
                'detail_link': report.a['href']
            })
        if 'next' not in stri:
            print("All pages completed")
        else:
            scrape_it(nexturl)

    scrape_it(url)

myOutput = pd.DataFrame(data)
myOutput.to_csv([outputs], header=False)  # works (but only for the last url) if instead of [outputs] I have f'filename.csv'
CodePudding user response:
I don't have Pandas, and I don't really want to run your input, but a couple of things jump out at me when I look at your code:
- It looks like you are not looping over url and output together. It looks like you loop over all the URLs, and then after all those loops you write once.
- Likewise, data just keeps having the HTML table data appended to it; it's never reset for each individual URL.
Without being able to run this, I recommend something like this. The scraping is fully encapsulated and separate from the loop, and as such you can now more clearly see the flow of inputs and outputs:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

def scrape_it(url, data):
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })
    if 'next' in stri:
        data = scrape_it(nexturl, data)
    return data
with open('inputs.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    rows = list(reader)  # read the file once; a DictReader is exhausted after one pass
    urls = [row["url"] for row in rows]
    outputs = [row["outputfile"] for row in rows]

for (url, output) in zip(urls, outputs):  # work on url and output together
    data = scrape_it(url, [])  # start from an empty list for each url
    myOutput = pd.DataFrame(data)
    myOutput.to_csv(output, header=False)
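Since I don't have Pandas installed, for what it's worth, that final loop can also be written with just the csv module. This is an untested sketch that assumes data is the list of dicts returned by scrape_it above:

for (url, output) in zip(urls, outputs):
    data = scrape_it(url, [])
    with open(output, 'w', newline='') as f:
        # the dict keys used in scrape_it give the column order; no header row,
        # to mirror to_csv(header=False) above
        writer = csv.DictWriter(f, fieldnames=['title', 'price', 'date_author', 'detail_link'])
        writer.writerows(data)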
CodePudding user response:
You can try injecting a column name for each item as follows:
import csv
import pandas as pd
import requests
from bs4 import BeautifulSoup

site = 'https://www.amazon.com/PlayStation-5-Console/dp/B09DFCB66S'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
cookies = {'session': '141-2320098-4829807'}

def stock_check():
    page = requests.get(site, headers=headers, cookies=cookies)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.find('span', attrs={'id': 'productTitle'})
    print(title.get_text(strip=True))

stock_check()

with open('inputs.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    urls = [row["url"] for row in reader]
    outputs = [row["outputfile"] for row in reader]

data = []
for url in urls:
    def scrape_it(url):
        page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
        soup = BeautifulSoup(page.text, 'html.parser')
        nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
        stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
        reports = soup.find_all(
            "tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
        for report in reports:
            title = report.find('a', class_='linkTitle').text
            price = report.find('div', class_='resultPrice').text
            date_author = report.find('div', class_='textGrey').text.replace(
                ' | published by: TechNavio', '')
            detail_link = report.a['href']
            data.append([title, price, date_author, detail_link])
        if 'next' not in stri:
            print("All pages completed")
        else:
            scrape_it(nexturl)

    cols = ['title', 'price', 'date_author', 'detail_link']
    scrape_it(url)

myOutput = pd.DataFrame(data, columns=cols)
print(myOutput)
# works (but only for the last url) if instead of [outputs] I have f'filename.csv'
# myOutput.to_csv([outputs], header=False)
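If you also want one output file per URL with these column names, you can pair each url with its outputfile the same way the first answer does with zip. A rough, untested sketch, assuming scrape_it and cols are defined above and that urls and outputs were both read in correctly:

for url, output in zip(urls, outputs):
    data = []  # reset the collected rows for each source url
    scrape_it(url)  # fills data, following the 'next' pagination links
    myOutput = pd.DataFrame(data, columns=cols)
    myOutput.to_csv(output, header=False)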