Complete beginner here, please help. I've got this code, which worked when I didn't try to output to .csv but instead had a print statement there - so I didn't have the last 2 lines or anything related to the variable 'data'. By 'worked' I mean it printed data from all 18 pages.
Now it outputs data to .csv, but only from the first page (url).
I can see that I'm not passing nexturl into pandas at the end - because I don't know how to. Help greatly appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r'

def scrape_it(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    data = []
    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })
    if 'next' not in stri:
        print("All pages completed")
    else:
        scrape_it(nexturl)
    return data

myOutput = pd.DataFrame(scrape_it(url))
myOutput.to_csv(f'results-tec6.csv', header=False)
CodePudding user response:
Make data global so you keep appending to it across the recursive calls rather than re-creating it afresh on each call. Then call your recursive function outside the DataFrame() call, so that once it has finished you can pass the fully populated data list to pandas.
Finally, you can pass a cookie to request the maximum possible results per page, which reduces the number of requests.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r&page=1'

data = []  # module-level list: every recursive call appends to the same collection

def scrape_it(url):
    # cookie asks the site for 100 results per page, cutting down the number of requests
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })
    if 'next' not in stri:
        print("All pages completed")
    else:
        scrape_it(nexturl)

# run the scrape first, then build the DataFrame from the fully populated list
scrape_it(url)
myOutput = pd.DataFrame(data)
myOutput.to_csv('results-tec6.csv', header=False)
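If you'd rather avoid the module-level list, a minimal variant (a sketch assuming the same page structure and selectors as above) is to have each recursive call return its own rows and extend the caller's rows with them, so the function stays self-contained:

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r&page=1'

def scrape_it(url):
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')
    # last pager link: either a "next" link or the final page marker
    pager = soup.find_all(class_="standardLinkDkBlue")[-1]
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    data = [{
        'title': report.find('a', class_='linkTitle').text,
        'price': report.find('div', class_='resultPrice').text,
        'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
        'detail_link': report.a['href']
    } for report in reports]
    # if there is a "next" page, recurse and append the deeper pages' rows
    if pager.string and 'next' in pager.string:
        data += scrape_it(pager['href'])
    return data

myOutput = pd.DataFrame(scrape_it(url))
myOutput.to_csv('results-tec6.csv', header=False)

Either way the key point is the same: the DataFrame is only built after the whole chain of pages has been scraped, instead of from the first call's partial result.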