Complete beginner here, please help. I've got this code, which worked when I didn't try to output to .csv but instead had a print statement there - so I didn't have the last 2 lines or anything related to the variable 'data'. By 'worked' I mean it printed data from all 18 pages.
Now it outputs data to .csv, but only from the first page (url).
I can see that I'm not passing nexturl into pandas at the end - because I don't know how to. Help greatly appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r'

def scrape_it(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    data = []
    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })
    if 'next' not in stri:
        print("All pages completed")
    else:
        scrape_it(nexturl)
    return data

myOutput = pd.DataFrame(scrape_it(url))
myOutput.to_csv(f'results-tec6.csv', header=False)
CodePudding user response:
Make data global so you keep appending to it across the recursive calls rather than re-creating it afresh on each call. Then call your recursive function outside the DataFrame() call, so that once it has finished you can pass the fully populated data list to pandas.
Finally, you can pass a cookie to request the maximum possible results per page, which reduces the number of requests.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r&page=1'

data = []  # module-level list: every recursive call appends to the same collection

def scrape_it(url):
    # cookie asks the site for 100 results per page, cutting down the number of requests
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })
    if 'next' not in stri:
        print("All pages completed")
    else:
        scrape_it(nexturl)

# run the scrape first, then build the DataFrame from the fully populated list
scrape_it(url)
myOutput = pd.DataFrame(data)
myOutput.to_csv('results-tec6.csv', header=False)
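If you'd rather avoid the module-level list, a minimal variant (a sketch assuming the same page structure and selectors as above) is to have each recursive call return its own rows and extend the caller's rows with them, so the function stays self-contained:

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r&page=1'

def scrape_it(url):
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')
    # last pager link: either a "next" link or the final page marker
    pager = soup.find_all(class_="standardLinkDkBlue")[-1]
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    data = [{
        'title': report.find('a', class_='linkTitle').text,
        'price': report.find('div', class_='resultPrice').text,
        'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
        'detail_link': report.a['href']
    } for report in reports]
    # if there is a "next" page, recurse and append the deeper pages' rows
    if pager.string and 'next' in pager.string:
        data += scrape_it(pager['href'])
    return data

myOutput = pd.DataFrame(scrape_it(url))
myOutput.to_csv('results-tec6.csv', header=False)

Either way the key point is the same: the DataFrame is only built after the whole chain of pages has been scraped, instead of from the first call's partial result.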