This is my first web scraping project, so please forgive any noob language or any failure to explain myself well. I have managed to get a script working that pulls the data I want and puts it into a CSV using pandas.
The problem is that I am trying to scrape almost 25,000 pages from the same website. When I run my script it takes about an hour to create a CSV, and when I check the CSV it appears my script has only managed to scrape 16 pages in that time. It's not a huge amount of data per page either, only about 4 elements.
Is there anything I can look for that would explain why my script only gets through a small number of pages and takes so long to do it? Could it be anything to do with how much data your PC can hold before it has to write?
Currently I'm doing:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib as plt
import time
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://google.co.uk",
"DNT": "1"
}
page = 1
scotland_houses = []
while page != 5000:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    response = requests.get(url, headers=headers)
    html = response.content
    soup = bs(html, "lxml")
    for address in soup.find_all("div", class_="c-imWYEM c-imWYEM-pnwXf-hasUprn-true"):
        time.sleep(20)
        scotland_house = {}
        # Getting the address
        scotland_house["Address"] = address.h3.get_text(strip=True)
        # Getting each address's unique URL
        scotland_house_url = f'https://www.zoopla.co.uk{address.find("a")["href"]}'
        scotland_house["URL"] = scotland_house_url
        scotland_house_url_response = requests.get(
            scotland_house_url, headers=headers)
        scotland_house_soup = bs(scotland_house_url_response.content, 'lxml')
        # Lists status of the property
        try:
            scotland_house["Status"] = [status.get_text(strip=True) for status in scotland_house_soup.find_all(
                "span", class_="css-1jwes4a-Tag-Status-TimelineBadge e15fknhb0")]
        except AttributeError:
            scotland_house["Status"] = ""
        # Lists the date of the status of the property
        try:
            scotland_house["Status Date"] = [status_date.get_text(strip=True) for status_date in scotland_house_soup.find_all(
                "p", class_="css-11dmkz9-Text-StyledDateText e15fknhb1")]
        except AttributeError:
            scotland_house["Status Date"] = ""
        # Lists the value of the property
        try:
            scotland_house["Value (£)"] = [value.get_text(strip=True).replace(",", "").replace("£", "") for value in scotland_house_soup.find_all(
                "p", class_="css-1x01gac-Text eczcs4p0")]
        except AttributeError:
            scotland_house["Value"] = ""
        scotland_houses.append(scotland_house)
    page = page + 1
# for house in scotland_houses[:]:
#     print(house)
houses_df = pd.DataFrame(scotland_houses)
houses_df['ID'] = houses_df.index + 1
houses_df = houses_df.set_index(['ID']).apply(pd.Series.explode).reset_index()
houses_df['Status Date'] = pd.to_datetime(
    houses_df['Status Date'], format='%B %Y', errors='coerce')
houses_df["Value (£)"] = houses_df['Value (£)'].astype(str).astype(float)
print(houses_df)
print(houses_df.dtypes)
houses_df.to_csv(r'example\location.csv')
I was also wondering if there is a more elegant way to write to the CSV, for example writing each batch to an already-created CSV as I go so the data is not all being held in RAM. I am assuming the slowdown is something to do with that, i.e. the data hits some threshold and the CSV only gets written once that limit is reached.
Apologies if I've missed out any information. I'm a bit of a novice and will do my best to explain myself.
Cheers!
CodePudding user response:
At the moment your script is adding a 20 second delay for each entry processed on each page, not for each page. Normally it is good practice to slow down requests to a website by adding a delay between page requests, but because your delay is per entry, a page with ten entries spends over three minutes just sleeping before it moves on.
You need to move the time.sleep(20) call to above your for loop. I would also suggest that a value of 3 seconds will probably suffice.
For example:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib as plt
import time
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://google.co.uk",
"DNT": "1"
}
page = 1
scotland_houses = []
while page != 5000:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    print(url)
    response = requests.get(url, headers=headers)
    html = response.content
    soup = bs(html, "lxml")
    time.sleep(3)  # 3 second delay between page requests
    for address in soup.find_all("div", class_="c-imWYEM c-imWYEM-pnwXf-hasUprn-true"):
        scotland_house = {}
        # Getting the address
        scotland_house["Address"] = address.h3.get_text(strip=True)
        # Getting each address's unique URL
        scotland_house_url = f'https://www.zoopla.co.uk{address.find("a")["href"]}'
        scotland_house["URL"] = scotland_house_url
        scotland_house_url_response = requests.get(
            scotland_house_url, headers=headers)
        scotland_house_soup = bs(scotland_house_url_response.content, 'lxml')
        # Lists status of the property
        try:
            scotland_house["Status"] = [status.get_text(strip=True) for status in scotland_house_soup.find_all(
                "span", class_="css-1jwes4a-Tag-Status-TimelineBadge e15fknhb0")]
        except AttributeError:
            scotland_house["Status"] = ""
        # Lists the date of the status of the property
        try:
            scotland_house["Status Date"] = [status_date.get_text(strip=True) for status_date in scotland_house_soup.find_all(
                "p", class_="css-11dmkz9-Text-StyledDateText e15fknhb1")]
        except AttributeError:
            scotland_house["Status Date"] = ""
        # Lists the value of the property
        try:
            scotland_house["Value (£)"] = [value.get_text(strip=True).replace(",", "").replace("£", "") for value in scotland_house_soup.find_all(
                "p", class_="css-1x01gac-Text eczcs4p0")]
        except AttributeError:
            scotland_house["Value"] = ""
        scotland_houses.append(scotland_house)
    page = page + 1
# for house in scotland_houses[:]:
#     print(house)
houses_df = pd.DataFrame(scotland_houses)
houses_df['ID'] = houses_df.index + 1
houses_df = houses_df.set_index(['ID']).apply(pd.Series.explode).reset_index()
houses_df['Status Date'] = pd.to_datetime(houses_df['Status Date'], format='%B %Y', errors='coerce')
houses_df["Value (£)"] = houses_df['Value (£)'].astype(str).astype(float)
print(houses_df)
print(houses_df.dtypes)
houses_df.to_csv(r'example\location.csv')
You should also add code to stop making further requests when no more entries are being returned rather than requesting a fixed number of pages.
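For example, here is a minimal sketch of that check, reusing the imports, headers and find_all selector from the code above (the inner loop body that builds each scotland_house is elided here):
page = 1
scotland_houses = []
while True:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    response = requests.get(url, headers=headers)
    soup = bs(response.content, "lxml")
    time.sleep(3)  # delay between page requests
    entries = soup.find_all("div", class_="c-imWYEM c-imWYEM-pnwXf-hasUprn-true")
    if not entries:
        # no property entries on this page, so we have reached the end of the results
        break
    for address in entries:
        pass  # build scotland_house exactly as above and append it to scotland_houses
    page = page + 1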
CodePudding user response:
Within your while loop, right at the end of creating your data set for each page, you could do something like this:
while page != 5000:
    page_houses = []
    ...
    # scrape the data however you want
    # append each property to page_houses
    ...
    df = pd.DataFrame(page_houses)
    with open('output.csv', 'a', newline='') as f:
        df.to_csv(f, mode='a', header=f.tell()==0, index=False)
    page += 1
This will write each page's data to the CSV as you go: the header row is only written when the file is empty (f.tell() == 0 is only true at the start of an empty file), and every subsequent page is appended below it. This is a far more memory-efficient way of doing it than holding everything in a list until the end.
While I'm here, Zoopla has a few ways to get data more easily than scraping volatile HTML tags. One way is to grab the JSON data that each page loads via its __NEXT_DATA__ script tag: soup.find('script', {'id': '__NEXT_DATA__'}).string.
This contains a lot of raw data in nicely formatted JSON; for example, you can get the property results like this:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import json
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
page = 1
while page != 3:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    response = requests.get(url, headers=headers)
    soup = bs(response.text, 'lxml')
    data = soup.find('script', {'id': '__NEXT_DATA__'}).string
    json_clean = json.loads(data)
    properties = json_clean['props']['pageProps']['data']['propertiesSearch']['edges']
    df = pd.json_normalize(properties)
    with open('output.csv', 'a', newline='') as f:
        df.to_csv(f, mode='a', header=f.tell()==0, index=False)
    print(f'Scraped page: {page}')
    page += 1
Or you could reverse engineer the GraphQL queries that run whenever you make a search. You'll need to copy an api-key from your browser: open Developer Tools -> Network -> Fetch/XHR, look for a graphql request, click on it, then scroll to the bottom of the Request Headers and you'll see it there.
import requests
import pandas as pd
import json
api_key = '3Vzj2wUfaP3euLsV4NV9h3UAVUR3BoWd5clv9Dvu'  # copied from the browser dev tools as described above
headers = {'x-api-key': api_key,
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
api_url = 'https://api-graphql-lambda.prod.zoopla.co.uk/graphql'
query = "query getListingData($path: String!) {\n breadcrumbs(path: $path) {\n label\n tracking\n uri\n noAppend\n i18nLabelKey\n __typename\n }\n seoAccordions(path: $path) {\n category\n expanded\n geo\n name\n propertyType\n section\n transactionType\n rows {\n links {\n category\n geo\n propertyType\n section\n transactionType\n uri\n __typename\n }\n __typename\n }\n links {\n category\n geo\n propertyType\n section\n transactionType\n uri\n __typename\n }\n __typename\n }\n discoverMore(path: $path) {\n housePricesUri\n findAgentsUri\n __typename\n }\n searchResults(path: $path) {\n analyticsTaxonomy {\n url\n areaName\n activity\n brand\n countryCode\n countyAreaName\n currencyCode\n listingsCategory\n outcode\n outcodes\n page\n postalArea\n radius\n radiusAutoexpansion\n regionName\n resultsSort\n searchGuid\n searchIdentifier\n section\n searchLocation\n viewType\n searchResultsCount\n expandedResultsCount\n totalResults\n emailAllFormShown\n emailAllTotalAgents\n bedsMax\n bedsMin\n priceMax\n priceMin\n location\n propertyType\n __typename\n }\n analyticsEcommerce {\n currencyCode\n impressions {\n id\n list\n position\n variant\n __typename\n }\n __typename\n }\n adTargeting {\n activity\n areaName\n bedsMax\n bedsMin\n brand\n brandPrimary\n countyAreaName\n countryCode\n currencyCode\n listingsCategory\n outcode\n outcodes\n page\n postalArea\n priceMax\n priceMin\n propertyType\n regionName\n resultsSort\n searchLocation\n searchResultsCount\n section\n totalResults\n url\n viewType\n __typename\n }\n metaInfo {\n title\n description\n canonicalUri\n __typename\n }\n pagination {\n pageNumber\n totalResults\n totalResultsWasLimited\n __typename\n }\n listings {\n regular {\n numberOfVideos\n numberOfImages\n numberOfFloorPlans\n numberOfViews\n listingId\n title\n publishedOnLabel\n publishedOn\n availableFrom\n priceDrop {\n lastPriceChangeDate\n percentageChangeLabel\n __typename\n }\n isPremium\n highlights {\n description\n label\n url\n __typename\n }\n otherPropertyImages {\n small\n large\n caption\n __typename\n }\n branch {\n name\n branchDetailsUri\n phone\n logoUrl\n branchId\n __typename\n }\n features {\n content\n iconId\n __typename\n }\n image {\n src\n caption\n responsiveImgList {\n width\n src\n __typename\n }\n __typename\n }\n transports {\n title\n poiType\n distanceInMiles\n features {\n zone\n tubeLines\n __typename\n }\n __typename\n }\n flag\n listingId\n priceTitle\n price\n alternativeRentFrequencyLabel\n address\n tags {\n content\n __typename\n }\n listingUris {\n contact\n detail\n __typename\n }\n __typename\n }\n extended {\n numberOfVideos\n numberOfImages\n numberOfFloorPlans\n numberOfViews\n listingId\n title\n publishedOnLabel\n publishedOn\n availableFrom\n priceDrop {\n lastPriceChangeDate\n percentageChangeLabel\n __typename\n }\n isPremium\n highlights {\n description\n label\n url\n __typename\n }\n otherPropertyImages {\n small\n large\n caption\n __typename\n }\n branch {\n name\n branchDetailsUri\n phone\n logoUrl\n branchId\n __typename\n }\n features {\n content\n iconId\n __typename\n }\n image {\n src\n caption\n responsiveImgList {\n width\n src\n __typename\n }\n __typename\n }\n transports {\n title\n poiType\n distanceInMiles\n features {\n zone\n tubeLines\n __typename\n }\n __typename\n }\n flag\n listingId\n priceTitle\n price\n alternativeRentFrequencyLabel\n address\n tags {\n content\n __typename\n }\n listingUris {\n contact\n detail\n __typename\n }\n __typename\n }\n featured {\n numberOfVideos\n 
numberOfImages\n numberOfFloorPlans\n numberOfViews\n listingId\n title\n publishedOnLabel\n publishedOn\n availableFrom\n priceDrop {\n lastPriceChangeDate\n percentageChangeLabel\n __typename\n }\n isPremium\n featuredType\n highlights {\n description\n label\n url\n __typename\n }\n otherPropertyImages {\n small\n large\n caption\n __typename\n }\n branch {\n name\n branchDetailsUri\n phone\n logoUrl\n __typename\n }\n features {\n content\n iconId\n __typename\n }\n image {\n src\n caption\n responsiveImgList {\n width\n src\n __typename\n }\n __typename\n }\n transports {\n title\n poiType\n distanceInMiles\n features {\n zone\n tubeLines\n __typename\n }\n __typename\n }\n flag\n listingId\n priceTitle\n price\n alternativeRentFrequencyLabel\n address\n tags {\n content\n __typename\n }\n listingUris {\n contact\n detail\n __typename\n }\n __typename\n }\n __typename\n }\n filters {\n fields {\n group\n fieldName\n label\n isRequired\n inputType\n placeholder\n allowMultiple\n options\n value\n __typename\n }\n __typename\n }\n links {\n saveSearch\n createAlert\n __typename\n }\n sortOrder {\n currentSortOrder\n options {\n i18NLabelKey\n value\n __typename\n }\n __typename\n }\n seoBlurb {\n category\n transactionType\n __typename\n }\n title\n userAlertId\n savedSearchAndAlerts {\n uri\n currentFrequency {\n i18NLabelKey\n value\n __typename\n }\n __typename\n }\n polyenc\n __typename\n }\n}\n"
page = 1
while page != 3:
    payload = {"operationName": "getListingData",
               "variables": {"path": f"/for-sale/property/edinburgh/?page_size=25&q=Edinburgh&radius=1&results_sort=newest_listings&search_source=facets&pn={page}"},
               "query": query}
    data = requests.post(api_url, headers=headers, data=json.dumps(payload)).json()
    properties = data['data']['searchResults']['listings']['regular']
    df = pd.json_normalize(properties)
    with open('output_data.csv', 'a', newline='') as f:
        df.to_csv(f, mode='a', header=f.tell()==0, index=False)
    print(f'Scraped page: {page}')
    page += 1