This is my first web scraping project, so please forgive any noob language or any failure to explain myself well. I have managed to get a script working that pulls the data I want and puts it into a CSV using pandas.
The problem is that I am trying to scrape almost 25,000 pages from the same website. When I run my script it takes about an hour to create a CSV, and when I check the CSV it appears my script has only managed to scrape 16 pages in that time. It's not a huge amount of data per page either, only about 4 elements.
Is there anything I can look for that would explain why my script only gets through a small number of pages and takes so long to do it? Could it be anything to do with how much data your PC can hold before it has to write?
Currently I'm doing:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib as plt
import time
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://google.co.uk",
"DNT": "1"
}
page = 1
scotland_houses = []
while page != 5000:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    response = requests.get(url, headers=headers)
    html = response.content
    soup = bs(html, "lxml")
    for address in soup.find_all("div", class_="c-imWYEM c-imWYEM-pnwXf-hasUprn-true"):
        time.sleep(20)
        scotland_house = {}
        # Getting the address
        scotland_house["Address"] = address.h3.get_text(strip=True)
        # Getting each address's unique URL
        scotland_house_url = f'https://www.zoopla.co.uk{address.find("a")["href"]}'
        scotland_house["URL"] = scotland_house_url
        scotland_house_url_response = requests.get(
            scotland_house_url, headers=headers)
        scotland_house_soup = bs(scotland_house_url_response.content, 'lxml')
        # Lists status of the property
        try:
            scotland_house["Status"] = [status.get_text(strip=True) for status in scotland_house_soup.find_all(
                "span", class_="css-1jwes4a-Tag-Status-TimelineBadge e15fknhb0")]
        except AttributeError:
            scotland_house["Status"] = ""
        # Lists the date of the status of the property
        try:
            scotland_house["Status Date"] = [status_date.get_text(strip=True) for status_date in scotland_house_soup.find_all(
                "p", class_="css-11dmkz9-Text-StyledDateText e15fknhb1")]
        except AttributeError:
            scotland_house["Status Date"] = ""
        # Lists the value of the property
        try:
            scotland_house["Value (£)"] = [value.get_text(strip=True).replace(",", "").replace("£", "") for value in scotland_house_soup.find_all(
                "p", class_="css-1x01gac-Text eczcs4p0")]
        except AttributeError:
            scotland_house["Value"] = ""
        scotland_houses.append(scotland_house)
    page = page + 1
# for house in scotland_houses[:]:
#     print(house)
houses_df = pd.DataFrame(scotland_houses)
houses_df['ID'] = houses_df.index + 1
houses_df = houses_df.set_index(['ID']).apply(pd.Series.explode).reset_index()
houses_df['Status Date'] = pd.to_datetime(
    houses_df['Status Date'], format='%B %Y', errors='coerce')
houses_df["Value (£)"] = houses_df['Value (£)'].astype(str).astype(float)
print(houses_df)
print(houses_df.dtypes)
houses_df.to_csv(r'example\location.csv')
I was also wondering if there is a more elegant way to write to the CSV, for example writing each batch to an already-created CSV as I go so the data is not all being held in RAM. I am assuming the slowdown is something to do with that, i.e. the data hits some threshold and the CSV only gets written once that limit is reached.
Apologies if I've missed out any information. I'm a bit of a novice and will do my best to explain myself.
Cheers!
CodePudding user response:
At the moment your script is adding a 20 second delay for each entry processed on each page, not for each page. Normally it is good practice to slow down requests to a website by adding a delay between page requests, but because your delay is per entry, a page with ten entries spends over three minutes just sleeping before it moves on.
You need to move the time.sleep(20) call to above your for loop. I would also suggest that a value of 3 seconds will probably suffice.
For example:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib as plt
import time
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
"Accept-Language": "en-US,en;q=0.5",
"Referer": "https://google.co.uk",
"DNT": "1"
}
page = 1
scotland_houses = []
while page != 5000:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    print(url)
    response = requests.get(url, headers=headers)
    html = response.content
    soup = bs(html, "lxml")
    time.sleep(3)  # 3 second delay between page requests
    for address in soup.find_all("div", class_="c-imWYEM c-imWYEM-pnwXf-hasUprn-true"):
        scotland_house = {}
        # Getting the address
        scotland_house["Address"] = address.h3.get_text(strip=True)
        # Getting each address's unique URL
        scotland_house_url = f'https://www.zoopla.co.uk{address.find("a")["href"]}'
        scotland_house["URL"] = scotland_house_url
        scotland_house_url_response = requests.get(
            scotland_house_url, headers=headers)
        scotland_house_soup = bs(scotland_house_url_response.content, 'lxml')
        # Lists status of the property
        try:
            scotland_house["Status"] = [status.get_text(strip=True) for status in scotland_house_soup.find_all(
                "span", class_="css-1jwes4a-Tag-Status-TimelineBadge e15fknhb0")]
        except AttributeError:
            scotland_house["Status"] = ""
        # Lists the date of the status of the property
        try:
            scotland_house["Status Date"] = [status_date.get_text(strip=True) for status_date in scotland_house_soup.find_all(
                "p", class_="css-11dmkz9-Text-StyledDateText e15fknhb1")]
        except AttributeError:
            scotland_house["Status Date"] = ""
        # Lists the value of the property
        try:
            scotland_house["Value (£)"] = [value.get_text(strip=True).replace(",", "").replace("£", "") for value in scotland_house_soup.find_all(
                "p", class_="css-1x01gac-Text eczcs4p0")]
        except AttributeError:
            scotland_house["Value"] = ""
        scotland_houses.append(scotland_house)
    page = page + 1
# for house in scotland_houses[:]:
#     print(house)
houses_df = pd.DataFrame(scotland_houses)
houses_df['ID'] = houses_df.index + 1
houses_df = houses_df.set_index(['ID']).apply(pd.Series.explode).reset_index()
houses_df['Status Date'] = pd.to_datetime(houses_df['Status Date'], format='%B %Y', errors='coerce')
houses_df["Value (£)"] = houses_df['Value (£)'].astype(str).astype(float)
print(houses_df)
print(houses_df.dtypes)
houses_df.to_csv(r'example\location.csv')
You should also add code to stop making further requests when no more entries are being returned rather than requesting a fixed number of pages.
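For example, here is a minimal sketch of that check, reusing the imports, headers and find_all selector from the code above (the inner loop body that builds each scotland_house is elided here):
page = 1
scotland_houses = []
while True:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    response = requests.get(url, headers=headers)
    soup = bs(response.content, "lxml")
    time.sleep(3)  # delay between page requests
    entries = soup.find_all("div", class_="c-imWYEM c-imWYEM-pnwXf-hasUprn-true")
    if not entries:
        # no property entries on this page, so we have reached the end of the results
        break
    for address in entries:
        pass  # build scotland_house exactly as above and append it to scotland_houses
    page = page + 1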
CodePudding user response:
Within your while loop, right at the end of creating your data set for each page, you could do something like this:
while page != 5000:
    page_houses = []
    ...
    # scrape the data however you want
    # append each property to page_houses
    ...
    df = pd.DataFrame(page_houses)
    with open('output.csv', 'a', newline='') as f:
        df.to_csv(f, mode='a', header=f.tell()==0, index=False)
    page += 1
This will write each page's data to the CSV as you go: the header row is only written when the file is empty (f.tell() == 0 is only true at the start of an empty file), and every subsequent page is appended below it. This is a far more memory-efficient way of doing it than holding everything in a list until the end.
While I'm here, Zoopla has a few ways to get data more easily than scraping volatile HTML tags. One way is to grab the JSON data that each page loads via its __NEXT_DATA__ script tag: soup.find('script', {'id': '__NEXT_DATA__'}).string.
This contains a lot of raw data in nicely formatted JSON; for example, you can get the property results like this:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import json
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
page = 1
while page != 3:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    response = requests.get(url, headers=headers)
    soup = bs(response.text, 'lxml')
    data = soup.find('script', {'id': '__NEXT_DATA__'}).string
    json_clean = json.loads(data)
    properties = json_clean['props']['pageProps']['data']['propertiesSearch']['edges']
    df = pd.json_normalize(properties)
    with open('output.csv', 'a', newline='') as f:
        df.to_csv(f, mode='a', header=f.tell()==0, index=False)
    print(f'Scraped page: {page}')
    page += 1
Or you could reverse engineer the GraphQL queries that run whenever you make a search. You'll need to copy an api-key from your browser: open Developer Tools -> Network -> Fetch/XHR, look for a graphql request, click on it, then scroll to the bottom of the Request Headers and you'll see it there.
import requests
import pandas as pd
import json
api_key = '3Vzj2wUfaP3euLsV4NV9h3UAVUR3BoWd5clv9Dvu'  # copied from the browser dev tools as described above
headers = {'x-api-key': api_key,
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
api_url = 'https://api-graphql-lambda.prod.zoopla.co.uk/graphql'
query = "query getListingData($path: String!) {\n breadcrumbs(path: $path) {\n label\n tracking\n uri\n noAppend\n i18nLabelKey\n __typename\n }\n seoAccordions(path: $path) {\n category\n expanded\n geo\n name\n propertyType\n section\n transactionType\n rows {\n links {\n category\n geo\n propertyType\n section\n transactionType\n uri\n __typename\n }\n __typename\n }\n links {\n category\n geo\n propertyType\n section\n transactionType\n uri\n __typename\n }\n __typename\n }\n discoverMore(path: $path) {\n housePricesUri\n findAgentsUri\n __typename\n }\n searchResults(path: $path) {\n analyticsTaxonomy {\n url\n areaName\n activity\n brand\n countryCode\n countyAreaName\n currencyCode\n listingsCategory\n outcode\n outcodes\n page\n postalArea\n radius\n radiusAutoexpansion\n regionName\n resultsSort\n searchGuid\n searchIdentifier\n section\n searchLocation\n viewType\n searchResultsCount\n expandedResultsCount\n totalResults\n emailAllFormShown\n emailAllTotalAgents\n bedsMax\n bedsMin\n priceMax\n priceMin\n location\n propertyType\n __typename\n }\n analyticsEcommerce {\n currencyCode\n impressions {\n id\n list\n position\n variant\n __typename\n }\n __typename\n }\n adTargeting {\n activity\n areaName\n bedsMax\n bedsMin\n brand\n brandPrimary\n countyAreaName\n countryCode\n currencyCode\n listingsCategory\n outcode\n outcodes\n page\n postalArea\n priceMax\n priceMin\n propertyType\n regionName\n resultsSort\n searchLocation\n searchResultsCount\n section\n totalResults\n url\n viewType\n __typename\n }\n metaInfo {\n title\n description\n canonicalUri\n __typename\n }\n pagination {\n pageNumber\n totalResults\n totalResultsWasLimited\n __typename\n }\n listings {\n regular {\n numberOfVideos\n numberOfImages\n numberOfFloorPlans\n numberOfViews\n listingId\n title\n publishedOnLabel\n publishedOn\n availableFrom\n priceDrop {\n lastPriceChangeDate\n percentageChangeLabel\n __typename\n }\n isPremium\n highlights {\n description\n label\n url\n __typename\n }\n otherPropertyImages {\n small\n large\n caption\n __typename\n }\n branch {\n name\n branchDetailsUri\n phone\n logoUrl\n branchId\n __typename\n }\n features {\n content\n iconId\n __typename\n }\n image {\n src\n caption\n responsiveImgList {\n width\n src\n __typename\n }\n __typename\n }\n transports {\n title\n poiType\n distanceInMiles\n features {\n zone\n tubeLines\n __typename\n }\n __typename\n }\n flag\n listingId\n priceTitle\n price\n alternativeRentFrequencyLabel\n address\n tags {\n content\n __typename\n }\n listingUris {\n contact\n detail\n __typename\n }\n __typename\n }\n extended {\n numberOfVideos\n numberOfImages\n numberOfFloorPlans\n numberOfViews\n listingId\n title\n publishedOnLabel\n publishedOn\n availableFrom\n priceDrop {\n lastPriceChangeDate\n percentageChangeLabel\n __typename\n }\n isPremium\n highlights {\n description\n label\n url\n __typename\n }\n otherPropertyImages {\n small\n large\n caption\n __typename\n }\n branch {\n name\n branchDetailsUri\n phone\n logoUrl\n branchId\n __typename\n }\n features {\n content\n iconId\n __typename\n }\n image {\n src\n caption\n responsiveImgList {\n width\n src\n __typename\n }\n __typename\n }\n transports {\n title\n poiType\n distanceInMiles\n features {\n zone\n tubeLines\n __typename\n }\n __typename\n }\n flag\n listingId\n priceTitle\n price\n alternativeRentFrequencyLabel\n address\n tags {\n content\n __typename\n }\n listingUris {\n contact\n detail\n __typename\n }\n __typename\n }\n featured {\n numberOfVideos\n 
numberOfImages\n numberOfFloorPlans\n numberOfViews\n listingId\n title\n publishedOnLabel\n publishedOn\n availableFrom\n priceDrop {\n lastPriceChangeDate\n percentageChangeLabel\n __typename\n }\n isPremium\n featuredType\n highlights {\n description\n label\n url\n __typename\n }\n otherPropertyImages {\n small\n large\n caption\n __typename\n }\n branch {\n name\n branchDetailsUri\n phone\n logoUrl\n __typename\n }\n features {\n content\n iconId\n __typename\n }\n image {\n src\n caption\n responsiveImgList {\n width\n src\n __typename\n }\n __typename\n }\n transports {\n title\n poiType\n distanceInMiles\n features {\n zone\n tubeLines\n __typename\n }\n __typename\n }\n flag\n listingId\n priceTitle\n price\n alternativeRentFrequencyLabel\n address\n tags {\n content\n __typename\n }\n listingUris {\n contact\n detail\n __typename\n }\n __typename\n }\n __typename\n }\n filters {\n fields {\n group\n fieldName\n label\n isRequired\n inputType\n placeholder\n allowMultiple\n options\n value\n __typename\n }\n __typename\n }\n links {\n saveSearch\n createAlert\n __typename\n }\n sortOrder {\n currentSortOrder\n options {\n i18NLabelKey\n value\n __typename\n }\n __typename\n }\n seoBlurb {\n category\n transactionType\n __typename\n }\n title\n userAlertId\n savedSearchAndAlerts {\n uri\n currentFrequency {\n i18NLabelKey\n value\n __typename\n }\n __typename\n }\n polyenc\n __typename\n }\n}\n"
page = 1
while page != 3:
    payload = {"operationName": "getListingData",
               "variables": {"path": f"/for-sale/property/edinburgh/?page_size=25&q=Edinburgh&radius=1&results_sort=newest_listings&search_source=facets&pn={page}"},
               "query": query}
    data = requests.post(api_url, headers=headers, data=json.dumps(payload)).json()
    properties = data['data']['searchResults']['listings']['regular']
    df = pd.json_normalize(properties)
    with open('output_data.csv', 'a', newline='') as f:
        df.to_csv(f, mode='a', header=f.tell()==0, index=False)
    print(f'Scraped page: {page}')
    page += 1