I'm trying to scrape rental listing data on Zillow. Specifically, I want the link, price, and address of each property. However, after scraping the first page successfully and clicking the next arrow button, it just displays the same listings even though the page shows I'm on page 2, 3, etc. How do I get the next page(s) listings? The project is supposed to use BeautifulSoup and Selenium, but after some research it looks like using only selenium is the easiest way to do this since Zillow uses lazy-loading.
main.py code:
# Path to the ChromeDriver binary (raw string so "\c" is never mis-read as an escape).
DRIVER_PATH = r"D:\chromedriver.exe"
FORM_URL = "HIDDEN"
# Single-quoted so the double quotes inside the JSON searchQueryState
# don't terminate the string literal.
WEBPAGE = ('https://www.zillow.com/toronto-on/rentals/?searchQueryState='
           '{"pagination":{},"mapBounds":{"west":-79.40771727189582,'
           '"east":-79.35750631913703,"south":43.639155005365474,'
           '"north":43.66405824004801},"mapZoom":15,'
           '"regionSelection":[{"regionId":792680,"regionType":6}],'
           '"isMapVisible":true,"filterState":{"fore":{"value":false},'
           '"ah":{"value":true},"sort":{"value":"days"},"auc":{"value":false},'
           '"nc":{"value":false},"fr":{"value":true},"sf":{"value":false},'
           '"tow":{"value":false},"fsbo":{"value":false},"cmsn":{"value":false},'
           '"fsba":{"value":false}},"isListVisible":true}')

data_entry = DataEntry(DRIVER_PATH)
# Opens the webpage and gets the count of total pages via data_entry.next_btns_len.
data_entry.open_webpage(WEBPAGE)

# n counts the result pages visited so far.
n = 1
# Scrape link, price, and address data on each page, then go to the next page.
while n < data_entry.next_btns_len + 1:  # fixed: the "+" was missing
    # Scrapes one page of data and adds it to the lists in the class object.
    data_entry.scrape_data()
    # Give the next page time to load before clicking through.
    sleep(5)
    data_entry.next_page()
    n += 1  # fixed: "n = 1" reset the counter and made the loop run forever
enter_data.py code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from time import sleep
class DataEntry:
    """Scrapes Zillow rental listings with Selenium and enters them into a Google Form."""

    def __init__(self, driver_path):
        # The "detach" option keeps the browser window open after execution.
        self.chrome_options = Options()
        self.chrome_options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(executable_path=driver_path,
                                       chrome_options=self.chrome_options)
        # Accumulators filled page-by-page by scrape_data().
        self.links = []
        self.prices = []
        self.addresses = []
        # Total number of result pages; set by open_webpage() for main.py's loop.
        self.next_btns_len = 0

    def open_webpage(self, webpage):
        """Open the listings page and record the total page count."""
        self.driver.get(webpage)
        # Give the page two seconds to load.
        sleep(2)
        # The pagination label reads e.g. "Page 1 of 9"; token index 3 is the total.
        # NOTE(review): this obfuscated class name changes whenever Zillow redeploys.
        page_nums = self.driver.find_element(
            By.CSS_SELECTOR, '.Text-c11n-8-69-2__sc-aiai24-0.gCvDSp')
        self.next_btns_len = int(page_nums.text.split()[3])

    def scrape_data(self):
        """Scrape link, price, and address for every listing on the current page."""
        # Zillow lazy-loads cards: scroll each listing into view so its data
        # is actually present in the DOM before querying it.
        n = 1
        while n < 41:
            listing = self.driver.find_element(
                By.XPATH, f'/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[{n}]')
            self.driver.execute_script("arguments[0].scrollIntoView(true);", listing)
            print(n)
            n += 1  # fixed: "n = 1" kept scrolling the first listing forever

        # Collect listing links. The site alternates between two DOM class
        # names, so if the first selector matches nothing, try the second.
        links = self.driver.find_elements(
            By.CSS_SELECTOR, ".list-card-info .list-card-link")
        link_list = [link.get_attribute("href") for link in links]
        if len(link_list) == 0:
            links = self.driver.find_elements(
                By.CSS_SELECTOR,
                ".StyledPropertyCardDataArea-c11n-8-69-2__sc-yipmu-0.dZxoFm.property-card-link")
            link_list = [link.get_attribute("href") for link in links]
        self.links.extend(link_list)
        print(len(self.links))
        print(self.links)

        # Collect prices, with the same two-selector fallback.
        prices = self.driver.find_elements(By.CSS_SELECTOR, ".list-card-price")
        price_list = [price.text for price in prices]
        if len(price_list) == 0:
            prices = self.driver.find_elements(
                By.CSS_SELECTOR,
                ".StyledPropertyCardDataArea-c11n-8-69-2__sc-yipmu-0.kJFQQX")
            price_list = [price.text for price in prices]
        # Keep only the first whitespace-separated token of each price and
        # strip the surrounding "C", " ", "/", "m", "o" characters.
        split_price_list = [price.split() for price in price_list]
        final_price_list = [price[0].strip("C /mo") for price in split_price_list]
        self.prices.extend(final_price_list)
        print(len(self.prices))
        print(self.prices)

        # Collect addresses, again with the fallback selector.
        addresses = self.driver.find_elements(By.CSS_SELECTOR, ".list-card-addr")
        address_list = [address.text for address in addresses]
        if len(address_list) == 0:
            addresses = self.driver.find_elements(
                By.CSS_SELECTOR,
                ".StyledPropertyCardDataArea-c11n-8-69-2__sc-yipmu-0.dZxoFm.property-card-link address")
            address_list = [address.text for address in addresses]
        self.addresses.extend(address_list)
        print(len(self.addresses))
        print(self.addresses)

    def next_page(self):
        """Click the next-page arrow and wait for the new results to load."""
        next_arrow = self.driver.find_element(By.XPATH, "//a[@title='Next page']")
        next_arrow.click()
        sleep(5)

    def close_webpage(self):
        """Shut down the browser session."""
        self.driver.quit()

    def enter_data(self, form_url, address, rent, link):
        """Fill one listing's address, rent, and link into the Google Form and submit."""
        # Open the Google Form and give it time to load.
        self.driver.get(form_url)
        sleep(2)
        # The XPaths below are absolute paths to the three form inputs and the
        # submit button; they break if the form layout changes.
        address_input = self.driver.find_element(
            By.XPATH,
            '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[1]/div/div[1]/input')
        address_input.send_keys(address)
        rent_input = self.driver.find_element(
            By.XPATH,
            '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/input')
        rent_input.send_keys(rent)
        link_input = self.driver.find_element(
            By.XPATH,
            '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[3]/div/div/div[2]/div/div[1]/div/div[1]/input')
        link_input.send_keys(link)
        submit_btn = self.driver.find_element(
            By.XPATH,
            '//*[@id="mG61Hd"]/div[2]/div/div[3]/div[1]/div[1]/div/span/span')
        submit_btn.click()
CodePudding user response:
There is a less complex way to obtain the data you're looking for, using cloudscraper and pandas (and tqdm for convenience). You might also be in for a surprise, considering the time taken to get the data:
import cloudscraper
import pandas as pd
from tqdm import tqdm

scraper = cloudscraper.create_scraper()
df_list = []
for current_page in tqdm(range(1, 21)):
    # Literal JSON braces inside an f-string must be doubled ({{ }}) so that
    # only {current_page} is interpolated.
    url = (
        'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState='
        f'{{"pagination":{{"currentPage":{current_page}}},'
        '"mapBounds":{"west":-79.44174913987678,"east":-79.32347445115607,'
        '"south":43.57772225826024,"north":43.7254027835563},"mapZoom":13,'
        '"regionSelection":[{"regionId":792680,"regionType":6}],'
        '"isMapVisible":true,"filterState":{"isForSaleForeclosure":{"value":false},'
        '"isAllHomes":{"value":true},"sortSelection":{"value":"days"},'
        '"isAuction":{"value":false},"isNewConstruction":{"value":false},'
        '"isForRent":{"value":true},"isSingleFamily":{"value":false},'
        '"isTownhouse":{"value":false},"isForSaleByOwner":{"value":false},'
        '"isComingSoon":{"value":false},"isForSaleByAgent":{"value":false}},'
        '"isListVisible":true}'
        '&wants={"cat1":["listResults","mapResults"]}&requestId=6'
    )
    r = scraper.get(url)
    for x in r.json()['cat1']['searchResults']['listResults']:
        status = x['statusText']
        address = x['address']
        # Multi-unit buildings expose the price per unit; fall back to the
        # listing-level price for single listings.
        try:
            price = x['units'][0]['price']
        except Exception as e:
            price = x['price']
        # Building pages come back as relative paths; prefix the host.
        if not 'https://www.' in x['detailUrl']:
            url = 'https://zillow.com' + x['detailUrl']  # fixed: the "+" was missing
        else:
            url = x['detailUrl']
        df_list.append((address, price, url))

df = pd.DataFrame(df_list, columns=['Address', 'Price', 'Url'])
df.to_csv('renting_in_toronto.csv')
print(df)
This will save the data in a csv file, and print out:
100%
20/20 [00:16<00:00, 1.19it/s]
Address Price Url
0 2221 Yonge St, Toronto, ON C$1,900 https://zillow.com/b/Toronto-ON/43.70606,-79.3...
1 10 Yonge St, Toronto, ON C$2,100 https://zillow.com/b/10-yonge-st-toronto-on-BM...
2 924 Avenue Rd, Toronto, ON M5P 2K6 C$1,895/mo https://www.zillow.com/homedetails/924-Avenue-...
3 797 Don Mills Rd, Toronto, ON C$1,850 https://zillow.com/b/Toronto-ON/43.71951,-79.3...
4 15 Queens Quay E, Toronto, ON C$2,700 https://zillow.com/b/Toronto-ON/43.64202,-79.3...
... ... ...
You can install the packages with `pip install cloudscraper` and `pip install tqdm`. The URLs accessed are visible in Dev Tools (Network tab); they return JSON data which is loaded into the page by JavaScript.