How can I scrape this website https://charliesmithrealty.net in the easiest way with Scrapy? What I am trying to do is send text parameters to the input fields to run a filtered search, then scrape the results. The site is dynamically loaded. I tried automating it with Selenium but could not get it to execute properly; it breaks at the final step. I searched the network tab for a JSON file but found nothing. I have rarely faced this kind of website before, so I have no idea how to approach it. Here is what I have tried so far:
import time

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


class RealSpider(scrapy.Spider):
    name = 'real'
    start_urls = ['https://charliesmithrealty.net']

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.set_window_size(1920, 1080)

    def parse(self, response):
        self.driver.get(response.url)

        # selecting Zip Code
        self.driver.find_element(By.XPATH, "//input[@type='location']").send_keys("80023")
        time.sleep(1)
        self.driver.find_element(By.XPATH, "//li[@lookup_field='zip_code']").click()
        time.sleep(1)

        # clicking on the Price field and choosing min & max values
        self.driver.find_element(By.XPATH, "(.//*[@class='bfg-dropdown'])[1]").click()
        time.sleep(1)
        self.driver.find_element(By.XPATH, "(.//*[contains(@class,'bfg-input-option-list bfg-option-list-min')])[2]/li[contains(text(),'$150K')]").click()
        time.sleep(1)
        self.driver.find_element(By.XPATH, "(.//*[contains(@class,'bfg-input-option-list bfg-option-list-max')])[2]/li[contains(text(),'$600K')]").click()
        time.sleep(1)

        # selecting beds and baths
        self.driver.find_element(By.XPATH, "//div[@id='BedsDropdownMenuButton']/select/option[@value='4']").click()
        self.driver.find_element(By.XPATH, "//div[@id='BathsDropdownMenuButton']/select/option[@value='3']").click()

        # search
        self.driver.find_element(By.XPATH, "//button[@aria-label='Submit Search']").click()
        time.sleep(3)

        # pagination: keep clicking "next" until the link disappears
        while True:
            try:
                more_button = self.driver.find_element(By.XPATH, "//a[@data-page='next']")
                more_button.click()
                time.sleep(1)
            except NoSuchElementException:
                break

        # grab the rendered HTML before closing the driver
        html = self.driver.page_source
        self.driver.close()

        resp = Selector(text=html)
        for urls in resp.xpath("//div[@id='bfg-map-gallery']/mbb-galleryitem/div/a"):
            yield {
                'URL': urls.xpath("./@href").get()
            }
CodePudding user response:
Here is a complete solution using standalone Selenium. Extracting data from this website the way you have described is very tough, and only plain Selenium handles it smoothly. I also tried integrating Selenium with BeautifulSoup, but that didn't work because the elements are loaded dynamically via JavaScript after the page loads.
Script:
# NOTE: several class-attribute values in the XPath/CSS selectors below were
# lost when this answer was posted; '...' marks where the original value
# belongs and must be filled in from the live page.
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

URL = 'https://charliesmithrealty.net'
driver.get(URL)
# driver.set_window_size(1920, 1080)
time.sleep(5)

# open the search widget
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//*[@class='...']/ul/li[2]/a"))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "(//*[@class='...']//ul)[2]/li[1]/a"))).click()

# enter the zip code
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '[name="area_search"]'))).send_keys('80023')

# open the price dropdown and pick min & max values
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//*[@class='...']/button"))).click()
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@class='...']/li)[2]"))).click()
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@class='...']/li)[2]"))).click()

# beds: select by value
select = Select(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "__bedrooms_total_min"))))
select.select_by_value("3")

# baths: select by value
select = Select(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, "baths_total_min"))))
select.select_by_value("2")

# submit the search
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//*[@class='...']"))).click()

data = []
wait = WebDriverWait(driver, 10)
for _ in range(2):  # the results here span two pages of "Show More"
    try:
        cards = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div[class='...']")))
        for card in cards:
            price = card.find_element(By.XPATH, ".//*[@class='...']").text
            bed = card.find_element(By.XPATH, "(.//*[@class='...'])[1]").text
            bed = bed.replace(',', '').strip()
            bathe = card.find_element(By.XPATH, "(.//*[@class='...'])[2]").text
            bathe = bathe.replace('\n,', '').strip()
            data.append({'price': price, 'bed': bed, 'bathe': bathe})

        # load the next page of results; find_element raises if the
        # button is gone, which drops us into the except and ends the loop
        load_more_button = driver.find_element(By.XPATH, "//a[contains(text(),'Show More')]")
        load_more_button.click()
        time.sleep(5)
    except Exception:
        break

df = pd.DataFrame(data)
print(df)
Output:
price bed bathe
0 $198,809 3 Beds 2 Baths
1 $198,103 8 Beds 4 Baths
2 $197,082 4 Beds 3 Baths
3 $196,963 4 Beds 4 Baths
4 $196,662 3 Beds 2 Baths
.. ... ... ...
67 $172,849 3 Beds 2 Baths
68 $172,624 5 Beds 2 Baths
69 $172,066 5 Beds 5 Baths
70 $171,187 3 Beds 3 Baths
71 $170,501 4 Beds 3 Baths
[72 rows x 3 columns]
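If you want to persist the results instead of just printing them, pandas can write the DataFrame straight to disk; the filename here is only an example:

df.to_csv('listings.csv', index=False)  # saves the scraped rows without the index column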
CodePudding user response:
Behind almost any JS-rendered website there is an AJAX request, and I traced the call this site makes.
That request is what actually delivers the rendered data, so there is no need for Selenium at all: send the same request yourself,
take the response, and process it.
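As a sketch of that approach in Python: the endpoint URL, parameter names, and JSON keys below are hypothetical placeholders, copy the real ones from the request you find in the browser's DevTools network tab.

import requests

# hypothetical endpoint and parameters -- replace with the actual request
# copied from the network tab (right-click the request > Copy as cURL helps)
API_URL = 'https://charliesmithrealty.net/api/search'  # placeholder URL
params = {
    'zip_code': '80023',     # placeholder parameter names
    'price_min': 150000,
    'price_max': 600000,
    'beds_min': 4,
    'baths_min': 3,
}
headers = {'User-Agent': 'Mozilla/5.0'}  # some sites reject requests with no UA

response = requests.get(API_URL, params=params, headers=headers)
response.raise_for_status()
listings = response.json()  # assumes the endpoint returns JSON

for item in listings.get('results', []):  # 'results' is a guessed key
    print(item)

Inside Scrapy the same idea translates to yielding a scrapy.Request against that endpoint and calling response.json() in the callback, so the whole Selenium layer disappears.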