How can I scrape this website https://charliesmithrealty.net in the easiest way with Scrapy? What I am trying to do is send text parameters to the input fields to run a filtered search, then scrape the results. The site is dynamically loaded. I tried automating it with Selenium but could not get it to execute properly; it breaks at the final step. I searched the network tab for a JSON file but found nothing. I have rarely faced this kind of website before, so I have no idea how to approach it. Here is what I have tried so far:
import time

import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


class RealSpider(scrapy.Spider):
    name = 'real'
    start_urls = ['https://charliesmithrealty.net']

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.driver.set_window_size(1920, 1080)

    def parse(self, response):
        self.driver.get(response.url)

        # selecting Zip Code
        self.driver.find_element(By.XPATH, "//input[@type='location']").send_keys("80023")
        time.sleep(1)
        self.driver.find_element(By.XPATH, "//li[@lookup_field='zip_code']").click()
        time.sleep(1)

        # clicking on the Price field and choosing min & max values
        self.driver.find_element(By.XPATH, "(.//*[@class='bfg-dropdown'])[1]").click()
        time.sleep(1)
        self.driver.find_element(By.XPATH, "(.//*[contains(@class,'bfg-input-option-list bfg-option-list-min')])[2]/li[contains(text(),'$150K')]").click()
        time.sleep(1)
        self.driver.find_element(By.XPATH, "(.//*[contains(@class,'bfg-input-option-list bfg-option-list-max')])[2]/li[contains(text(),'$600K')]").click()
        time.sleep(1)

        # selecting beds and baths
        self.driver.find_element(By.XPATH, "//div[@id='BedsDropdownMenuButton']/select/option[@value='4']").click()
        self.driver.find_element(By.XPATH, "//div[@id='BathsDropdownMenuButton']/select/option[@value='3']").click()

        # search
        self.driver.find_element(By.XPATH, "//button[@aria-label='Submit Search']").click()
        time.sleep(3)

        # pagination: keep clicking "next" until the link disappears
        while True:
            try:
                more_button = self.driver.find_element(By.XPATH, "//a[@data-page='next']")
                more_button.click()
                time.sleep(1)
            except NoSuchElementException:
                break

        # grab the rendered HTML before closing the driver
        html = self.driver.page_source
        self.driver.close()

        resp = Selector(text=html)
        for urls in resp.xpath("//div[@id='bfg-map-gallery']/mbb-galleryitem/div/a"):
            yield {
                'URL': urls.xpath("./@href").get()
            }
CodePudding user response:
Here is a complete solution using standalone Selenium. Extracting data from this website the way you have described is very tough, and only plain Selenium handles it smoothly. I also tried integrating Selenium with BeautifulSoup, but that didn't work because the elements are loaded dynamically via JavaScript after the page loads.
Script:
# NOTE: several class-attribute values in the XPath/CSS selectors below were
# lost when this answer was posted; '...' marks where the original value
# belongs and must be filled in from the live page.
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

URL = 'https://charliesmithrealty.net'
driver.get(URL)
# driver.set_window_size(1920, 1080)
time.sleep(5)

# open the search widget
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//*[@class='...']/ul/li[2]/a"))).click()
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "(//*[@class='...']//ul)[2]/li[1]/a"))).click()

# enter the zip code
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '[name="area_search"]'))).send_keys('80023')

# open the price dropdown and pick min & max values
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//*[@class='...']/button"))).click()
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@class='...']/li)[2]"))).click()
WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, "(//*[@class='...']/li)[2]"))).click()

# beds: select by value
select = Select(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "__bedrooms_total_min"))))
select.select_by_value("3")

# baths: select by value
select = Select(WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, "baths_total_min"))))
select.select_by_value("2")

# submit the search
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//*[@class='...']"))).click()

data = []
wait = WebDriverWait(driver, 10)
for _ in range(2):  # the results here span two pages of "Show More"
    try:
        cards = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div[class='...']")))
        for card in cards:
            price = card.find_element(By.XPATH, ".//*[@class='...']").text
            bed = card.find_element(By.XPATH, "(.//*[@class='...'])[1]").text
            bed = bed.replace(',', '').strip()
            bathe = card.find_element(By.XPATH, "(.//*[@class='...'])[2]").text
            bathe = bathe.replace('\n,', '').strip()
            data.append({'price': price, 'bed': bed, 'bathe': bathe})

        # load the next page of results; find_element raises if the
        # button is gone, which drops us into the except and ends the loop
        load_more_button = driver.find_element(By.XPATH, "//a[contains(text(),'Show More')]")
        load_more_button.click()
        time.sleep(5)
    except Exception:
        break

df = pd.DataFrame(data)
print(df)
Output:
price bed bathe
0 $198,809 3 Beds 2 Baths
1 $198,103 8 Beds 4 Baths
2 $197,082 4 Beds 3 Baths
3 $196,963 4 Beds 4 Baths
4 $196,662 3 Beds 2 Baths
.. ... ... ...
67 $172,849 3 Beds 2 Baths
68 $172,624 5 Beds 2 Baths
69 $172,066 5 Beds 5 Baths
70 $171,187 3 Beds 3 Baths
71 $170,501 4 Beds 3 Baths
[72 rows x 3 columns]
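If you want to persist the results instead of just printing them, pandas can write the DataFrame straight to disk; the filename here is only an example:

df.to_csv('listings.csv', index=False)  # saves the scraped rows without the index column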
CodePudding user response:
Behind almost any JS-rendered website there is an AJAX request, and I traced the call this site makes.
That request is what actually delivers the rendered data, so there is no need for Selenium at all: send the same request yourself,
take the response, and process it.
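As a sketch of that approach in Python: the endpoint URL, parameter names, and JSON keys below are hypothetical placeholders, copy the real ones from the request you find in the browser's DevTools network tab.

import requests

# hypothetical endpoint and parameters -- replace with the actual request
# copied from the network tab (right-click the request > Copy as cURL helps)
API_URL = 'https://charliesmithrealty.net/api/search'  # placeholder URL
params = {
    'zip_code': '80023',     # placeholder parameter names
    'price_min': 150000,
    'price_max': 600000,
    'beds_min': 4,
    'baths_min': 3,
}
headers = {'User-Agent': 'Mozilla/5.0'}  # some sites reject requests with no UA

response = requests.get(API_URL, params=params, headers=headers)
response.raise_for_status()
listings = response.json()  # assumes the endpoint returns JSON

for item in listings.get('results', []):  # 'results' is a guessed key
    print(item)

Inside Scrapy the same idea translates to yielding a scrapy.Request against that endpoint and calling response.json() in the callback, so the whole Selenium layer disappears.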