Home > Software engineering >  How to paginate dynamically loaded (Load More button) website using Scrapy?
How to paginate dynamically loaded (Load More button) website using Scrapy?

Time:10-23

Can anyone guide me on how to paginate gelbeseiten? I got stuck in the pagination part. In the network tab, there is an ajaxsuche option, but I don't know what to do from here. Your guidance is appreciated. My code is given below.

import scrapy
from scrapy_selenium import SeleniumRequest


class Data2Spider(scrapy.Spider):
    """Scrape hotel contact details from gelbeseiten.de.

    Flow: load the Nürnberg hotel search-results page (Selenium-rendered),
    collect each listing's ``data-realid``, then visit every business
    profile page and yield a dict of contact fields.
    """

    name = 'data2'

    def start_requests(self):
        # Single Selenium-rendered entry point: the search-results page.
        yield SeleniumRequest(
            url="https://www.gelbeseiten.de/suche/hotels/nürnberg",
            callback=self.parse,
        )
        # NOTE: the original ended with `return super().start_requests()`.
        # Inside a generator that return value is discarded by Scrapy, and
        # re-invoking the default implementation is pointless — removed.

    def parse(self, response):
        """Follow every listing's data-realid to its profile page."""
        # Iterate the xpath results directly; the original buffered them
        # in an intermediate `temp` list for no benefit.
        for realid in response.xpath("//article/@data-realid").getall():
            yield SeleniumRequest(
                url=f"https://www.gelbeseiten.de/gsbiz/{realid}",
                callback=self.parse_data,
            )

    def parse_data(self, response):
        """Extract the contact fields from one business profile page."""
        yield {
                'URL': response.url,
                'Title': response.xpath("//div[@class='a']/h1/text()").get(),
                'Phone': response.xpath("//a[@class='nolink-black']/span/text()").get(),
                'Fax': response.xpath("//div[@class='mod-Kontaktdaten__list-item contains-icon-fax']/span/text()").get(),
                'email': response.xpath("normalize-space(//div[@class='mod-Kontaktdaten__list-item contains-icon-email']/a/text())").get(),
                'Website': response.xpath("normalize-space(//div[@class='mod-Kontaktdaten__list-item contains-icon-homepage']/a/text())").get()
            }

CodePudding user response:

I will demonstrate how to get those hotels' profile links with Requests. It's not difficult to translate it to Scrapy logic:

import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm ## if using Jupyter notebook, import as from tqdm.notebook import tqdm

# Query gelbeseiten's ajaxsuche endpoint directly instead of clicking the
# "Load More" button: each POST returns JSON whose 'html' field holds the
# next batch of <article> result cards.
#
# NOTE: the original script called pd.set_option(...) here, but pandas was
# never imported (NameError) nor used anywhere — those lines were removed.

headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
s = requests.Session()
s.headers.update(headers)

url = 'https://www.gelbeseiten.de/ajaxsuche'


def collect_page(session, position, anzahl):
    """POST one page of search results; return the profile URLs it contains.

    position -- zero-based offset of the first result to fetch
    anzahl   -- number of results to request
    """
    payload = {
        'umkreis': '-1', 'WAS': 'hotels', 'WO': 'nürnberg',
        'position': str(position), 'anzahl': str(anzahl),
        'sortierung': 'relevanz',
    }
    r = session.post(url, data=payload)
    html_data = bs(r.json()['html'], 'html.parser')
    # Fixed: the original concatenation was missing the `+` operator
    # ('https://.../gsbiz/' h.get('data-realid') is a SyntaxError).
    return ['https://www.gelbeseiten.de/gsbiz/' + h.get('data-realid')
            for h in html_data.select('article')]


# First request grabs the initial 50 results in one page...
hotels_urls = collect_page(s, 0, 50)

# ...then page through the remainder, 10 results at a time.
for x in tqdm(range(50, 200, 10)):
    hotels_urls.extend(collect_page(s, x, 10))

print('got', len(set(hotels_urls)), 'hotels')
print(set(hotels_urls))

Result printed in terminal:

100%
15/15 [00:05<00:00, 3.33it/s]
got 185 hotels
{'https://www.gelbeseiten.de/gsbiz/5359beb3-4426-42a1-aacc-b385b47dd714', 'https://www.gelbeseiten.de/gsbiz/f799c238-283a-4228-802b-9d60ffef407e', 'https://www.gelbeseiten.de/gsbiz/6f3d3371-3fb5-43b6-a6b5-fe65c230e54d', 'https://www.gelbeseiten.de/gsbiz/3252f849-b33d-4902-a077-9c21f0c190d4', 'https://www.gelbeseiten.de/gsbiz/d616a81c-b48d-414e-b094-1fdafb5d1dae', [...]}

This logic can be incorporated in the spider's start_requests() method, for example (start_urls is a class attribute, not a function).

For TQDM visit https://pypi.org/project/tqdm/

  • Related