Scraping infinite scroll page using Scrapy Python


I am trying to scrape all the reviews from Steam games, but Steam uses infinite scroll until it reaches a certain number of reviews, and after that shows a button for loading more content. How do I navigate that with Scrapy?

My code:

import scrapy
from scrapy import Request

i = 1
url_ = 'https://steamcommunity.com/app/1506830/homecontent/?userreviewscursor=AoIIPwYYan/HzbUD&userreviewsoffset={}0&p={}&workshopitemspage={}&readytouseitemspage={}&mtxitemspage={}&itemspage={}&screenshotspage={}&videospage={}&artpage={}&allguidepage={}&webguidepage={}&integratedguidepage={}&discussionspage={}&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&l=english&appHubSubSection=10&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1'

# url_ is the request URL gathered from the Network tab while scrolling

class ReviewSpider(scrapy.Spider):
    name = 'steam_test'
    allowed_domains = ['store.steampowered.com']
    start_urls = ['https://steamcommunity.com/app/1517290/reviews/']

    
    def start_requests(self):
        yield Request(url, callback=self.parse)


    def parse(self, response):

        page = get_page(response)
        product_id = get_product_id(response)
    
        # Load all reviews on current page.
        reviews = response.css('div.apphub_CardContentMain')
        for review in reviews:
            yield {
                "text": review.css('div.apphub_CardTextContent::text').get(),
                "Recommend": review.css('div.title::text').get(),
                "date": review.css('div.date_posted::text').get()
            }

Pagination section that's not working:


        if response.status == 200:    
            global i 
            i = 1
            SteamFifaSpider.start_urls = url_.format(i,i,i,i,i,i,i,i,i,i,i,i,i)

How do I fix the infinite scroll for Steam?

CodePudding user response:

I tried to build the pagination with a for loop and range(), and in several other ways using the API URL, but it didn't work. Each time you scroll down, the page fires a new request URL that returns the next batch of data, and the following approach was the only way I could grab the right data. You can add as many request URLs to the list as you need, taken from the API responses in the Network tab.

import scrapy
class ReviewSpider(scrapy.Spider):
    name = 'steam_test'  
    def start_requests(self):
        urls = [
            'https://steamcommunity.com/app/1517290/homecontent/?userreviewscursor=AoIIPwwdfnPN+7QD&userreviewsoffset=10&p=2&workshopitemspage=2&readytouseitemspage=2&mtxitemspage=2&itemspage=2&screenshotspage=2&videospage=2&artpage=2&allguidepage=2&webguidepage=2&integratedguidepage=2&discussionspage=2&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&l=english&appHubSubSection=10&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1',
            'https://steamcommunity.com/app/1517290/homecontent/?userreviewscursor=AoIIPwcAR3zK6bUD&userreviewsoffset=10&p=3&workshopitemspage=4&readytouseitemspage=4&mtxitemspage=4&itemspage=4&screenshotspage=4&videospage=4&artpage=4&allguidepage=4&webguidepage=4&integratedguidepage=4&discussionspage=3&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&appid=1517290&appHubSubSection=10&appHubSubSection=10&l=english&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1',
            'https://steamcommunity.com/app/1517290/homecontent/?userreviewscursor=AoIIPwZ1R3CBq7UD&userreviewsoffset=20&p=4&workshopitemspage=5&readytouseitemspage=5&mtxitemspage=5&itemspage=5&screenshotspage=5&videospage=5&artpage=5&allguidepage=5&webguidepage=5&integratedguidepage=5&discussionspage=4&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&appid=1517290&appHubSubSection=10&appHubSubSection=10&l=english&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1',
            'https://steamcommunity.com/app/1517290/homecontent/?userreviewscursor=AoIIPwYYan750bUD&userreviewsoffset=30&p=5&workshopitemspage=6&readytouseitemspage=6&mtxitemspage=6&itemspage=6&screenshotspage=6&videospage=6&artpage=6&allguidepage=6&webguidepage=6&integratedguidepage=6&discussionspage=5&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&appid=1517290&appHubSubSection=10&appHubSubSection=10&l=english&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1',
            'https://steamcommunity.com/app/1517290/homecontent/?userreviewscursor=AoIIPwYYanGf+LQD&userreviewsoffset=40&p=6&workshopitemspage=7&readytouseitemspage=7&mtxitemspage=7&itemspage=7&screenshotspage=7&videospage=7&artpage=7&allguidepage=7&webguidepage=7&integratedguidepage=7&discussionspage=6&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&appid=1517290&appHubSubSection=10&appHubSubSection=10&l=english&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1',
            'https://steamcommunity.com/app/1517290/homecontent/?userreviewscursor=AoIIPwRTCXHZj7YD&userreviewsoffset=50&p=7&workshopitemspage=8&readytouseitemspage=8&mtxitemspage=8&itemspage=8&screenshotspage=8&videospage=8&artpage=8&allguidepage=8&webguidepage=8&integratedguidepage=8&discussionspage=7&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&appid=1517290&appHubSubSection=10&appHubSubSection=10&l=english&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1',
            'https://steamcommunity.com/app/1517290/homecontent/?userreviewscursor=AoIIPwJcDHbFhrYD&userreviewsoffset=60&p=8&workshopitemspage=9&readytouseitemspage=9&mtxitemspage=9&itemspage=9&screenshotspage=9&videospage=9&artpage=9&allguidepage=9&webguidepage=9&integratedguidepage=9&discussionspage=8&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&appid=1517290&appHubSubSection=10&appHubSubSection=10&l=english&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1'
            ]

        for url in urls:
            yield scrapy.Request(url, method='GET', callback=self.parse)

    def parse(self, response):
        # Load all reviews on current page.
        reviews = response.css('div.apphub_UserReviewCardContent')
        for review in reviews:
            yield {
                "text": review.xpath('.//*[@class="apphub_CardTextContent"]//text()').getall()[-1].strip(),
                "Recommend": review.css('div.title::text').get(),
                "date": review.css('div.date_posted::text').get()
            }
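
The cursor values baked into the URLs above expire, so a more durable variant is to pull the next-page parameters out of each AJAX response instead of hard-coding the list. Below is a minimal sketch of that idea; it assumes the homecontent response still embeds a form whose hidden inputs carry the next cursor and page counters (this is what the page's own "load more" button submits), so the spider name, the starting URL and the form selector are assumptions to check against the live markup.

import scrapy
from urllib.parse import urlencode

class ReviewCursorSpider(scrapy.Spider):
    name = 'steam_cursor_test'   # hypothetical name, pick anything unused in your project

    start_urls = [
        # First batch from the same AJAX endpoint, without a cursor
        # (assumption: the endpoint returns page one when no cursor is supplied).
        'https://steamcommunity.com/app/1517290/homecontent/?userreviewsoffset=0&p=1'
        '&numperpage=10&browsefilter=trendweek&l=english&appHubSubSection=10'
        '&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1'
    ]

    def parse(self, response):
        # Scrape all review cards in the current batch.
        reviews = response.css('div.apphub_UserReviewCardContent')
        for review in reviews:
            yield {
                "text": " ".join(review.css('div.apphub_CardTextContent::text').getall()).strip(),
                "Recommend": review.css('div.title::text').get(),
                "date": review.css('div.date_posted::text').get(),
            }

        # Assumption: the response ends with a form whose hidden inputs hold the
        # parameters for the next batch (cursor, offsets, page counters).
        # Collect them and rebuild the next request URL.
        params = {
            inp.attrib['name']: inp.attrib.get('value', '')
            for inp in response.css('form input[type="hidden"]')
            if 'name' in inp.attrib
        }
        if reviews and params:
            next_url = 'https://steamcommunity.com/app/1517290/homecontent/?' + urlencode(params)
            yield scrapy.Request(next_url, callback=self.parse)

Stopping when a response contains no review cards keeps the spider from requesting empty pages forever.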

An alternative way to build the pagination:

import scrapy
class ReviewSpider(scrapy.Spider):
    name = 'steam_test'  
    def start_requests(self):
        urls = ['https://steamcommunity.com/app/1517290/homecontent/?userreviewscursor=AoIIPwJcDHbFhrYD&userreviewsoffset=60&p=8&workshopitemspage=' + str(x) + '&readytouseitemspage=9&mtxitemspage=9&itemspage=9&screenshotspage=9&videospage=9&artpage=9&allguidepage=9&webguidepage=9&integratedguidepage=9&discussionspage=8&numperpage=10&browsefilter=trendweek&browsefilter=trendweek&appid=1517290&appHubSubSection=10&appHubSubSection=10&l=english&filterLanguage=default&searchText=&maxInappropriateScore=50&forceanon=1' for x in range(1,11)]

        for url in urls:
            yield scrapy.Request(url, method='GET', callback=self.parse)

    def parse(self, response):
        # Load all reviews on current page.
        reviews = response.css('div.apphub_UserReviewCardContent')
        for review in reviews:
            yield {
                "text": review.xpath('.//*[@class="apphub_CardTextContent"]//text()').getall()[-1].strip(),
                "Recommend": review.css('div.title::text').get(),
                "date": review.css('div.date_posted::text').get()
            }
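
Either spider can also be run without a full Scrapy project. Here is a minimal sketch using Scrapy's CrawlerProcess; the output file name reviews.json is an arbitrary choice, and the spider class is assumed to be defined (or imported) in the same file.

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # Export scraped items to a JSON file via the feed exports setting.
    "FEEDS": {"reviews.json": {"format": "json"}},
})
process.crawl(ReviewSpider)   # the spider class defined above
process.start()               # blocks until the crawl finishes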