Home > database >  Scrape Google search results with pagination using Python
Scrape Google search results with pagination using Python

Time:10-07

Hi, I have some code that scrapes Google search results and returns the link, title, and description of each page. However, the issue is that it scrapes only the first page. I want to add pagination and scrape multiple pages.

Can someone help me figure out how to add pagination? I tried several other examples that support pagination, but the issue with them was that they return only the URL. I would appreciate it if someone could help me figure out how to solve this.

Code:

import requests
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession


def get_source(url):
    """Download a page and return the requests_html response.

    Args:
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html,
        or None when the request raises (the error is printed).
    """
    try:
        return HTMLSession().get(url)
    except requests.exceptions.RequestException as exc:
        print(exc)

def get_results(query):
    """Run a Google search for *query* and return the raw response.

    Args:
        query (string): Search terms; URL-encoded before use.

    Returns:
        response (object): HTTP response object from get_source(), or
        None if the request failed.
    """
    # Bug fix: the original joined the base URL and the query with a
    # missing '+' operator, which is a SyntaxError.
    encoded = urllib.parse.quote_plus(query)
    return get_source("https://www.google.co.uk/search?q=" + encoded)


def parse_results(response):
    """Extract title, link, and snippet text from each organic result.

    Args:
        response (object): Response whose ``.html`` attribute supports
            ``find()`` (a requests_html response).

    Returns:
        list[dict]: One dict per result with 'title', 'link', and 'text'
        keys. Missing sub-elements produce empty strings instead of
        raising AttributeError.
    """
    css_identifier_result = ".tF2Cxc"
    css_identifier_title = "h3"
    css_identifier_link = ".yuRUbf a"
    css_identifier_text = ".IsZvec"

    output = []
    for result in response.html.find(css_identifier_result):
        title_el = result.find(css_identifier_title, first=True)
        link_el = result.find(css_identifier_link, first=True)
        text_el = result.find(css_identifier_text, first=True)
        # Robustness fix: Google changes its markup frequently, so any of
        # these sub-elements may be absent; the original crashed with
        # AttributeError on None. Fall back to '' instead.
        output.append({
            'title': title_el.text if title_el is not None else '',
            'link': link_el.attrs.get('href', '') if link_el is not None else '',
            'text': text_el.text if text_el is not None else '',
        })
    return output

def google_search(query):
    """Search Google for *query* and return the parsed result dicts."""
    return parse_results(get_results(query))


if __name__ == "__main__":
    query = input("Enter your value: ")
    results = google_search(query)
    # Bug fix: a bare 'results' expression only echoes in a REPL or
    # notebook; print explicitly so the script actually shows output.
    print(results)

CodePudding user response:

Here is a working example. You can increase or decrease the range of page numbers however you want. Sorry for the late answer; I was very busy.

Code:

import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest

class ElonSpider(scrapy.Spider):
    """Scrape Google result titles across multiple result pages."""

    name = 'elon'

    def start_requests(self):
        # Bug fixes: the original built the URL with missing '+' operators
        # (a SyntaxError) and "paginated" by varying the dpr= (device
        # pixel ratio) parameter, which fetched the same page repeatedly —
        # the logged output shows every title twice. Google paginates via
        # the start= parameter (10 results per page); widen the range for
        # more pages.
        base = ('https://www.google.co.uk/search?q="Elon Musk"'
                '&biw=1366&bih=625&dpr=1&productfilter=&sort=null')
        for page in range(0, 2):
            yield SeleniumRequest(
                url=f'{base}&start={10 * page}',
                wait_time=6,
                callback=self.parse)

    def parse(self, response):
        # Yield the title of every organic result box on the page.
        for box in response.xpath('//*[@class="tF2Cxc"]'):
            yield {
                'Title': box.xpath('.//*[@class="LC20lb DKV0Md"]/text()').get()
                }

    def spider_closed(self):
        # NOTE(review): scrapy_selenium keeps the webdriver on its
        # middleware, not on the spider — self.driver is presumably set
        # elsewhere; verify this hook actually fires with a driver bound.
        self.driver.close()

settings.py file:

You have to change/update the uncommented portion of your settings.py file like this.

# Scrapy settings for the spider above. Only the uncommented lines are
# active; the commented blocks are Scrapy's generated defaults, kept for
# reference.

# Bug fix: the original snippet called which('chromedriver') without
# importing it, which raises NameError when Scrapy loads settings.
from shutil import which

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'

# Obey robots.txt rules
# Disabled: Google's robots.txt disallows /search, so obeying it would
# block every request this spider makes.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'scrapy_sr.middlewares.ScrapySrSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'scrapy_sr.middlewares.ScrapySrDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    'scrapy_sr.pipelines.ScrapySrPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


# Middleware — route every request through scrapy_selenium's webdriver.

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}

# Selenium

SELENIUM_DRIVER_NAME = 'chrome'
# which() returns the chromedriver path from PATH, or None if absent.
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
# '--headless' if using chrome instead of firefox
SELENIUM_DRIVER_ARGUMENTS = ['-headless']

Output:

{'Title': 'Elon Musk - Wikipedia'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk (@elonmusk) | Twitter'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk - Forbes'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk | Tesla'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': '@elonmusk • Instagram photos and videos'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk | Biography, SpaceX, Tesla, & Facts | Britannica'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk - Wikipedia'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk - CNBC'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk - Tesla, Age & Family - Biography'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk (@elonmusk) | Twitter'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk - Forbes'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk | Tesla'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': '@elonmusk • Instagram photos and videos'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk | Biography, SpaceX, Tesla, & Facts | Britannica'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk - CNBC'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk - Tesla, Age & Family - Biography'}
2021-10-06 19:45:45 [scrapy.core.engine] INFO: Closing spider (finished)
2021-10-06 19:45:45 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://127.0.0.1:50161/session/aa8a20e9cebf8c1f4e8d47187031d540 {}
2021-10-06 19:45:45 [urllib3.connectionpool] DEBUG: http://127.0.0.1:50161 "DELETE /session/aa8a20e9cebf8c1f4e8d47187031d540 HTTP/1.1" 200 14
2021-10-06 19:45:45 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2021-10-06 19:45:48 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/response_bytes': 964822,
 'downloader/response_count': 2,
 'downloader/response_status_count/200': 2,
 'elapsed_time_seconds': 5.076303,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2021, 10, 6, 13, 45, 45, 768405),
 'item_scraped_count': 18,
  • Related