Hi, I have code that scrapes Google search results and returns the link, title, and description of each result. However, it only scrapes the first page. I want to add pagination and scrape multiple pages.
Can someone help me figure out how to add pagination? I tried several other examples that support pagination, but they return only the URL. I would appreciate any help figuring out how to solve this.
Code:
import urllib
import urllib.parse

import pandas as pd
import requests
from requests_html import HTML
from requests_html import HTMLSession
def get_source(url, timeout=10):
    """Return the HTTP response for the provided URL.

    Args:
        url (string): URL of the page to scrape.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        response (object): HTTP response object from requests_html, or
        None when the request failed (the error is printed).
    """
    try:
        session = HTMLSession()
        # Without a timeout the call can block forever on a stalled server;
        # a timeout surfaces as a RequestException and is handled below.
        response = session.get(url, timeout=timeout)
        return response
    except requests.exceptions.RequestException as e:
        print(e)
        # Make the failure result explicit instead of falling off the end.
        return None
def get_results(query, start=0):
    """Fetch a Google results page for the given search phrase.

    Args:
        query (string): Raw search phrase; it is URL-encoded here.
        start (int): Offset of the first result to request. 0 (the default)
            requests the first page. NOTE(review): Google is assumed to
            serve 10 results per page, so page N would be start=N*10 -
            confirm against live responses.

    Returns:
        response (object): Whatever get_source returns for the built URL.
    """
    query = urllib.parse.quote_plus(query)
    # The pieces must be joined with "+"; two adjacent expressions are a
    # syntax error.
    url = "https://www.google.co.uk/search?q=" + query
    if start:
        url += "&start=" + str(int(start))
    response = get_source(url)
    return response
def parse_results(response):
    """Extract title, link and snippet text from a results page.

    Args:
        response (object): Response whose ``html`` attribute supports
            ``find(selector)``; may be None when the request failed.

    Returns:
        list: One dict per result block with the keys 'title', 'link' and
        'text'. A field whose element is absent from the block is None.
        An empty list is returned when ``response`` is None.
    """
    css_identifier_result = ".tF2Cxc"
    css_identifier_title = "h3"
    css_identifier_link = ".yuRUbf a"
    css_identifier_text = ".IsZvec"

    # get_source yields None after a failed request; there is nothing to parse.
    if response is None:
        return []

    output = []
    for result in response.html.find(css_identifier_result):
        # find(..., first=True) gives None when nothing matches, so every
        # lookup is checked before its attributes are read.
        title = result.find(css_identifier_title, first=True)
        link = result.find(css_identifier_link, first=True)
        text = result.find(css_identifier_text, first=True)
        output.append({
            'title': title.text if title is not None else None,
            'link': link.attrs.get('href') if link is not None else None,
            'text': text.text if text is not None else None,
        })
    return output
def google_search(query):
    """Run a search and return the parsed results.

    Args:
        query (string): Search phrase handed to get_results.

    Returns:
        The value parse_results produces for the fetched response.
    """
    return parse_results(get_results(query))
# Read the search phrase from standard input.
query = input("Enter your value: ")
# Fetch and parse the results for that phrase.
results = google_search(query)
# Bare expression: an interactive shell or notebook echoes the value;
# when run as a plain script this line produces no output.
results
CodePudding user response:
Here is a working example. You can widen or narrow the range of page numbers as needed. Sorry for the late answer; I was very busy.
Code:
import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
class ElonSpider(scrapy.Spider):
    """Collect result titles from consecutive Google result pages.

    Each page is requested through SeleniumRequest and parsed for the
    title of every result block.
    """
    name = 'elon'
    # Search phrase; the double quotes are part of the query text.
    query = '"Elon Musk"'
    # Number of result pages to request (the original loop issued two requests).
    max_pages = 2
    # NOTE(review): assumes Google serves 10 results per page - confirm.
    results_per_page = 10

    def start_requests(self):
        """Yield one SeleniumRequest per result page.

        The original URL kept ``start=0`` fixed and varied ``dpr`` instead,
        so every request returned the same first page; here the ``start``
        offset advances with the page index.
        """
        # Function-scope import so the block brings its own dependency.
        from urllib.parse import urlencode

        for page in range(self.max_pages):
            # urlencode escapes the quotes and the space in the phrase.
            params = urlencode({
                'q': self.query,
                'start': page * self.results_per_page,
            })
            yield SeleniumRequest(
                url='https://www.google.co.uk/search?' + params,
                wait_time=6,
                callback=self.parse)

    def parse(self, response):
        """Yield a ``{'Title': ...}`` item for each result block on the page."""
        boxs = response.xpath('//*[@class="tF2Cxc"]')
        for box in boxs:
            yield {
                # .get() returns None when the title node is absent.
                'Title': box.xpath('.//*[@class="LC20lb DKV0Md"]/text()').get()
            }

    def spider_closed(self):
        """Close the Selenium driver if one was attached to this spider."""
        # Nothing in this class assigns self.driver, so guard the access
        # instead of raising AttributeError.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.close()
settings.py file:
Update your settings.py so that the uncommented settings match the following.
# User-Agent header value used for requests.
# NOTE(review): this is a desktop Chrome browser string, so it does not
# identify the crawler or its website.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
# Whether to obey robots.txt rules; False turns the robots.txt check off.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'scrapy_sr.middlewares.ScrapySrSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'scrapy_sr.middlewares.ScrapySrDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
# 'scrapy_sr.pipelines.ScrapySrPipeline': 300,
# }
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# `which` resolves the chromedriver executable on PATH; it must be imported
# before use or this settings module raises NameError.
from shutil import which

# Middleware
# Enable the scrapy-selenium downloader middleware.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}
# Selenium
SELENIUM_DRIVER_NAME = 'chrome'
# None when chromedriver is not found on PATH.
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
# Chrome takes '--headless'; '-headless' is the Firefox spelling.
SELENIUM_DRIVER_ARGUMENTS = ['--headless']
Output:
{'Title': 'Elon Musk - Wikipedia'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk (@elonmusk) | Twitter'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk - Forbes'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk | Tesla'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': '@elonmusk • Instagram photos and videos'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk | Biography, SpaceX, Tesla, & Facts | Britannica'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk - Wikipedia'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk - CNBC'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=1&productfilter=&sort=null>
{'Title': 'Elon Musk - Tesla, Age & Family - Biography'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk (@elonmusk) | Twitter'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk - Forbes'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk | Tesla'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': '@elonmusk • Instagram photos and videos'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk | Biography, SpaceX, Tesla, & Facts | Britannica'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk - CNBC'}
2021-10-06 19:45:45 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.google.co.uk/search?q="Elon Musk%22&ei=dEldYYS5Cpea4-EPnZGByAU&start=0&sa=N&ved=2ahUKEwiEw8OQm7XzAhUXzTgGHZ1IAFk4ChDy0wN6BAgBEDk&biw=1366&bih=625&dpr=2&productfilter=&sort=null>
{'Title': 'Elon Musk - Tesla, Age & Family - Biography'}
2021-10-06 19:45:45 [scrapy.core.engine] INFO: Closing spider (finished)
2021-10-06 19:45:45 [selenium.webdriver.remote.remote_connection] DEBUG: DELETE http://127.0.0.1:50161/session/aa8a20e9cebf8c1f4e8d47187031d540 {}
2021-10-06 19:45:45 [urllib3.connectionpool] DEBUG: http://127.0.0.1:50161 "DELETE /session/aa8a20e9cebf8c1f4e8d47187031d540 HTTP/1.1" 200 14
2021-10-06 19:45:45 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2021-10-06 19:45:48 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/response_bytes': 964822,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'elapsed_time_seconds': 5.076303,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 10, 6, 13, 45, 45, 768405),
'item_scraped_count': 18,