I'm trying to write a scraper that gets its domains from a database result. I'm able to get the data from the database, but I can't wrap my head around how to feed it to Scrapy. I've looked here and found many suggestions, but none of them really matches what I'm doing. When I run my code below, nothing happens, not even an error.
scraper.py
#import json
import json
#import database library
import psycopg2
#import scrapy library
import scrapy
# Open the PostgreSQL connection that the spider reads its domains from.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="mydb",
    user="dbuser",
    password="postgres",
)

# A cursor is the object used to run SQL and to walk over the returned rows.
query = conn.cursor()

# Run the SELECT now; the rows are read later by iterating over the cursor.
query.execute('SELECT info FROM domains')
#create scrapy class
class MySpider(scrapy.Spider):
    """Request the home page of every domain returned by the module-level cursor.

    The spider relies on two module-level objects: ``query``, a cursor on which
    the SELECT has already been executed, and ``conn``, its connection.
    """

    name = "scrap_domains"

    def start_requests(self):
        """Yield one request per domain found in the query result."""
        # Each row is a tuple of the selected columns.
        for row in query:
            # Assumes each column value is a mapping with a 'domain' key
            # (the decoded JSON document) - TODO confirm against the table.
            for item in row:
                domain_name = item['domain']
                # Request needs an absolute URL; calling it without one raises
                # TypeError, so build the URL from the bare host name.
                yield scrapy.Request(
                    url="https://" + domain_name,
                    callback=self.parse,
                )

    def parse(self, response):
        """Print the response object received for a domain."""
        print(response)

    def closed(self, reason):
        """Close the cursor and the connection once the crawl is over.

        Closing them at module level would run while the file is being loaded,
        i.e. before ``start_requests`` has read a single row.
        """
        query.close()
        conn.close()
CodePudding user response:
I finally got my scraper working. The problem was caused by closing the cursor and the database connection at the bottom of the file: those lines run as soon as the script is loaded, so the cursor was already closed before the spider had a chance to iterate over the results. Python is not async like Node, as I've been learning. A function should be written to detect when the iteration is finished and only then proceed with further tasks, but for the purpose of this example we simply comment those lines out, as done at the bottom of the file. I'm posting a detailed answer for future reference.
Notes: I use this scraper to scrape through a list of 300 million records stored in my database. Just change your limit per page and the code below will do the rest for you until it's all done. When it's finished, just grab your JSON file and upload it to your database. I suffered so that you don't have to.
I'm using PostgreSQL and store the data in JSONB. My table only has 2 columns and looks like this :
id (int) | info (jsonb)
1 | {"domain": "weerstation-aarschot.be","timestamp":1646106745}
2 | {"domain": "wereldvakanties.be","timestamp":1646106746}
3 | {"domain": "welzijnscentrum.be","timestamp":1646106747}
As per the Scrapy documentation, copy/paste the code below and run this command in your terminal to write all domains to a JSON file:
scrapy runspider scraper.py -o domains.json
Use the Selectors to extract HTML data from the body
scraper.py
#import datetime
from datetime import datetime
#calendar
import calendar
from email import header;
import time;
#import json
import json
from urllib import request
from wsgiref import headers
#import database library
import psycopg2
#import scrapy library
import scrapy
# Open the PostgreSQL connection that the spider reads its domains from.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="mydb",
    user="username",
    password="postgres",
)

# A cursor is the object used to run SQL and to walk over the returned rows.
query = conn.cursor()
#spiderclass
class MySpider(scrapy.Spider):
    """Crawl every domain stored in the ``domains`` table, one item per page.

    The spider relies on the module-level ``query`` cursor and ``conn``
    connection. Rows are read page by page so that very large tables can be
    processed without a single huge SELECT.
    """

    name = "domains"

    # Number of rows fetched per SELECT; lower it if memory is tight.
    page_size = 1000000

    # Used to compare <meta> attribute values without regard to ASCII case.
    _ASCII_UPPER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    _ASCII_LOWER = "abcdefghijklmnopqrstuvwxyz"

    def start_requests(self):
        """Yield one HTTPS request per domain, paging through the table."""
        limit = int(self.page_size)
        offset = 0
        while True:
            # Bound parameters instead of string concatenation: the driver
            # does the quoting and the statement text stays constant.
            query.execute(
                "SELECT info FROM domains ORDER BY id ASC LIMIT %s OFFSET %s",
                (limit, offset),
            )
            # Fetch the whole page in one go. Probing with fetchone() would
            # consume the first row of every page and silently drop it.
            rows = query.fetchall()
            for (info,) in rows:
                # Assumes the driver returns the jsonb column as a dict with a
                # 'domain' key - TODO confirm; rows without one are skipped.
                hostname = info.get('domain') if isinstance(info, dict) else None
                if not hostname:
                    continue
                yield scrapy.Request(url="https://" + hostname, callback=self.parse)
            # A short page means the table is exhausted.
            if len(rows) < limit:
                break
            # Advance by one page; assigning `limit` here would re-read the
            # second page forever.
            offset += limit

    def _meta_content(self, response, attribute, value):
        """Return the ``content`` of the first ``<meta>`` tag that matches.

        ``attribute`` is the attribute to test (``name`` or ``http-equiv``) and
        ``value`` the lower-case value it must have. Returns None if no tag
        matches.
        """
        expression = (
            '//meta[translate(@' + attribute + ', $upper, $lower)=$wanted]/@content'
        )
        return response.xpath(
            expression,
            upper=self._ASCII_UPPER,
            lower=self._ASCII_LOWER,
            wanted=value,
        ).get()

    def parse(self, response):
        """Yield a single dict describing the fetched page.

        The dict holds the URL and status, the meta information, and the
        links, image sources and text fragments found in the document.
        """
        # 'date_created' is local time, 'timestamp' is seconds since the epoch.
        date_created = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        timestamp = calendar.timegm(time.gmtime())

        # Header information and Open Graph tags are not extracted yet; the
        # keys are emitted with empty values to keep the item layout stable.
        content_language = ""
        protocol = ""
        og_title = ""
        og_type = ""
        og_url = ""
        og_image = ""
        og_site_name = ""
        og_description = ""

        # Meta information. Description, keywords and author live in the
        # `content` attribute of <meta> tags, not in elements of their own.
        favicon = response.css('link[rel~="icon"]::attr(href)').get()
        title = response.xpath('//title/text()').get()
        description = self._meta_content(response, 'name', 'description')
        keywords = self._meta_content(response, 'name', 'keywords')
        author = self._meta_content(response, 'name', 'author')
        content_type = self._meta_content(response, 'http-equiv', 'content-type')

        # Links to follow, image sources and text of the common text tags.
        links = response.css('a::attr(href)').getall()
        images = response.css('img').xpath('@src').getall()
        h1_text = response.css('h1::text').getall()
        h2_text = response.css('h2::text').getall()
        h3_text = response.css('h3::text').getall()
        span_text = response.css('span::text').getall()
        p_text = response.css('p::text').getall()
        div_text = response.css('div::text').getall()

        # Videos and the website score are not collected yet.
        videos = []
        websiteScore = ""

        # NOTE: the keys "favicon " and "images " carry a trailing space; it is
        # kept so that files written by earlier runs stay compatible.
        yield {
            'url': response.url,
            'status': response.status,
            "score": websiteScore,
            "type": content_type,
            "category": "",
            "industry": "",
            "timestamp": timestamp,
            "date_created": date_created,
            "headers": {
                "content_language": content_language,
                "protocol": protocol,
            },
            "metas": [
                {
                    "favicon ": favicon,
                    "title": title,
                    "description": description,
                    "keywords": keywords,
                    "author": author,
                }
            ],
            "open_graph": [
                {
                    "og_title": og_title,
                    "og_type": og_type,
                    "og_url": og_url,
                    "og_image": og_image,
                    "og_site_name": og_site_name,
                    "og_description": og_description,
                }
            ],
            "links": links,
            "h1_text": h1_text,
            "h2_text": h2_text,
            "h3_text": h3_text,
            "div_text": div_text,
            "p_text": p_text,
            "span_text": span_text,
            "images ": images,
            "videos": videos,
        }

    def closed(self, reason):
        """Close the cursor and the connection once the crawl is over."""
        query.close()
        conn.close()
#print result
#print(request)
# we close the cursor and conn both
#query.close()
#conn.close()
#scrapy runspider scraper.py -o domains.json
domains.json (example output)
[
{"url": "https://weerstation-aarschot.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535167.804621, "date_created": "2022-03-05 21:52:47", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
{"url": "https://wereldvakanties.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535168.069924, "date_created": "2022-03-05 21:52:48", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
{"url": "https://welzijnscentrum.be", "status": 200, "score": "", "page": "", "offset ": "", "per_page": "", "type": "", "category": "", "industry": "", "timestamp": 1646535168.096689, "date_created": "2022-03-05 21:52:48", "headers": {"content_language": "", "protocol": ""}, "metas": [{"favicon ": "", "title": "", "description": "", "keywords": "", "author": ""}], "open_graph": [{"og_title": "", "og_type": "", "og_url": "", "og_image": "", "og_site_name": "", "og_description": ""}], "links": [[]], "h_text": [[]], "div_text": [[]], "p_text": [[]], "images ": [[]], "videos": [[]]},
]