Here is my spider. It is supposed to assign a list fetched from a Google Sheet to the class attribute `denied`. In the code this function is called just once, but in the logs it executes as many times as POST requests are sent to the endpoint (`send_to_endpoint()`). Where is the error?
```python
import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
import json
from datetime import datetime
import logging
import requests
# from scrapy.utils.project import get_project_settings


class Code1Spider(scrapy.Spider):
    name = 'c_cointelegraph'
    allowed_domains = ['cointelegraph.com']
    start_urls = ['https://cointelegraph.com/press-releases/']
    id = int(str(datetime.now().timestamp()).split('.')[0])
    denied = []
    gs_id = ''
    endpoint_url = ''

    def parse(self, response):
        # Returns settings values as a dict
        settings = self.settings.copy_to_dict()
        self.gs_id = settings.get('GS_ID')
        self.endpoint_url = settings.get('ENDPOINT_URL')
        # Assigns the list of stop words from the Google Sheet to the class attribute
        self.denied = self.load_gsheet()
        for i in response.xpath('//a[@]/@href').getall():
            yield Request(response.urljoin(i), callback=self.parsed)

    def parsed(self, response):
        # Set deny_domains to the current domain so we only get external urls
        denied_domains = self.allowed_domains[0]
        links = LinkExtractor(deny_domains=denied_domains, restrict_xpaths=('//article[@]'))
        links = links.extract_links(response)
        links = [i.url for i in links]
        # Checks the list of external links against the list of stop words
        links = [i for i in links if not any(b in i for b in self.denied)]
        company = response.xpath('//h2//text()').getall()
        if company:
            company = [i.split('About ')[-1].strip() for i in company if 'About ' in i.strip()]
        if company:
            company = company[0]
        else:
            company = ''
        d = {'heading': response.xpath('//h1[@]/text()').get().strip(),
             'url': response.url,
             'pubDate': self.get_pub_date(response.xpath('//script[contains(text(),"datePublished")]/text()').get()),
             'links': links,
             'company_name': company,
             'ScrapeID': self.id,
             }
        # Used for debugging, just to see the printed item
        yield d
        # Create a POST request to the endpoint
        req = self.send_to_endpoint(d)
        # Send the request to the endpoint
        yield req

    def get_pub_date(self, d):
        d = json.loads(d)
        pub_date = d['datePublished']
        return pub_date

    def load_gsheet(self):
        # Loads a list of stop words from a predefined Google Sheet
        gs_id = self.gs_id
        url = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'.format(gs_id)
        r = requests.get(url)
        denied = r.text.splitlines()[1:]
        logging.info(denied)
        return denied

    def send_to_endpoint(self, d):
        url = self.endpoint_url
        r = scrapy.Request(url, method='POST',
                           body=json.dumps(d),
                           headers={'Content-Type': 'application/json'},
                           dont_filter=True)
        return r
```
Whenever I yield `req`, the `load_gsheet()` function runs as well, hitting Google Sheets. If I comment out `yield req`, `load_gsheet()` is called just once, as it is supposed to be. Why does this happen? I have triple-checked the code line by line and added comments. I have no idea what I am missing.
CodePudding user response:
This is happening because you don't assign a callback to the request object that you construct in the `send_to_endpoint()` method.

The default callback is the `parse` method, so all of the requests created in `send_to_endpoint` are automatically sent to the `parse` method, which calls the `load_gsheet` method for every single one of those POST requests.
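You can see this routing in isolation with a minimal standalone spider (an illustration with placeholder URLs, not the original code): any request yielded without a `callback` comes back to `parse()`.

```python
import scrapy


class CallbackDemoSpider(scrapy.Spider):
    name = 'callback_demo'
    start_urls = ['https://example.com/']

    def parse(self, response):
        self.logger.info('parse() called for %s', response.url)
        # No callback is given here, so Scrapy routes this request's
        # response back to parse() as well -- just like the POST requests
        # built in send_to_endpoint().
        if response.url == self.start_urls[0]:
            yield scrapy.Request('https://example.com/endpoint', dont_filter=True)
```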
The solution is to either take the `load_gsheet` call out of the `parse` method, or explicitly assign a callback to all of the POST requests that isn't `self.parse`.
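Here is a minimal sketch of both options. The `handle_endpoint_response` name and the `start_requests` placement are my assumptions, not part of the original spider; the remaining methods stay as in the question.

```python
import json
import scrapy


class Code1Spider(scrapy.Spider):
    name = 'c_cointelegraph'
    allowed_domains = ['cointelegraph.com']
    start_urls = ['https://cointelegraph.com/press-releases/']

    def start_requests(self):
        # Option 1: do the one-off setup here instead of in parse(), so the
        # Google Sheet is downloaded exactly once, before any page is parsed.
        settings = self.settings.copy_to_dict()
        self.gs_id = settings.get('GS_ID')
        self.endpoint_url = settings.get('ENDPOINT_URL')
        self.denied = self.load_gsheet()  # load_gsheet() as defined in the question
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def send_to_endpoint(self, d):
        # Option 2: give the POST request an explicit callback so its
        # response is no longer routed to the default self.parse.
        return scrapy.Request(
            self.endpoint_url,
            method='POST',
            body=json.dumps(d),
            headers={'Content-Type': 'application/json'},
            dont_filter=True,
            callback=self.handle_endpoint_response,
        )

    def handle_endpoint_response(self, response):
        # Hypothetical sink callback: just log the endpoint's status.
        self.logger.info('endpoint responded: %s', response.status)
```

Either change on its own stops the repeated `load_gsheet()` calls, since the endpoint responses no longer trigger the Google Sheets download.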