Here is my spider It is supposed to assign a list attained from google sheet to global variable denied. In the code this function is called just once , but in the logs it is executed as many times as post request to endpoint is executed (send_to_endpoint()). Where is the error?

import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
import json
from datetime import datetime
import json
import logging
import requests
# from scrapy.utils.project import get_project_settings

class Code1Spider(scrapy.Spider):
    name = 'c_cointelegraph'
    allowed_domains = ['cointelegraph.com']
    start_urls = ['https://cointelegraph.com/press-releases/']
    id = int(str(datetime.now().timestamp()).split('.')[0])
    gs_id = ''
    endpoint_url = ''
    def parse(self, response):
        #Returns settings values as dict
        self.gs_id = settings.get('GS_ID')
        self.endpoint_url = settings.get('ENDPOINT_URL')
        #assigns a list of stop words from GS to global variable
        self.denied = self.load_gsheet()        
        for i in response.xpath('//a[@]/@href').getall():
            yield Request(response.urljoin(i), callback = self.parsed)
    def parsed(self, response):
        #set deny_domains to current domain so we could get all external urls
        denied_domains = self.allowed_domains[0]
        links = LinkExtractor(deny_domains=denied_domains,restrict_xpaths=('//article[@]'))
        links = links.extract_links(response)
        links = [i.url for i in links]
        #checks the list of external links agains the list of stop words
        links = [i for i in links if not any(b in i  for b in self.denied)]
        company = response.xpath('//h2//text()').getall()
        if company: company = [i.split('About ')[-1].strip() for i in company if 'About ' in i.strip()]
        if company: company = company[0]
        else: company = ''
        d = {'heading' : response.xpath('//h1[@]/text()').get().strip(),
             'url' : response.url,
             'pubDate' : self.get_pub_date(response.xpath('//script[contains(text(),"datePublished")]/text()').get()),
             'links' : links, 
             'company_name' : company,
             'ScrapeID' : self.id,
        # is used for debuging. just to see printed item.    
        yield d    
        #create post request to endpoint
        req = self.send_to_endpoint(d)
        #send request to endpoint
        yield req
    def get_pub_date(self, d):
        d = json.loads(d)
        pub_date = d['datePublished']
        return pub_date
    def load_gsheet(self):
        #Loads a list of stop words from predefined google sheet
        url = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'.format(gs_id)        
        r = requests.get(url)
        denied = r.text.splitlines()[1:]
        return denied
    def send_to_endpoint(self, d):           
        url = self.endpoint_url
        r = scrapy.Request( url,  method='POST', 
                                  dont_filter = True)
        return r   

Whenever I yield req, load_gsheet() function is running as well triggering google sheets. If I comment out yield req, load_gsheet() is called just once as it is supposed to be. Why does this happen? I have triple check the code line by line, added comments. Have no idea what i miss.

CodePudding user response:

This is happening because you don't assign a callback to the request object that you construct in the send_to_endpoint() method.

The default callback is the parse method so all of the requests created in the send_to_endpoint method are automatically being sent to the parse method which calls the load_gsheet method for every single one of those post requests.

The solution is to either take the load_gsheet call out of the parse method, or explicitly assign a callback to all of the POST requests that isn't self.parse.

