Scrapy / Form request to the next page, callback does not go to the next function

Time:02-21

Recently I started studying Scrapy and web scraping. I'm working on my first project and I got stuck. I'd appreciate it if someone could help me with the problem :)

I'm scraping the page http://esg.krx.co.kr/contents/02/02020000/ESG02020000.jsp

So far I've gotten to the point where my program scrapes all 77 pages (I know it's a bit hardcoded; I'll try to change that later) and gets the company_name and company_share_id. Now I'm trying to go to the company_page_url and send another POST request to get the data from the graph (not every company has the graph). However, it seems that parse_company_result is never called.

Below is my code:

import scrapy
import json
from scrapy.http import Request


class EsgKrx1Spider(scrapy.Spider):
    name = 'esg_krx1'
    allowed_domains = ['esg.krx.co.kr']

    def start_requests(self):
        # sending a POST request to the site
        return [scrapy.FormRequest("http://esg.krx.co.kr/contents/99/ESG99000001.jspx",
                                   formdata={'sch_com_nm': '',
                                             'sch_yy': '2021',
                                             'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                                             'code': '02/02020000/esg02020000',
                                             'pageFirstCall': 'Y'},
                                   callback=self.parse)]

    def parse(self, response):
        url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"

        total_pages = 77
        for page in range(total_pages):
            payload = {
                'sch_com_nm': '',
                'sch_yy': '2021',
                'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                'code': '02/02020000/esg02020000',
                'curPage': str(page + 1)
            }

            yield scrapy.FormRequest(url=url,
                                     method='POST',
                                     formdata=payload,
                                     callback=self.parse_result)

    def parse_result(self, response):
        dict_data = json.loads(response.text)

        # looping over the result and assigning the company name
        for i in dict_data['result']:
            company_name = i['com_abbrv']
            company_share_id = i['isu_cd']
            print(company_name, company_share_id)

            company_page_url = f"http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd={company_share_id}"
            yield Request(company_page_url)

            data_url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"

            headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            }

            # yield response.follow(url=data_url, method='POST', callback=self.parse_company_result, headers=headers)
            yield scrapy.FormRequest(url=data_url,
                                     method='POST',
                                     headers=headers,
                                     callback=self.parse_company_result)

    def parse_company_result(self, response):
        graph_data = json.loads(response.text)
        print(graph_data)


So my questions are:

How do I go to the company page URL?

Or maybe the request is correct, but I'm doing something wrong afterwards?

Maybe I'm not getting a response from the data_url?

I'd appreciate any help.

CodePudding user response:

I have updated your script, as there were quite a few errors. Namely:

  1. In parse_result it's best to hand the company URLs off to another callback rather than parsing them in the same function.
  2. You need to include the payload so the request URL returns the JSON; again, it's best to split these into separate parsers so you can see what is going on.

I have built a scraper that does this hierarchically, so that you can follow what's happening top-down.

Additional note:

  • cb_kwargs allows you to pass variables from one parser to another. Therefore, I can grab the company ID and name in parse_result and yield them in the last parser. Note: the company ID is needed for the payload in parse_company, so it's worth getting used to how cb_kwargs works.

import scrapy
import json
from scrapy.http import Request

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:97.0) Gecko/20100101 Firefox/97.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'en-GB,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Origin': 'http://esg.krx.co.kr',
    'Connection': 'keep-alive',
    'Referer': 'http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd=004710',
}

class EsgKrx1Spider(scrapy.Spider):
    name = 'esg_krx1'
    allowed_domains = ['esg.krx.co.kr']
    
    def start_requests(self):
        #sending a post request to the web
        return [scrapy.FormRequest("http://esg.krx.co.kr/contents/99/ESG99000001.jspx",
                                   formdata={'sch_com_nm': '',
                                             'sch_yy': '2021',
                                             'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                                             'code': '02/02020000/esg02020000',
                                             'pageFirstCall': 'Y'},
                                   callback=self.parse)]
    
    def parse(self, response):
        url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"
    
        total_pages = 77
        for page in range(total_pages):
            payload = {
                'sch_com_nm': '',
                'sch_yy': '2021',
                'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                'code': '02/02020000/esg02020000',
                'curPage': str(page + 1)
            }
    
            yield scrapy.FormRequest(url=url,
                                     method='POST',
                                     formdata=payload,
                                     callback=self.parse_result)
    
    def parse_result(self, response):
        dict_data = json.loads(response.text)
    
        # looping in the result and assigning the company name
        for i in dict_data['result']:
            company_name = i['com_abbrv']
            company_share_id = i['isu_cd']

            company_page_url = f"http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd={company_share_id}"
            yield Request(company_page_url,
                          # headers=headers,
                          callback=self.parse_company,
                          cb_kwargs={
                              'company_share_id': company_share_id,
                              'company_name': company_name
                          })

    def parse_company(self, response, company_share_id, company_name):
        """Grab the chart ID from the webpage and store it as a list."""
        chart_id = response.xpath("(//div[@class='CHART-AREA'])[1]//div//@id").get()
        chart_id = [chart_id.split("chart")[-1]]

        # Notice that the number at the end of 'code' in the payload changes for each chart
        for id_of_chart in chart_id:
            for code_no in range(1, 3):
                yield scrapy.FormRequest(
                    url='http://esg.krx.co.kr/contents/99/ESG99000001.jspx',
                    method='POST',
                    # headers=headers,
                    formdata={
                        'url_isu_cd': str(company_share_id),
                        'isu_cd': '',
                        'sch_com_nm': '',
                        'pagePath': '/contents/02/02010000/ESG02010000.jsp',
                        'code': f'02/02010000/esg02010000_0{code_no}',
                        'chartNo': f'{id_of_chart}'
                    },
                    callback=self.parse_company_result,
                    cb_kwargs={
                        'company_share_id': company_share_id,
                        'company_name': company_name
                    }
                )
        
    def parse_company_result(self, response, company_share_id, company_name):
        graph_data = json.loads(response.text)
        yield {
            'data': graph_data,
            'company_name': company_name,
            'company_share_id': company_share_id
        }

Output:

{'data': {'block1': [{'yy': '2019', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}, {'yy': '2020', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}, {'yy': '2021', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}]}, 'company_name': '아남전자', 'company_share_id': '008700'}

...
...