Recently I started studying Scrapy and web scraping. I'm working on my first project and I got stuck. I would appreciate it if someone could help me with the problem :)
Im scraping the page http://esg.krx.co.kr/contents/02/02020000/ESG02020000.jsp
So far I have got to the point where my program scrapes all 77 pages (I know it's a bit hardcoded; I will try to change it later on) and gets the company_name and company_share_id. Now I'm trying to go to the company_page_url and again send a POST request to get the data from the graph (not every company has the graph). However, it seems like it never calls parse_company_result.
Below i upload my code:
import scrapy
import json
from scrapy.http import Request
class EsgKrx1Spider(scrapy.Spider):
    """Scrape company ESG ratings from esg.krx.co.kr.

    The listing page loads its data through an XHR POST to a JSON
    endpoint, so the spider talks to that endpoint directly instead of
    parsing the HTML shell.
    """

    name = 'esg_krx1'
    allowed_domains = ['esg.krx.co.kr']

    def start_requests(self):
        # Send the same POST the page itself fires on first load
        # ('pageFirstCall': 'Y') to obtain the first page of results.
        return [scrapy.FormRequest("http://esg.krx.co.kr/contents/99/ESG99000001.jspx",
                                   formdata={'sch_com_nm': '',
                                             'sch_yy': '2021',
                                             'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                                             'code': '02/02020000/esg02020000',
                                             'pageFirstCall': 'Y'},
                                   callback=self.parse)]

    def parse(self, response):
        """Request every result page of the company listing."""
        url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"
        # TODO: derive the page count from the first response instead
        # of hardcoding it.
        total_pages = 77
        for page in range(total_pages):
            payload = {
                'sch_com_nm': '',
                'sch_yy': '2021',
                'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                'code': '02/02020000/esg02020000',
                # Fixed: the original read `str(page 1)`, which is a
                # SyntaxError. Pages on this endpoint are 1-based.
                'curPage': str(page + 1),
            }
            yield scrapy.FormRequest(url=url,
                                     method='POST',
                                     formdata=payload,
                                     callback=self.parse_result)

    def parse_result(self, response):
        """Extract each company's name and share id, then fetch its pages."""
        dict_data = json.loads(response.text)
        for item in dict_data['result']:
            company_name = item['com_abbrv']
            company_share_id = item['isu_cd']
            print(company_name, company_share_id)
            company_page_url = f"http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd={company_share_id}"
            yield Request(company_page_url)
            data_url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"
            headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            }
            # NOTE(review): this POST sends no formdata, so the endpoint
            # has no way to know which company or chart is being asked
            # for -- presumably why parse_company_result never receives
            # usable graph data. The required payload (isu_cd, code,
            # chartNo) should be confirmed in the browser dev tools.
            yield scrapy.FormRequest(url=data_url,
                                     method='POST',
                                     headers=headers,
                                     callback=self.parse_company_result)

    def parse_company_result(self, response):
        """Decode and print the per-company graph JSON."""
        graph_data = json.loads(response.text)
        print(graph_data)
All the functions are of course inside the class; the code just didn't paste with the indentation I expected.
So my question is:
How do i go to the company page url?
Or maybe the request is correct, but later i do something wrong?
Maybe i don't get the response from the data_url?
I will appreciate all the help.
CodePudding user response:
I have updated your script as there were quite a few errors, namely:
- In `parse_result` it's best to create another function to parse the company URLs, as opposed to parsing them in the same one.
- You need to include the payload to parse the JSON from the request URL; again, it's best to split these into separate parsers so that you can see what is happening and what is going on.
I have built a scraper that does this in a hierarchical way so that you can understand what's happening top-down.
Additional note: `cb_kwargs` allows you to pass variables from one parser to another. Therefore, I can grab the company id and name from `parse_result` and yield them in the last parser. Note: the company id is important for the payload in `parse_company`. Therefore, you should get used to learning how `cb_kwargs` works.
import scrapy
import json
from scrapy.http import Request
# Browser-like request headers shared by the requests below (currently
# commented out where they would be used). X-Requested-With marks the
# POSTs as XHR calls, matching what the site's own JavaScript sends.
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:97.0) Gecko/20100101 Firefox/97.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-GB,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'X-Requested-With': 'XMLHttpRequest',
'Origin': 'http://esg.krx.co.kr',
'Connection': 'keep-alive',
'Referer': 'http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd=004710',
}
class EsgKrx1Spider(scrapy.Spider):
    """Scrape per-company ESG graph data from esg.krx.co.kr.

    Pipeline (top-down): list all companies page by page, follow each
    company page to discover its chart id, then POST for the chart's
    JSON. Company name/id travel between parsers via ``cb_kwargs``.
    """

    name = 'esg_krx1'
    allowed_domains = ['esg.krx.co.kr']

    def start_requests(self):
        # Send the same POST the page itself fires on first load
        # ('pageFirstCall': 'Y') to obtain the first page of results.
        return [scrapy.FormRequest("http://esg.krx.co.kr/contents/99/ESG99000001.jspx",
                                   formdata={'sch_com_nm': '',
                                             'sch_yy': '2021',
                                             'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                                             'code': '02/02020000/esg02020000',
                                             'pageFirstCall': 'Y'},
                                   callback=self.parse)]

    def parse(self, response):
        """Request every result page of the company listing."""
        url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"
        total_pages = 77
        for page in range(total_pages):
            payload = {
                'sch_com_nm': '',
                'sch_yy': '2021',
                'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                'code': '02/02020000/esg02020000',
                # Fixed: the original read `str(page 1)`, which is a
                # SyntaxError. Pages on this endpoint are 1-based.
                'curPage': str(page + 1),
            }
            yield scrapy.FormRequest(url=url,
                                     method='POST',
                                     formdata=payload,
                                     callback=self.parse_result)

    def parse_result(self, response):
        """Extract company name/id and follow each company's page.

        The id and name ride along in ``cb_kwargs`` so the final parser
        can yield them next to the graph data.
        """
        dict_data = json.loads(response.text)
        for i in dict_data['result']:
            company_name = i['com_abbrv']
            company_share_id = i['isu_cd']
            company_page_url = f"http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd={company_share_id}"
            yield Request(company_page_url,
                          #headers=headers,
                          callback=self.parse_company, cb_kwargs={
                              'company_share_id': company_share_id,
                              'company_name': company_name
                          })

    def parse_company(self, response, company_share_id, company_name):
        """Grab the chart ID from the webpage and request its data."""
        chart_id = response.xpath("(//div[@class='CHART-AREA'])[1]//div//@id").get()
        chart_id = [chart_id.split("chart")[-1]]
        # Notice that the number at the end of `code` in the payload
        # changes for each chart, hence the inner loop over 1..2.
        for id_of_chart in chart_id:
            for code_no in range(1, 3):
                yield scrapy.FormRequest(
                    url='http://esg.krx.co.kr/contents/99/ESG99000001.jspx',
                    method='POST',
                    # headers=headers,
                    formdata={
                        'url_isu_cd': str(company_share_id),
                        'isu_cd': '',
                        'sch_com_nm': '',
                        'pagePath': '/contents/02/02010000/ESG02010000.jsp',
                        'code': f'02/02010000/esg02010000_0{code_no}',
                        'chartNo': f'{id_of_chart}'
                    },
                    callback=self.parse_company_result,
                    cb_kwargs={
                        'company_share_id': company_share_id,
                        'company_name': company_name
                    }
                )

    def parse_company_result(self, response, company_share_id, company_name):
        """Yield the decoded graph JSON alongside the company identity."""
        graph_data = json.loads(response.text)
        yield {
            'data': graph_data,
            'company_name': company_name,
            'company_share_id': company_share_id
        }
Output:
{'data': {'block1': [{'yy': '2019', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}, {'yy': '2020', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}, {'yy': '2021', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}]}, 'company_name': '아남전자', 'company_share_id': '008700'}
...
...