I am trying to crawl this website, but I am getting an empty response. I am using Scrapy, and when I print the result of my XPath query I get an empty list. I thought this was a straightforward task, but I am unable to get the data from the table. Here is my code:
import scrapy
from scrapy import Request
class ShareInfoSpider(scrapy.Spider):
    """Fetch the NepseAlpha IPO calendar page and print the IPO table element.

    NOTE(review): this is the version that yields an empty result. The table
    appears to be filled in by JavaScript after page load, so the element
    queried below is presumably absent from the raw HTML Scrapy downloads.
    """

    name = 'share'

    def start_requests(self):
        """Yield the single request for the IPO calendar page."""
        url = "https://nepsealpha.com/investment-calandar/ipo"
        yield Request(url, callback=self.parse)

    def parse(self, response):
        """Print every selector matching the ``DataTables_Table_0`` table."""
        # The XPath selects <table> elements (not rows), hence the loop name.
        for table in response.xpath("//table[@id='DataTables_Table_0']"):
            print(table)
CodePudding user response:
The data you see is loaded from another URL via JavaScript, so you need a different method to get it. For example, using the requests and json modules:
import json
import requests
import pandas as pd
# Fields of the IPO table, in the order the page's DataTables widget lists them.
COLUMNS = (
    "symbol",
    "units",
    "opening_date",
    "closing_date",
    "issue_manager",
    "status",
    "view",
)

# Query string mimicking the DataTables server-side request the page sends.
# "draw" is a request counter; "start"/"length" select the slice of rows.
params = {"draw": "1"}
for index, column in enumerate(COLUMNS):
    prefix = f"columns[{index}]"
    params.update(
        {
            f"{prefix}[data]": column,
            f"{prefix}[name]": column,
            f"{prefix}[searchable]": "true",
            f"{prefix}[orderable]": "true",
            f"{prefix}[search][value]": "",
            f"{prefix}[search][regex]": "false",
        }
    )
params.update(
    {
        "start": "0",
        "length": "10",
        "search[value]": "",
        "search[regex]": "false",
    }
)

api_url = "https://nepsealpha.com/investment-calandar/ipo"
# Marks the request as an AJAX call, as the browser's DataTables request does.
headers = {"X-Requested-With": "XMLHttpRequest"}

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() surfaces HTTP errors instead of failing later in .json().
response = requests.get(api_url, params=params, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()

# uncomment to print all data
# print(json.dumps(data, indent=4))

# The table rows are under the "data" key of the JSON payload.
df = pd.DataFrame(data["data"])
print(df)
Prints:
id symbol company_name units opening_date closing_date issue_manager created_at updated_at price last_closing_date type url status view DT_RowIndex
0 3033 <a href='/stocks/BARAHI (Local)/info' target='_blank'><b>BARAHI (Local)</b></a> Barahi Hydropower Public Limited 250000 <span class='no-wrap'>2022-10-21</span> <span class='no-wrap'>2022-11-20</span> NIBL Ace Capital Limited 2022-11-04T03:37:01.000000Z 2022-11-04T03:37:01.000000Z 100 None Ordinary None <span text-align: center" >Open</span> None 1
1 3034 <a href='/stocks/PEOPLES/info' target='_blank'><b>PEOPLES</b></a> Peoples Hydropower Company Limited 5606390 <span class='no-wrap'>2022-11-11</span> <span class='no-wrap'>2022-11-15</span> Sanima Capital Limited 2022-11-04T06:05:01.000000Z 2022-11-04T06:05:01.000000Z 100 None Ordinary https://nepsealpha.com/announcement/peoples-hydropower-company-limited-issued-a-letter-of-invitation-to-the-general-public-to-issue-56-lakh-6-thousand-390-shares-of-rs-100-face-value-from-25th-of-kartik <span text-align: center" >Open</span> <a href='https://nepsealpha.com/announcement/peoples-hydropower-company-limited-issued-a-letter-of-invitation-to-the-general-public-to-issue-56-lakh-6-thousand-390-shares-of-rs-100-face-value-from-25th-of-kartik' target='_blank'><i class='fa fa-file-text'></i></a> 2
2 3031 <a href='/stocks/TJVCL (Final Call Money)/info' target='_blank'><b>TJVCL (Final Call Money)</b></a> None 3705000 <span class='no-wrap'>2022-09-20</span> <span class='no-wrap'>2022-11-03</span> Global IME Capital Limited 2022-09-22T10:32:00.000000Z 2022-09-22T10:34:43.000000Z 87 None Ordinary None <span style="color: #E81E62;text-align: center" >Closed</span> None 3
3 3032 <a href='/stocks/EASTERN/info' target='_blank'><b>EASTERN</b></a> None 670310 <span class='no-wrap'>2022-10-14</span> <span class='no-wrap'>2022-10-19</span> NMB Capital Limited 2022-09-29T11:02:00.000000Z 2022-10-18T04:27:53.000000Z 100 None Ordinary None <span style="color: #E81E62;text-align: center" >Closed</span> None 4
4 3030 <a href='/stocks/PEOPLES (Local)/info' target='_blank'><b>PEOPLES (Local)</b></a> None 3200000 <span class='no-wrap'>2022-09-16</span> <span class='no-wrap'>2022-10-16</span> Sanima Capital Limited 2022-09-08T05:14:00.000000Z 2022-09-30T04:52:52.000000Z 100 None Ordinary https://nepsealpha.com/announcement/peoples-hydropower-company-limited-has-issued-an-invitation-letter-to-local-residents-affected-by-the-project-to-issue-32-lakh-ordinary-shares-of-rs-100 <span style="color: #E81E62;text-align: center" >Closed</span> <a href='https://nepsealpha.com/announcement/peoples-hydropower-company-limited-has-issued-an-invitation-letter-to-local-residents-affected-by-the-project-to-issue-32-lakh-ordinary-shares-of-rs-100' target='_blank'><i class='fa fa-file-text'></i></a> 5
5 3025 <a href='/stocks/SJCL (Local)/info' target='_blank'><b>SJCL (Local)</b></a> None 3650000 <span class='no-wrap'>2022-08-21</span> <span class='no-wrap'>2022-10-16</span> Citizen Investment Trust 2022-08-09T06:56:00.000000Z 2022-09-22T07:34:50.000000Z 100 None Ordinary https://nepsealpha.com/announcement/invitation-letter-regarding-the-issuance-and-sale-of-36-lakh-50-thousand-shares-of-sanjen-jalvidyut-company-limited-with-a-face-value-of-rs-100 <span style="color: #E81E62;text-align: center" >Closed</span> <a href='https://nepsealpha.com/announcement/invitation-letter-regarding-the-issuance-and-sale-of-36-lakh-50-thousand-shares-of-sanjen-jalvidyut-company-limited-with-a-face-value-of-rs-100' target='_blank'><i class='fa fa-file-text'></i></a> 6
6 3029 <a href='/stocks/SIKLES/info' target='_blank'><b>SIKLES</b></a> None 1058000 <span class='no-wrap'>2022-09-14</span> <span class='no-wrap'>2022-09-18</span> B.O.K Capital Market Limited 2022-09-06T06:55:28.000000Z 2022-09-06T06:55:28.000000Z 100 None Ordinary https://nepsealpha.com/announcement/10-lakh-58-thousand-ordinary-shares-of-siklesvhydropower-limited-with-face-value-of-rs100 <span style="color: #E81E62;text-align: center" >Closed</span> <a href='https://nepsealpha.com/announcement/10-lakh-58-thousand-ordinary-shares-of-siklesvhydropower-limited-with-face-value-of-rs100' target='_blank'><i class='fa fa-file-text'></i></a> 7
7 3026 <a href='/stocks/EASTERN (Local)/info' target='_blank'><b>EASTERN (Local)</b></a> Eastern Hydropower Limited 620000 <span class='no-wrap'>2022-08-23</span> <span class='no-wrap'>2022-09-13</span> NMB Capital Limited 2022-08-14T06:40:00.000000Z 2022-09-06T06:47:21.000000Z 100 None Ordinary https://nepsealpha.com/announcement/eastern-hydropower-limited-issued-a-letter-of-invitation-to-issue-620000-shares-for-the-locals-of-bhojpur-district <span style="color: #E81E62;text-align: center" >Closed</span> <a href='https://nepsealpha.com/announcement/eastern-hydropower-limited-issued-a-letter-of-invitation-to-issue-620000-shares-for-the-locals-of-bhojpur-district' target='_blank'><i class='fa fa-file-text'></i></a> 8
8 3028 <a href='/stocks/SRIJNSIL/info' target='_blank'><b>SRIJNSIL</b></a> None 393750 <span class='no-wrap'>2022-08-31</span> <span class='no-wrap'>2022-09-04</span> Sunrise Capital Limited 2022-08-23T04:39:00.000000Z 2022-08-25T08:07:41.000000Z 100 None Ordinary https://nepsealpha.com/announcement/sirjansheel-laghubitta-to-float-ipo-from-15th-bhadra-to-general-public <span style="color: #E81E62;text-align: center" >Closed</span> <a href='https://nepsealpha.com/announcement/sirjansheel-laghubitta-to-float-ipo-from-15th-bhadra-to-general-public' target='_blank'><i class='fa fa-file-text'></i></a> 9
9 3027 <a href='/stocks/KHAPTAD/info' target='_blank'><b>KHAPTAD</b></a> None 132000 <span class='no-wrap'>2022-08-24</span> <span class='no-wrap'>2022-08-28</span> Prabhu Capital Limited 2022-08-16T01:17:10.000000Z 2022-08-16T01:17:10.000000Z 100 None Ordinary None <span style="color: #E81E62;text-align: center" >Closed</span> None 10
CodePudding user response:
Implementation using Scrapy
Example:
from scrapy.crawler import CrawlerProcess
import scrapy
import json
from bs4 import BeautifulSoup
from urllib.parse import urlencode
class ShareSpider(scrapy.Spider):
    """Scrape IPO symbols from the NepseAlpha IPO calendar JSON endpoint.

    The endpoint is queried with DataTables server-side parameters. Each
    request asks for ``page_length`` rows beginning at a different ``start``
    offset, so consecutive requests return consecutive pages rather than the
    same first page.
    """

    name = "market"
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }

    api_url = "https://nepsealpha.com/investment-calandar/ipo"
    # Fields of the IPO table, in the order sent as columns[0..6].
    columns = (
        "symbol",
        "units",
        "opening_date",
        "closing_date",
        "issue_manager",
        "status",
        "view",
    )
    page_length = 10  # rows requested per page
    page_count = 2  # number of pages to request

    def _build_params(self, draw, start):
        """Return the DataTables query parameters for one page request.

        ``draw`` is the request sequence counter; ``start`` is the zero-based
        offset of the first row to return.
        """
        params = {"draw": draw}
        for index, column in enumerate(self.columns):
            prefix = f"columns[{index}]"
            params.update(
                {
                    f"{prefix}[data]": column,
                    f"{prefix}[name]": column,
                    f"{prefix}[searchable]": "true",
                    f"{prefix}[orderable]": "true",
                    f"{prefix}[search][value]": "",
                    f"{prefix}[search][regex]": "false",
                }
            )
        params.update(
            {
                "start": start,
                "length": self.page_length,
                "search[value]": "",
                "search[regex]": "false",
            }
        )
        return params

    def start_requests(self):
        """Yield one AJAX-style GET request per page of the IPO table."""
        headers = {"X-Requested-With": "XMLHttpRequest"}
        for page in range(self.page_count):
            # Advance "start" with the page; changing only "draw" would fetch
            # the first page again on every request.
            params = self._build_params(
                draw=page + 1,
                start=page * self.page_length,
            )
            yield scrapy.Request(
                url=f'{self.api_url}?{urlencode(params)}',
                method="GET",
                callback=self.parse,
                headers=headers,
            )

    def parse(self, response):
        """Yield a ``{"symbol": ...}`` item for every row in the JSON payload."""
        json_response = json.loads(response.body)
        res = json_response["data"]
        print(res)
        for data in res:
            # The "symbol" field is an HTML anchor; keep only its visible text.
            yield {
                "symbol": BeautifulSoup(data["symbol"], 'html.parser').get_text(strip=True)
            }
if __name__ == "__main__":
    # CrawlerProcess takes (optional) settings, not a spider; the spider class
    # is handed to crawl(), which schedules it to run.
    process = CrawlerProcess()
    process.crawl(ShareSpider)
    # Starts the reactor and blocks until the crawl has finished.
    process.start()
Output:
{'symbol': 'KHAPTAD'}
2022-11-14 23:11:45 [scrapy.core.engine] INFO: Closing spider (finished)
2022-11-14 23:11:45 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 3948,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 17666,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'elapsed_time_seconds': 1.153155,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 11, 14, 17, 11, 45, 921830),
'item_scraped_count': 20,