Hi, I am using Scrapy to scrape the website https://www.centralbankofindia.co.in. I get a response, but when I try to find the address by XPath I get None.
# Paginated branch-locator listing ("All" states), pages 0 through 4.
start_urls = [
    f"https://www.centralbankofindia.co.in/en/branch-locator?field_state_target_id=All&combine=&page={page}"
    for page in range(5)
]
brand_name = "Central Bank of India"
spider_type = "chain"
# Example address-cell XPaths copied from DevTools (rows 1-3):
# //*[@id="block-cbi-content"]/div/div/div/div[3]/div/table/tbody/tr[1]/td[2]/div/span[2]
# //*[@id="block-cbi-content"]/div/div/div/div[3]/div/table/tbody/tr[2]/td[2]/div/span[2]
# //*[@id="block-cbi-content"]/div/div/div/div[3]/div/table/tbody/tr[3]/td[2]/div/span[2]
def parse(self, response, **kwargs):
    """Parse a branch-locator page and print each branch address.

    Fix: the original absolute XPath included ``tbody``, which browsers
    insert in DevTools but which is typically absent from the raw HTML
    that Scrapy receives, so ``extract_first()`` returned ``None``.
    Skipping the intermediate elements with ``//`` makes the selector
    match the served markup, and ``/text()`` drops the surrounding tags.
    """
    addresses = response.xpath(
        '//*[@id="block-cbi-content"]//td[2]//span[2]/text()'
    ).extract()
    for address in addresses:
        print(address)
def get_text(self, response, path):
    """Return the first value matched by *path* in *response*, or None."""
    return response.xpath(path).extract_first()
The span element for the address on the website doesn't have a unique id — is that what is causing the problem?
CodePudding user response:
I think you created an XPath that is too complex. You should skip some elements and use `//` instead.
Some browsers may show `tbody` in DevTools, but it may not exist in the HTML that Scrapy gets from the server, so it is better to always skip it.
And you could use `extract()` to get all items at once instead of looping over `tr[{id}]` with `extract_first()`. This XPath works for me.
# Skip tbody and the intermediate divs with //; /text() returns the
# address text without the surrounding HTML tags.
all_items = response.xpath('//*[@id="block-cbi-content"]//td[2]//span[2]/text()').extract()
for address in all_items:
    print(address)
BTW: I used `text()` in the XPath to get the address without HTML tags.
Full working code. You can put it all in one file and run it as `python script.py` without creating a project. It saves the results in `output.csv`.
In `start_urls` I set only the link to the first page, because `parse()` searches the HTML for the link to the next page — so it can fetch all the pages instead of just `range(0, 5)`.
#!/usr/bin/env python3
import scrapy
class MySpider(scrapy.Spider):
    """Scrape branch addresses from the Central Bank of India locator."""

    name = "Central Bank of India"

    # Only the first page is seeded; parse() follows the rel="next" link,
    # so every page is visited without hard-coding a page range.
    start_urls = [
        "https://www.centralbankofindia.co.in/en/branch-locator?field_state_target_id=All&combine=&page=0"
    ]

    def parse(self, response):
        """Yield one item per branch address, then follow pagination."""
        print(f'url: {response.url}')

        for address in response.xpath(
            '//*[@id="block-cbi-content"]//td[2]//span[2]/text()'
        ).extract():
            print(address)
            yield {'address': address}

        # Queue the next listing page, if the server advertises one.
        next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
        if next_page:
            print(f'Next Page: {next_page}')
            yield response.follow(next_page)
# --- run without a project and save results in `output.csv` ---
from scrapy.crawler import CrawlerProcess

settings = {
    'USER_AGENT': 'Mozilla/5.0',
    # Feed export: CSV, JSON or XML ('FEEDS' is new in Scrapy 2.1).
    'FEEDS': {'output.csv': {'format': 'csv'}},
}

process = CrawlerProcess(settings)
process.crawl(MySpider)
process.start()