I am trying to scrape data from this page:
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Follow every ``//h2/a`` link on the start page and scrape each target.

    ``parse`` handles the listing page; ``parse_book`` handles each page
    the listing links to.
    """

    name = 'pushpa'
    start_urls = ['http://www.cuma.fr/annuaires?page=1e']

    def parse(self, response):
        """Yield one ``Request`` per ``href`` found on an ``<a>`` under an ``<h2>``.

        Each href is resolved against the response URL with
        ``response.urljoin`` and handed to ``parse_book``.
        """
        for href in response.xpath("//h2/a/@href").extract():
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield a dict holding the text of every ``<div class='adr'>``.

        The XPath ends in ``/text()``, which selects only text nodes that
        are direct children of the div; text inside nested tags is not
        included.
        """
        adr_texts = response.xpath("//div[@class='adr']/text()").getall()
        yield {'coordoness': adr_texts}
CodePudding user response:
Read the comments.
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Crawl the links under ``<h2>`` headings and collect address text.

    ``parse`` schedules a request for each linked page; ``parse_book``
    extracts the cleaned text of the ``<div class='adr'>`` elements.
    """

    name = 'pushpa'
    start_urls = ['http://www.cuma.fr/annuaires?page=1e']

    def parse(self, response):
        """Schedule a ``parse_book`` request for every ``//h2/a`` link."""
        for link in response.xpath("//h2/a/@href").extract():
            # Resolve the href against the URL of the current response.
            target = response.urljoin(link)
            yield Request(target, callback=self.parse_book)

    def parse_book(self, response):
        """Yield a dict with the non-empty, stripped text fragments of the
        ``<div class='adr'>`` elements under the key ``'coordoness'``.
        """
        # Use '//text()' rather than '/text()': it selects every descendant
        # text node, so text inside tags nested in the div is included too.
        fragments = response.xpath("//div[@class='adr']//text()").getall()
        # Strip surrounding whitespace from each fragment and drop the ones
        # that end up empty.
        cleaned = [
            text
            for text in (fragment.strip() for fragment in fragments)
            if text
        ]
        yield {'coordoness': cleaned}