I have the following code that I'm close to getting working (I think). I can get a list of Selectors for every anchor element whose id contains the string `class_id`.
What I'm trying to do is get the text-node child of each of these anchor elements. Could anyone tell me how to do that? Thanks.
import scrapy;
#with open('../econ.html', 'r') as f:
#html_string = f.read()
# Request headers sent with the class-search POST in EconSpider.start_requests.
econ_headers = {
    # The XHTML media type is "application/xhtml+xml"; the "+" had been lost,
    # leaving an invalid "application/xhtml xml" entry.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # Listed once: a repeated key in a dict literal is silently overwritten.
    'Content-Type': 'application/x-www-form-urlencoded',
    'Origin': 'https://pisa.ucsc.edu',
    'Accept-Language': 'en-us',
    'Host': 'pisa.ucsc.edu',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Referer': 'https://pisa.ucsc.edu/class_search/',
    # A list value is how Scrapy expresses a multi-valued header.
    'Accept-Encoding': ['gzip', 'deflate', 'br'],
    'Connection': 'keep-alive',
}
class EconSpider(scrapy.Spider):
    """Spider that POSTs a class-search query and inspects the result page.

    The request body carries the search form fields (term 2210, subject
    ECON, reg_status O, up to 1000 records starting at 0).
    """

    name = "econ"

    def start_requests(self):
        """Yield one POST request per search URL, handled by ``parse``."""
        urls = [
            'https://pisa.ucsc.edu/class_search/index.php'
        ]
        for url in urls:
            yield scrapy.Request(
                url=url,
                method="POST",
                headers=econ_headers,
                body='action=results&binds[:term]=2210&binds[:subject]=ECON&binds[:reg_status]=O&rec_start=0&rec_dur=1000',
                callback=self.parse,
            )

    def parse(self, response):
        """Print the link texts of the class anchors and save the raw page.

        The response body is written to ``class-<page>.html``, where
        ``<page>`` is the second-to-last path segment of the response URL.
        """
        page = response.url.split("/")[-2]
        # `::text` is a CSS-selector extension and is not valid XPath; the
        # XPath equivalent is the text() node test. .getall() turns the
        # matched text nodes into a plain list of strings.
        print(response.xpath('//a[contains(@id, "class_id")]/text()').getall())
        filename = f'class-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
CodePudding user response:
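NOTE: the `::text` pseudo-element only works with CSS selectors. In an XPath expression, use `text()` instead, and call `.getall()` to extract the strings: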
# text() selects the text-node children of each matched <a> element;
# .getall() returns them as a list of strings.
print(response.xpath('//a[contains(@id, "class_id")]/text()').getall())