I'm building a personal scraper for Indeed, primarily for practice. I've set it up so that I extract the details of up to 100 results from each page. For the search query, I loop over a seed list of cities and job types and insert them into an f-string of the Indeed URL. I store these in a dictionary keyed by degree type, so that I can get the degree type as a column when the results are read into pandas.
My issue is that I keep getting "Redirecting (301)". I suppose that's because not all the links fulfil the salary requirement. Alternatively, I have tried including meta={'handle_httpstatus_list': [301]}, but then I get no results at all.
Here's my scraper:
class IndeedItem(scrapy.Item):
    """One scraped job posting.

    Every field uses ``TakeFirst`` as its output processor, so each
    field is reduced to the first collected value when the item is
    loaded.
    """

    # Title of the job posting.
    job_title = Field(output_processor=TakeFirst())
    # Salary text as extracted from the job card.
    salary = Field(output_processor=TakeFirst())
    # Degree type the search URL was generated for.
    category = Field(output_processor=TakeFirst())
    # Company name as extracted from the job card.
    company = Field(output_processor=TakeFirst())
class IndeedSpider(scrapy.Spider):
    """Crawl uk.indeed.com search results for every (city, degree) pair.

    Search URLs are built once, at class-definition time, and grouped by
    degree type in ``start_urls`` so that the degree type can be attached
    to every scraped item as its ``category``.
    """

    name = 'indeed'
    max_results_per_city = 1000
    #names = pd.read_csv("indeed_names.csv")
    #degree = pd.read_csv("degree_names2.csv",encoding='unicode_escape')
    names = pd.DataFrame({'names': ['London', 'Manchester']})
    degree = pd.DataFrame({'degrees': ['degree Finance £25', 'degree Engineering £25'], 'degree_type': ['Finance', 'Engineering']})

    # Mapping of degree type -> list of search URLs (one per city).
    start_urls = defaultdict(list)
    # Loop variables in a class body become class attributes. None of them
    # may be called `name`, otherwise the spider name set above would be
    # overwritten by the last degree type.
    for city in names.names:
        for qualification, degree_type in zip(degree.degrees, degree.degree_type):
            start_urls[degree_type].append(f'https://uk.indeed.com/jobs?q={qualification},000&l={city}&fromage=7&filter=0&limit=100')

    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 2
    }

    def start_requests(self):
        """Yield one request per search URL, tagged with its degree type.

        Each request carries ``page_count=0`` and the ``category`` (degree
        type) to ``parse`` through ``cb_kwargs``.
        """
        for category, url in self.start_urls.items():
            for link in url:
                yield scrapy.Request(
                    link,
                    callback=self.parse,
                    #meta={'handle_httpstatus_list': [301]},
                    cb_kwargs={
                        'page_count': 0,
                        'category': category
                    }
                )

    def parse(self, response, page_count, category):
        """Extract job items from a results page and follow the next page.

        Args:
            response: The search results page.
            page_count: Number of pagination links followed so far to reach
                this page; nothing is scraped once it exceeds 30.
            category: Degree type stored on every item from this page.

        Yields:
            Loaded ``IndeedItem`` objects, then a request for the next
            results page when a next-page link is found.
        """
        if page_count > 30:
            return
        # NOTE(review): several XPath expressions below contain an empty
        # attribute predicate (`[@]`); the attribute tests appear to have
        # been lost and must be restored before these queries can match.
        indeed = response.xpath('//div[@id="mosaic-zone-jobcards"]//div')
        for jobs in indeed:
            loader = ItemLoader(IndeedItem(), selector=jobs)
            loader.add_value('category', category)
            loader.add_xpath('job_title', './/h2[@]/span//text()')
            loader.add_xpath('salary', './/div[@]/span//text()')
            loader.add_xpath('company', './/a/div[@]/div[@]/div[@]/div[@]/table[@]/tbody/tr/td[@]/div[@]/pre/span[@]//text()')
            # load_item must be called; yielding the bound method itself
            # produces no item.
            yield loader.load_item()
        next_page = response.xpath('//ul[@]/li[5]/a//@href').get()
        if next_page is not None:
            yield response.follow(
                next_page,
                callback=self.parse,
                cb_kwargs={
                    # Increment rather than reset, so the page limit at the
                    # top of this method can actually be reached.
                    'page_count': page_count + 1,
                    'category': category
                }
            )
CodePudding user response:
I didn't get any 301 status, but the start_urls gave me problems and your XPath was off.
This fixes the XPath:
import scrapy
from pandas._libs.internals import defaultdict
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
import pandas as pd
class IndeedItem(scrapy.Item):
    """Container for the fields scraped from a single job card.

    Each field is declared with a ``TakeFirst`` output processor, so the
    loaded item holds the first collected value per field.
    """

    # Job title text.
    job_title = Field(output_processor=TakeFirst())
    # Salary text.
    salary = Field(output_processor=TakeFirst())
    # Degree type associated with the search that produced the item.
    category = Field(output_processor=TakeFirst())
    # Company name text.
    company = Field(output_processor=TakeFirst())
class IndeedSpider(scrapy.Spider):
    """Crawl uk.indeed.com search results for every (city, degree) pair.

    Search URLs are generated in ``start_requests`` and grouped by degree
    type, so that the degree type can be attached to every scraped item as
    its ``category``.
    """

    name = 'indeed'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 2
    }
    max_results_per_city = 1000
    # names = pd.read_csv("indeed_names.csv")
    # degree = pd.read_csv("degree_names2.csv",encoding='unicode_escape')
    names = pd.DataFrame({'names': ['London', 'Manchester']})
    degree = pd.DataFrame({'degrees': ['degree Finance £25,000', 'degree Engineering £25,000'], 'degree_type': ['Finance', 'Engineering']})
    # Mapping of degree type -> list of search URLs; filled by start_requests.
    start_urls = defaultdict(list)

    def start_requests(self):
        """Build the search URLs and yield one request per URL.

        The URLs are collected in a fresh mapping on every call and then
        stored on the instance as ``start_urls``. Appending to the shared
        class-level mapping instead would duplicate every URL whenever this
        method runs more than once.

        Each request carries ``page_count=0`` and the ``category`` (degree
        type) to ``parse`` through ``cb_kwargs``.
        """
        urls_by_category = defaultdict(list)
        for city in self.names.names:
            for qualification, degree_type in zip(self.degree.degrees, self.degree.degree_type):
                urls_by_category[degree_type].append(f'https://uk.indeed.com/jobs?q={qualification}&l={city}&fromage=7&filter=0&limit=100')
        self.start_urls = urls_by_category

        for category, url in urls_by_category.items():
            for link in url:
                yield scrapy.Request(
                    link,
                    callback=self.parse,
                    #meta={'handle_httpstatus_list': [301]},
                    cb_kwargs={
                        'page_count': 0,
                        'category': category
                    }
                )

    def parse(self, response, page_count, category):
        """Extract job items from a results page and follow the next page.

        Args:
            response: The search results page.
            page_count: Number of pagination links followed so far to reach
                this page; nothing is scraped once it exceeds 30.
            category: Degree type stored on every item from this page.

        Yields:
            Loaded ``IndeedItem`` objects, then a request for the next
            results page when a next-page link is found.
        """
        if page_count > 30:
            return
        # NOTE(review): several XPath expressions below contain an empty
        # attribute predicate (`[@]`); the attribute tests appear to have
        # been lost and must be restored before these queries can match.
        indeed = response.xpath('//div[@]')
        for jobs in indeed:
            loader = ItemLoader(IndeedItem(), selector=jobs)
            loader.add_value('category', category)
            loader.add_xpath('job_title', './/span[@title]//text()')
            loader.add_xpath('salary', './/div[@]/span//text()')
            loader.add_xpath('company', './/span[@]//text()')
            yield loader.load_item()
        next_page = response.xpath('//ul[@]//li[last()]/a/@href').get()
        if next_page:
            yield response.follow(
                next_page,
                callback=self.parse,
                cb_kwargs={
                    # Increment rather than reset, so the page limit at the
                    # top of this method can actually be reached.
                    'page_count': page_count + 1,
                    'category': category
                }
            )
If you can give an example of a URL that redirects, I can try to help you.