Python Web Scraping Not Working -- Multiple Pages Project


I'm fairly new to web scraping and have been working on a project to scrape data from a job bank site. I'd like to know why my code isn't working: it runs fine against a single page, but I'm not sure what I'm doing wrong with multiple pages.

Import libraries

from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from time import sleep
from random import randint
import datetime

Connect to Website and pull data

#for page in range(37001458,37001470):
pages = np.arange(37001458, 37001470, 1)
data = []

for page in pages:
    
    URL = 'https://www.jobbank.gc.ca/jobsearch/jobposting/' + str(page)
    sleep(randint(1,5))
    
# To find Your User-Agent: https://httpbin.org/get
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"}


#For single-page scraping, delete everything before this line except headers and re-address the response
page = requests.get(URL, headers=headers)

soup1 = BeautifulSoup(page.content, "html.parser")

soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

#page = page + 1

try:
    job_title = soup2.find(property='title').get_text()
except:
    job_title = ''

try:
    date_posted = soup2.find(property='datePosted').get_text()
except:
    date_posted = ''

try:
    company = soup2.find(property='hiringOrganization').get_text()
except:
    company = ''

try:
    address = soup2.find(property='streetAddress').get_text()
except:
    address = ''

try:
    city = soup2.find(property='addressLocality').get_text()
except:
    city = ''

try:
    province = soup2.find(property='addressRegion').get_text()
except:
    province = ''

try:
    wage = soup2.find(property='minValue').get_text()
except:
    wage = ''

try:
    wage_reference = soup2.find(property='unitText').get_text()
except:
    wage_reference = ''

try:
    work_hours = soup2.find(property='workHours').get_text()
except:
    work_hours = ''

try:
    employment_type = soup2.find(property='employmentType').get_text()
except:
    employment_type = ''

try:
    language = soup2.find(property='qualification').get_text()
except:
    language = ''

try:
    required_education = soup2.find(property='educationRequirements qualification').get_text()
except:
    required_education = ''

try:
    required_experience = soup2.find(property='experienceRequirements qualification').get_text()
except:
    required_experience = ''

try:
    skills = soup2.find(property='experienceRequirements').get_text()
except:
    skills = ''

try:
    employment_groups = soup2.find(id='employmentGroup').get_text()
except:
    employment_groups = ''
    

Data Cleaning

job_title = job_title.strip()

date_posted = date_posted.strip()[10:]

company = company.strip()

address = address.strip()

city = city.strip()

province = province.strip()

wage = wage.strip()

wage_reference = wage_reference.strip()

work_hours = work_hours.strip()

employment_type = employment_type.strip()

language = language.strip()

required_education = required_education.strip()

required_experience = required_experience.strip()

skills = skills.strip()

employment_groups = employment_groups.strip()[238:] 

print(job_title)
print(date_posted)
print(company)
print(address)
print(city)
print(province)
print(wage)
print(wage_reference)
print(work_hours)
print(employment_type)
print(language)
print(required_education)
print(required_experience)
print(skills)
print(employment_groups)

Timestamp for output to track when data was collected

import datetime

today = datetime.date.today()

print(today)

Data Entry into CSV file (created beforehand)

import csv

header = ['Job Title', 'Date Posted', 'Company', 'Address', 'City', 'Province', 'Wage', 'Wage Reference', 'Work Hours', 'Employment Type', 'Language', 'Required Education', 'Required Experience', 'Skills', 'Employment Groups']
values = [job_title, date_posted, company, address, city, province, wage, wage_reference, work_hours, employment_type, language, required_education, required_experience, skills, employment_groups]


with open('CanadaJobBankWebScraperDataset.csv', 'a+', newline='', encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerow(values)

Pandas

df = pd.read_csv(r'C:\Users\AM\CanadaJobBankWebScraperDataset.csv')

print(df)

CodePudding user response:

pages = np.arange(37001458, 37001470, 1)

for page in pages:

    URL = 'https://www.jobbank.gc.ca/jobsearch/jobposting/' + str(page)
    sleep(randint(1,5))

page = requests.get(URL, headers=headers)

This loop builds a URL string, then immediately rebuilds it with a different number on the end, over and over. Only the final URL value survives when the loop ends, so only the last page is ever requested.

The requests.get() call (and all the related processing) needs to be inside the for loop.
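
For illustration, here is a minimal sketch of that restructuring. It keeps the question's field-extraction pattern but shows only a few fields for brevity, and the helper function get_text_by_property is a hypothetical addition introduced here just to cut down the repeated try/except blocks:

import csv
from random import randint
from time import sleep

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"}

def get_text_by_property(soup, prop):
    # Hypothetical helper (not in the original code): return stripped
    # text for a tag matching the given property attribute, or '' if absent.
    tag = soup.find(property=prop)
    return tag.get_text().strip() if tag else ''

rows = []
for page in range(37001458, 37001470):
    url = 'https://www.jobbank.gc.ca/jobsearch/jobposting/' + str(page)
    sleep(randint(1, 5))  # pause between requests to be polite to the server

    # The request and all parsing happen inside the loop, so every
    # posting is fetched before url is rebuilt for the next page.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    rows.append([
        get_text_by_property(soup, 'title'),
        get_text_by_property(soup, 'hiringOrganization'),
        get_text_by_property(soup, 'addressLocality'),
    ])

# Write everything in one pass once the loop has finished.
with open('CanadaJobBankWebScraperDataset.csv', 'a', newline='', encoding='utf8') as f:
    csv.writer(f).writerows(rows)

Collecting each posting as a row in a list and writing the list once at the end also avoids reopening the CSV file for every page.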
