import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest
# Scrape Indeed search results for "web development" and export them to CSV.
#
# The columns below are parallel lists: index k in every list describes the
# same job posting.
job_title = []
company_name = []
location_name = []
job_skill = []
links = []
salary = []
requirements = []
date = []

# Step 1: walk the search-result pages and collect the per-listing summaries.
# `start` is the result offset; it advances by 10 per page.
page_num = 0
while page_num < 20:
    result = requests.get(f"https://www.indeed.com/jobs?q=web development&start={page_num}")
    source = result.content
    soup = BeautifulSoup(source, "lxml")

    # attrs must be a dict mapping attribute name -> value.
    job_titles = soup.find_all("a", {"class": "jcs-JobTitle"})
    company_names = soup.find_all("span", {"class": "companyName"})
    location_names = soup.find_all("div", {"class": "companyLocation"})
    job_skills = soup.find_all("div", {"class": "job-snippet"})
    dates = soup.find_all("span", {"class": "date"})

    # zip() stops at the shortest result list, so a page on which one selector
    # matched fewer elements than the others cannot raise IndexError.
    for title, company, location, skill, posted in zip(
        job_titles, company_names, location_names, job_skills, dates
    ):
        job_title.append(title.text.strip())
        # The href is site-relative, so prefix the host to get a full URL.
        links.append("https://www.indeed.com" + title.attrs["href"])
        company_name.append(company.text.strip())
        location_name.append(location.text.strip())
        job_skill.append(skill.text.strip())
        date.append(posted.text.strip())

    page_num += 10
    print("Page switched...")

# Step 2: open every posting to read its salary and requirement bullets.
for link in links:
    result = requests.get(link)
    source = result.content
    soup = BeautifulSoup(source, "lxml")

    salaries = soup.find("span", {"class": "icl-u-xs-mr--xs attribute_snippet"})
    # Strip the scraped text, not the "None" placeholder.
    salary.append(salaries.text.strip() if salaries else "None")

    description = soup.find("div", {"id": "jobDescriptionText", "class": "jobsearch-jobDescriptionText"})
    # find() returns None when the description block is absent; guard before
    # reaching for its first <ul>.
    requirement = description.ul if description is not None else None
    if requirement is not None:
        # join() puts the separator only between items, so nothing has to be
        # trimmed afterwards (trimming used to turn "None" into "No").
        requirements_text = "| ".join(li.text for li in requirement.find_all("li"))
    else:
        requirements_text = "None"
    requirements.append(requirements_text)

# Step 3: write one CSV row per posting. zip_longest() pads shorter columns
# with None, which csv.writer emits as an empty field.
my_file = [job_title, company_name, location_name, job_skill, salary, links, date, requirements]
exported = zip_longest(*my_file)
# NOTE: no encoding= is passed here, so open() uses the platform's default
# codec. When that default is a legacy code page ('charmap'), characters such
# as '\ufb02' in the requirements text cannot be encoded and writerows()
# raises UnicodeEncodeError.
# newline="" lets the csv module control line endings itself.
with open("/Users/Rich/Desktop/testing/indeed.csv", "w", newline="") as myfile:
    writer = csv.writer(myfile)
    writer.writerow(["Job titles", "Company names", "Location names", "Job skills", "Salaries", "Links", "Dates", "Requirements"])
    writer.writerows(exported)
I was scraping multiple pages of a website and everything was working well until I added the requirements list to the CSV file; if I remove the requirements, it works fine. I get this error: UnicodeEncodeError: 'charmap' codec can't encode character '\ufb02' in position 582: character maps to <undefined>. Please help me.
CodePudding user response:
I fixed the problem by passing encoding="utf-8"
to the open() call that creates the CSV file.
The code:
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest
# Scrape Indeed search results for "web development" and export them to a
# UTF-8 encoded CSV file.
#
# The columns below are parallel lists: index k in every list describes the
# same job posting.
job_title = []
company_name = []
location_name = []
job_skill = []
links = []
salary = []
requirements = []
date = []

# Step 1: walk the search-result pages and collect the per-listing summaries.
# `start` is the result offset (advances by 10 per page); `num` is a 1-based
# page counter used only for the progress message.
page_num = 0
num = 1
while page_num < 5000:
    result = requests.get(f"https://www.indeed.com/jobs?q=web development&start={page_num}")
    source = result.content
    soup = BeautifulSoup(source, "lxml")

    # attrs must be a dict mapping attribute name -> value.
    job_titles = soup.find_all("a", {"class": "jcs-JobTitle"})
    company_names = soup.find_all("span", {"class": "companyName"})
    location_names = soup.find_all("div", {"class": "companyLocation"})
    job_skills = soup.find_all("div", {"class": "job-snippet"})
    dates = soup.find_all("span", {"class": "date"})

    # zip() stops at the shortest result list, so a page on which one selector
    # matched fewer elements than the others cannot raise IndexError.
    for title, company, location, skill, posted in zip(
        job_titles, company_names, location_names, job_skills, dates
    ):
        job_title.append(title.text.strip())
        # The href is site-relative, so prefix the host to get a full URL.
        links.append("https://www.indeed.com" + title.attrs["href"])
        company_name.append(company.text.strip())
        location_name.append(location.text.strip())
        job_skill.append(skill.text.strip())
        date.append(posted.text.strip())

    page_num += 10
    print(f"{num}.Page switched...")
    num += 1

# Step 2: open every posting to read its salary and requirement bullets.
for link in links:
    result = requests.get(link)
    source = result.content
    soup = BeautifulSoup(source, "lxml")

    salaries = soup.find("span", {"class": "icl-u-xs-mr--xs attribute_snippet"})
    # Strip the scraped text, not the "None" placeholder.
    salary.append(salaries.text.strip() if salaries else "None")

    description = soup.find("div", {"id": "jobDescriptionText", "class": "jobsearch-jobDescriptionText"})
    # find() returns None when the description block is absent; guard before
    # reaching for its first <ul>.
    requirement = description.ul if description is not None else None
    if requirement is not None:
        # join() puts the separator only between items, so nothing has to be
        # trimmed afterwards (trimming used to turn "None" into "No").
        requirements_text = "| ".join(li.text.strip() for li in requirement.find_all("li"))
    else:
        requirements_text = "None"
    requirements.append(requirements_text)

# Step 3: write one CSV row per posting. zip_longest() pads shorter columns
# with None, which csv.writer emits as an empty field.
my_file = [job_title, company_name, location_name, job_skill, salary, links, date, requirements]
exported = zip_longest(*my_file)
# encoding="utf-8" tells open() which codec turns the text into bytes on disk.
# UTF-8 can represent every Unicode character (including ligatures such as
# '\ufb02'), whereas the platform default may be a legacy code page that
# cannot, which is what raises UnicodeEncodeError.
# newline="" lets the csv module control line endings itself.
with open("/Users/Rich/Desktop/testing/indeed.csv", "w", encoding="utf-8", newline="") as myfile:
    writer = csv.writer(myfile)
    writer.writerow(["Job titles", "Company names", "Location names", "Job skills", "Salaries", "Links", "Dates", "Requirements"])
    writer.writerows(exported)
But I don't know what encoding="utf-8"
is for. Any idea?