I have a PYTHON code to scrape data from a website and write it into a CSV file. But after running my code, only the last row(joblink) is displayed in my excel while other rows are empty with the headers only.
Please how do I fix? Below is my codeblock.
import requests
import csv
from csv import writer
from bs4 import BeautifulSoup
for x in range(1, 210):
html_text = requests.get(f'https://www.timesjobs.com/candidate/job-search.html?from=submit&actualTxtKeywords=Python&searchBy=0&rdoOperator=OR&searchType=personalizedSearch&luceneResultSize=25&postWeek=60&txtKeywords=Python&pDate=I&sequence={x}&startPage=1').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find_all('li', class_ = 'clearfix job-bx wht-shd-bx')
with open('jobberman.csv', 'w ', newline = '', encoding = 'utf-8') as f:
header = ['Company Name', 'Keyskill', 'Joblink']
writer = csv.writer(f, delimiter = '')
writer.writerow(header)
for job in jobs:
company_name = job.find('h3', class_ = 'joblist-comp-name').text.replace(' ','')
keyskill = job.find('span', class_ = 'srp-skills').text.replace(' ','')
joblink = job.header.h2.a['href']
print(f"Company Name: {company_name.strip()}")
print(f"Required Skills: {keyskill.strip()}")
print(f"Joblink: {joblink}")
print('')
joblist = [company_name, keyskill, joblink]
writer.writerow(joblist)
CodePudding user response:
Main issue is, that you overwrite your contents in each iteration, so perform your outer for-loop
while file is open.
...
with open('jobberman.csv', 'w ', newline = '', encoding = 'utf-8') as f:
header = ['Company Name', 'Keyskill', 'Joblink']
writer = csv.writer(f)
writer.writerow(header)
for x in range(1, 120):
html_text = requests.get(f'https://www.timesjobs.com/candidate/job-search.html?from=submit&actualTxtKeywords=Python&searchBy=0&rdoOperator=OR&searchType=personalizedSearch&luceneResultSize=25&postWeek=60&txtKeywords=Python&pDate=I&sequence={x}&startPage=1').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find_all('li', class_ = 'clearfix job-bx wht-shd-bx')
for job in jobs:
company_name = job.find('h3', class_ = 'joblist-comp-name').get_text(strip=True)
keyskill = job.find('span', class_ = 'srp-skills').get_text(strip=True)
joblink = job.header.h2.a['href']
joblist = [company_name, keyskill, joblink]
writer.writerow(joblist)
Example
import csv
from csv import writer
from bs4 import BeautifulSoup
with open('jobberman.csv', 'w ', newline = '', encoding = 'utf-8') as f:
header = ['Company Name', 'Keyskill', 'Joblink']
writer = csv.writer(f)
writer.writerow(header)
for x in range(1, 120):
#### requesting and scraping info
joblist = ['Company Name' str(x), 'Keyskill' str(x), 'Joblink' str(x)]
writer.writerow(joblist)
Output
Company Name,Keyskill,Joblink
Company Name1,Keyskill1,Joblink1
Company Name2,Keyskill2,Joblink2
Company Name3,Keyskill3,Joblink3
Company Name4,Keyskill4,Joblink4
Company Name5,Keyskill5,Joblink5
Company Name6,Keyskill6,Joblink6
Company Name7,Keyskill7,Joblink7
CodePudding user response:
Same here. Can't access the site. But give this a try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
df = pd.DataFrame([], columns = ['Company Name', 'Keyskill', 'Joblink'])
df.to_csv('jobberman.csv', index=False)
for x in range(1, 210):
html_text = requests.get(f'https://www.timesjobs.com/candidate/job-search.html?from=submit&actualTxtKeywords=Python&searchBy=0&rdoOperator=OR&searchType=personalizedSearch&luceneResultSize=25&postWeek=60&txtKeywords=Python&pDate=I&sequence={x}&startPage=1').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find_all('li', class_ = 'clearfix job-bx wht-shd-bx')
rows = []
for job in jobs:
company_name = job.find('h3', class_ = 'joblist-comp-name').text.replace(' ','')
keyskill = job.find('span', class_ = 'srp-skills').text.replace(' ','')
joblink = job.header.h2.a['href']
row = {
'Company Name':company_name,
'Keyskill': keyskill,
'Joblink': joblink}
rows.append(row)
print(f"Company Name: {company_name.strip()}")
print(f"Required Skills: {keyskill.strip()}")
print(f"Joblink: {joblink}")
print('')
df = pd.DataFrame(rows)
df.to_csv('jobberman.csv', mode='a', header=False, index=False)