On one of the sites I am writing a parser for, I ran into the following problem: I need to take all of the data from a table, but the values are not labelled in the HTML code and are sometimes swapped around (html example).
The table looks like this: (table)
At first I used XPath for this, but while parsing I found that some of the data was swapped, for example the engine and the registration number, or was missing altogether. So XPath is not suitable, because data such as the mileage can end up in the engine field in the CSV file.
Is it possible, in Selenium or through bs4, to first search for a word and then parse the data after it?
That is, to find the word Engine in the HTML code and then take the data below it that I need (html text).
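Something like the rough, untested bs4 sketch below is what I have in mind (the td tags are just my guess at how the table is marked up):
# soup is the page already parsed with BeautifulSoup, as in my script below
label = soup.find('td', string='Engine')  # locate the cell that holds the label text
car_engine = label.find_next('td').get_text(strip=True) if label else None  # value from the next cell, or None if the label is absent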
My code:
import csv
import time
import schedule
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium_stealth import stealth
def collect_data():
    global driver
    options = webdriver.ChromeOptions()
    # ChromeOptions has no set_preference(); pass the user agent as a command-line argument instead
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 '
                         'Safari/537.36')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # Background mode
    # options.add_argument('headless')
    try:
        driver = webdriver.Chrome(options=options)
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
                )
        driver.get(
            url='https://www.nettiauto.com/en/ford/mustang?yfrom=1980'
        )
        time.sleep(10)

        '''Collect all URLs'''
        soup = BeautifulSoup(driver.page_source, 'lxml')
        car_url_list = []
        total_page = soup.find('span', class_='totPage').text
        print('Ford Mustang')
        print(f'Total pages: {total_page}')
        print(f'Page 1 of {total_page} URL collected')
        r = int(total_page) + 1
        count = 1
        for i in range(1, r, 1):
            driver.get(
                url=f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={i}'
            )
            driver.implicitly_wait(10)
            soup = BeautifulSoup(driver.page_source, 'lxml')
            car_cards = soup.find_all('a', class_='tricky_link')
            count += 1
            print(f'Page {count} of {total_page} URL collected')
            for car_url in car_cards:
                car_url = car_url.get('href')
                car_url_list.append(car_url)
        with open('ford_mustang_url.txt', 'w', encoding='utf8') as file:
            for line in car_url_list:
                file.write(f'{line}\n')

        count = 0
        row = []

        '''Collect car's data'''
        with open('ford_mustang_url.txt', encoding='utf8') as f:
            r = len(car_url_list)
            print('Total cars: ' + str(r))
            for i in range(r):
                driver.get(f.readline())
                driver.implicitly_wait(30)
                soup = BeautifulSoup(driver.page_source, 'lxml')
                count += 1

                '''Car Data'''
                car_name = soup.find('title').text.replace('Nettiauto', '').replace('-', '').replace('Used vehicle', '').replace('Vaihtoauto', '').replace('  ', ' ').strip()
                car_price = soup.find('span', class_='GAPrice').find('span').text
                car_year = soup.find('div', class_='mid_border').get('data-year')
                car_mileage = soup.find('div', class_='mid_border').get('data-mileage')
                car_reg_number = soup.find('div', class_='rekkari-banner__body_input').text.strip()
                car_url = soup.find('link', hreflang='en').get('href')
                # car_engine

                '''If section'''
                if car_reg_number == 'ABC-123':
                    car_reg_number = None
                if car_mileage == '100000000':
                    car_mileage = None

                print(f'{count}. ' + car_name)
                print('Price: ' + f'{car_price}')
                print('Year: ' + f'{car_year}')
                print('Mileage: ' + f'{car_mileage}')
                print('Reg.Number: ' + f'{car_reg_number}')
                print('URL: ' + f'{car_url}\n')

                data = {
                    'Name': car_name,
                    'Price': car_price,
                    'Year': car_year,
                    'Mileage': car_mileage,
                    'Reg.Number': car_reg_number,
                    'URL': car_url,
                }
                row.append(data)

        csv_title = ['Name', 'Price', 'Year', 'Mileage', 'Reg.Number', 'URL']
        with open('ford_mustang.csv', 'w', encoding='utf8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=csv_title)
            writer.writeheader()
            writer.writerows(row)
    except Exception as ex:
        print(ex)
    finally:
        driver.close()
        driver.quit()


def main():
    collect_data()


if __name__ == '__main__':
    main()
CodePudding user response:
Here is a solution to your problem, not based on Selenium (it's not the right tool for this job), which will produce a dataframe/CSV with all the details you're after:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

scraper = cloudscraper.create_scraper()
big_df = pd.DataFrame()
urls_list = []

# Collect the detail-page URLs from every listing page
for x in tqdm(range(1, 8)):
    r = scraper.get(f'https://www.nettiauto.com/en/ford/mustang?yfrom=1980&page={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    car_links = [x.get('href') for x in soup.select_one('div#listingData').select('a.tricky_link')]
    for link in car_links:
        urls_list.append(link)

# Visit each car page and rebuild its spec table as label/value pairs
for url in tqdm(urls_list):
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    dfs = pd.read_html(str(r.text))
    df_list = []
    title = soup.select_one('#heightForSlogan').select_one('h1').get_text(strip=True)
    subtitle = soup.select_one('#heightForSlogan').select_one('h2').get_text(strip=True)
    df_list.append(('make_model', title))
    df_list.append(('variant', subtitle))
    for i, row in dfs[0].iterrows():
        df_list.append((row[0], row[1]))
        df_list.append((row[3], row[4]))
    correct_df = pd.DataFrame(df_list).T
    new_header = correct_df.iloc[0]
    correct_df = correct_df[1:]
    correct_df.columns = new_header
    big_df = big_df.append(correct_df)

big_df.to_csv('finnish_cars.csv')
A couple of notes: the first two cars' descriptions are in Finnish and the rest are in English, so the resulting dataframe/CSV will look a bit odd, but the data will be there. Also, you might get some warnings in the terminal telling you that pd append is deprecated and to use concat instead; those are just warnings, and the program will still run.
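If you want to get rid of that warning, a minimal sketch of the concat-based equivalent (the per_car_frames name is just for illustration; the frames would be collected inside the per-URL loop instead of appending to big_df) could look like this:
per_car_frames = []                                     # filled inside the loop with per_car_frames.append(correct_df)
big_df = pd.concat(per_car_frames, ignore_index=True)   # one concat at the end instead of repeated .append()
big_df.to_csv('finnish_cars.csv')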
You can install cloudscraper with pip install cloudscraper, and tqdm with pip install tqdm. Of course, if you're keen on using Selenium, you can apply the same methods on the HTML obtained from Selenium.
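For the label-based lookup asked about in the question, a minimal sketch using Selenium and XPath's following-sibling axis (the td tags are an assumption about the spec table's markup, so adjust them to whatever the page actually uses) could look like this:
from selenium.webdriver.common.by import By

def value_after_label(driver, label):
    # Find the cell whose text matches the label, then read the cell immediately after it
    cells = driver.find_elements(
        By.XPATH, f"//td[normalize-space(text())='{label}']/following-sibling::td[1]"
    )
    return cells[0].text.strip() if cells else None

car_engine = value_after_label(driver, 'Engine')  # None if the page has no Engine row
Because the value is looked up by its label rather than by its position, swapped or missing rows can no longer shift data into the wrong CSV column.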