I have code that collects the text of all paragraphs from a div and appends it to a list as a new element for each car model year. I would like to extend it so that it builds a dictionary for each review, with values in the following form:
d = { 'reviewer_name': 'xyz', 'car_model' : '2017 Audi A4', 'review_content' : 'all paragraphs from the div which is already visible in the code' }
There should be one such dictionary per model year, so if I specify the years 2017 and 2018 I would like an entry for each of those years.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from fake_useragent import UserAgent
import random
# Collect the review text for each model year; one string per year.
articles = []
ua = UserAgent()
header = {'User-Agent': str(ua.safari)}

for i in range(2017, 2019):
    url = f'https://www.caranddriver.com/audi/a4-{i}'
    response = requests.get(url, headers=header)
    print(response)
    html_soup = BeautifulSoup(response.text, 'lxml')

    # All <p> elements inside the review body container.
    article = html_soup.find('div', attrs={'class': 'review-body-content'}).findAll('p')

    # Concatenate the paragraphs, each one preceded by a newline.
    article_text = ''
    for element in article:
        article_text = article_text + '\n' + ''.join(element.findAll(text=True))

    articles.append(article_text)
CodePudding user response:
Here you go, just add it to a dictionary then append the dictionary into your list.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from fake_useragent import UserAgent
import random
import re
# Build one dictionary per model year (reviewer, car model, review text),
# collect them in a list, and turn the list into a DataFrame.
articles = []
ua = UserAgent()
header = {'User-Agent': str(ua.safari)}

# Patterns are loop-invariant, so compile them once up front.
css_residue = re.compile('.css.*}')
header_class = re.compile("^review-header-inner")
author_href = re.compile("^/author")

for i in range(2017, 2020):
    url = f'https://www.caranddriver.com/audi/a4-{i}'
    response = requests.get(url, headers=header)
    print(response)
    html_soup = BeautifulSoup(response.text, 'lxml')

    # All <p> elements inside the review body container.
    article = html_soup.find('div', attrs={'class': 'review-body-content'}).findAll('p')

    # Concatenate the paragraphs, each one preceded by a newline.
    article_text = ''
    for element in article:
        article_text = article_text + '\n' + ''.join(element.findAll(text=True))

    # Drop text matching the CSS-like pattern, then trim the surrounding
    # whitespace (including the leading newline added above).
    article_text = css_residue.sub('', article_text)
    article_text = article_text.strip()

    car_model = html_soup.find('div', class_=header_class).find('h1').text

    # Fall back to 'NA' when the page has no author link, instead of
    # swallowing every possible exception with a bare ``except``.
    author_link = html_soup.find('a', {'href': author_href})
    reviewer_name = author_link.text if author_link is not None else 'NA'

    row = {
        'reviewer_name': reviewer_name,
        'car_model': car_model,
        'review_content': article_text,
    }
    articles.append(row)

df = pd.DataFrame(articles)
Output:
print(df)
reviewer_name ... review_content
0 NA ... The A4 embodies everything we love about Audi:...
1 NA ... The 2018 Audi A4 is perhaps the most well-roun...
2 Drew Dorian ... Audi's A4 has proven to be a wündercar that ou...
[3 rows x 3 columns]