How can I get a consistent length for all the attributes, and also the correct information when compared to the detail page? I'm able to create a DataFrame, but only after forcing the lengths to be consistent, which makes the details inconsistent.
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import pandas as pd
# Search URL of the first results page. Spaces in a query string are
# encoded as "+"; a literal space is not valid inside a request URL.
url = "https://www.amazon.in/s?k=smart+watch&page=1"

# One accumulator list per scraped attribute.
title = []
stars = []
rating = []
list_price = []
original_price = []

# Next-page URLs collected while paginating.
url_list = []
def getdata(url):
    """Fetch one search-results page and append its fields to the module lists.

    Each attribute is gathered with its own page-wide query, independently of
    the other attributes, and every value is stored as the list produced by
    ``str.split('>')`` on the element text.

    Returns the parsed page so the caller can look up the next-page link.
    """
    page_html = urlopen(url).read()
    a_soup = soup(page_html, 'html.parser')

    title_tags = a_soup.findAll('span', {'class': 'a-size-medium a-color-base a-text-normal'})
    title.extend([tag.text.split(">") for tag in title_tags])

    # The last four 'a-icon-alt' spans found on the page are left out.
    star_tags = a_soup.findAll('span', {'class': 'a-icon-alt'})[:-4]
    stars.extend([tag.text.split('>') for tag in star_tags])

    rating_rows = a_soup.findAll('div', {'class': 'a-row a-size-small'})
    rating.extend([row.text.split('>') for row in rating_rows])

    price_tags = a_soup.findAll('span', {'class': 'a-price-whole'})
    list_price.extend([tag.text.split('>') for tag in price_tags])

    # The original price text sits in an 'a-offscreen' span nested inside
    # each 'a-price a-text-price' span.
    struck_tags = a_soup.findAll('span', {'class': 'a-price a-text-price'})
    original_price.extend([
        tag.find('span', {'class': 'a-offscreen'}).text.split('>')
        for tag in struck_tags
    ])

    return a_soup
def getnextpage(a_soup):
    """Return the absolute URL of the next results page, or ``None``.

    Args:
        a_soup: Parsed results page; only its ``find`` method is used.

    Returns:
        ``'http://www.amazon.in'`` followed by the ``href`` of the "next"
        pagination link, or ``None`` when the page has no such link (or the
        link carries no ``href``), so the caller can stop paginating.
    """
    link = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
    # No matching anchor means there is nothing further to follow.
    if link is None:
        return None
    href = link.get('href')
    if not href:
        return None
    return 'http://www.amazon.in' + str(href)
# Walk the result pages: scrape the current page, then follow its
# next-page link until none is returned.
while True:
    geturl = getdata(url)
    url = getnextpage(geturl)
    # Check for the end first so an empty URL is never recorded.
    if not url:
        break
    url_list.append(url)
    print(url)
****OUTPUT****
http://www.amazon.in/smart-watch/s?k=smart+watch&page=2
http://www.amazon.in/smart-watch/s?k=smart+watch&page=3
http://www.amazon.in/smart-watch/s?k=smart+watch&page=4
http://www.amazon.in/smart-watch/s?k=smart+watch&page=5
http://www.amazon.in/smart-watch/s?k=smart+watch&page=6
http://www.amazon.in/smart-watch/s?k=smart+watch&page=7
http://www.amazon.in/smart-watch/s?k=smart+watch&page=8
http://www.amazon.in/smart-watch/s?k=smart+watch&page=9
http://www.amazon.in/smart-watch/s?k=smart+watch&page=10
http://www.amazon.in/smart-watch/s?k=smart+watch&page=11
http://www.amazon.in/smart-watch/s?k=smart+watch&page=12
http://www.amazon.in/smart-watch/s?k=smart+watch&page=13
http://www.amazon.in/smart-watch/s?k=smart+watch&page=14
http://www.amazon.in/smart-watch/s?k=smart+watch&page=15
http://www.amazon.in/smart-watch/s?k=smart+watch&page=16
http://www.amazon.in/smart-watch/s?k=smart+watch&page=17
http://www.amazon.in/smart-watch/s?k=smart+watch&page=18
http://www.amazon.in/smart-watch/s?k=smart+watch&page=19
http://www.amazon.in/smart-watch/s?k=smart+watch&page=20
**The length is not the same for all the attributes:
len(title) = 306, len(stars) = 286, len(rating) = 286, len(list_price) = 306, len(original_price) = 306**
**Only when I make the lengths consistent am I able to create the DataFrame, but the problem is that the information is then inconsistent.**
# Drop the last 20 entries from the three longer lists so that every column
# has the same length, then assemble the table column by column.
title, list_price, original_price = (
    column[:-20] for column in (title, list_price, original_price)
)
df = pd.DataFrame({
    'Title': title,
    'Stars': stars,
    'Rating': rating,
    'List_Price': list_price,
    'Original_Price': original_price,
})
CodePudding user response:
Try to avoid this bunch of separate lists; use a more structured approach and process the data in a leaner way:
data = []


def _first_text(node, name, attrs=None):
    """Return the text of the first ``name`` tag under ``node``, or ``None``."""
    found = node.find(name, attrs or {})
    return None if found is None else found.text


def getdata(url):
    """Scrape one search-results page, appending one dict per result to ``data``.

    Every result container is processed on its own, so the fields in a row
    all come from the same result element. A field that is missing from a
    result is stored as ``None``.

    Args:
        url: Address of the search-results page to fetch.

    Returns:
        The parsed page built from the response body.
    """
    # Imported here so the function does not depend on a module-level
    # ``import urllib.request``.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')

    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        stars_text = _first_text(e, 'span', {'class': 'a-icon-alt'})
        # The original price is nested inside this span, which may be absent.
        price_box = e.find('span', {'class': 'a-price a-text-price'})
        data.append({
            'title': _first_text(e, 'h2'),
            # Keep only the leading token of the star text.
            'stars': None if stars_text is None else stars_text.split(' ')[0],
            'rating': _first_text(e, 'span', {'class': 'a-size-base s-underline-text'}),
            'list_price': _first_text(e, 'span', {'class': 'a-price-whole'}),
            'original_price': (
                None if price_box is None
                else _first_text(price_box, 'span', {'class': 'a-offscreen'})
            ),
        })
    return a_soup
Simply create your DataFrame
from your list of dicts:
pd.DataFrame(data)
Output
title | stars | rating | list_price | original_price |
---|---|---|---|---|
Fire-Boltt Thunder Bluetooth Calling Full Touch 1.32inch Amoled LCD Smartwatch with SpO2, Heart Rate & Sleep Monitoring, 30 Sports Modes (Gold Black) | | | 4,999 | ₹12,999 |
Fire-Boltt Beast SpO2 1.69” Industry’s Largest Display Size Full Touch Smart Watch with Blood Oxygen Monitoring, Heart Rate Monitor, Multiple Watch Faces & Long Battery Life (Black) | 3.9 | 9,990 | 2,499 | ₹7,999 |
Noise ColorFit Pulse Smartwatch with 1.4" Full Touch HD Display, SpO2, Heart Rate, Sleep Monitors & 10-Day Battery - Deep Wine | 4 | 32,619 | 2,499 | ₹4,999 |
Noise ColorFit Pulse Spo2 Smart Watch with 10 days battery life, 60 Watch Faces, 1.4" Full Touch HD Display Smartwatch, 24*7 Heart Rate Monitor Smart Band, Sleep Monitoring Smart Watches for Men and Women & IP68 Waterproof (Jet Black) | 4 | 32,619 | 2,499 | ₹4,999 |
Noise ColorFit Ultra Bezel-Less Smart Watch with 1.75" HD TruView Display, 60 Sports Modes, SpO2, Heart Rate, Stress, REM & Sleep Monitor, Calls & SMS Quick Reply, Stock Market Info (Gunmetal Grey) | 4.1 | 22,634 | 2,999 | ₹5,999 |
Noise ColorFit Ultra Smart Watch with 1.75" HD Display, Aluminium Alloy Body, 60 Sports Modes, Spo2, Lightweight, Stock Market Info, Calls & SMS Reply (Lush Olive) | 4.1 | 22,634 | 3,499 | ₹6,400 |
boAt Flash Edition Smartwatch with Activity Tracker,Multiple Sports Modes,Full Touch 1.3" Screen,Sleep Monitor,Gesture, Camera & Music Control,IP68 Dust,Sweat & Splash Resistance(Lightning Black) | 4.1 | 13,714 | 2,499 | ₹6,990 |
CodePudding user response:
Change your strategy to keep the information consistent. Don't extract all titles, all stars, all ratings, ... from a page at once. I think you should extract the data for each item instead:
data = []


def get_data(url):
    """Skeleton: collect one row per result item into ``data``."""
    # Placeholder from the original sketch; presumably the page is fetched
    # and parsed into ``a_soup`` here.
    ...
    for item in a_soup.find_all('div', {'class': 's-result-item'}):
        # Skip result items that also carry the 's-widget' class.
        if 's-widget' in item['class']:
            continue
        # extract information for each item
        # (placeholders: replace each ``...`` with the lookup inside ``item``)
        title = ...
        stars = ...
        rating = ...
        price = ...
        original = ...
        data.append({'Title': title, 'Stars': stars, 'Rating': rating,
                     'List_Price': price, 'Original_Price': original})


df = pd.DataFrame(data)