How can I get consistent length for all the attributes and also the correct information when compared to the detail page? - CodePudding

How can I get a consistent length for all the attributes and also the correct information when compared to the detail page? I'm able to create a DataFrame, but only after forcing the lists to the same length, which makes the details inconsistent.

    from urllib.request import urlopen
    from bs4 import BeautifulSoup as soup
    import pandas as pd
    
    # Search URL of the first results page. The space in the query is encoded
    # as "+" because urlopen() rejects URLs that contain a raw space.
    url = "https://www.amazon.in/s?k=smart+watch&page=1"

    # Module-level accumulators filled by getdata(), one list per scraped field.
    title = []
    stars = []
    rating = []
    list_price = []
    original_price = []
    # Next-page URLs recorded by the crawl loop.
    url_list = []
    
    def getdata(url):
        """Scrape one search-results page into the module-level field lists.

        Downloads ``url``, parses it with BeautifulSoup and, field by field,
        appends the text of every matching element (split on ``'>'``, so each
        entry is a list of strings) to ``title``, ``stars``, ``rating``,
        ``list_price`` and ``original_price``.  Each field is collected
        page-wide and independently of the others, so the lists are not
        guaranteed to have equal lengths or to line up per product.

        Returns the parsed page so the caller can look up the next-page link.
        """
        page_soup = soup(urlopen(url).read(), 'html.parser')

        title_tags = page_soup.findAll('span', {'class': 'a-size-medium a-color-base a-text-normal'})
        title.extend([tag.text.split(">") for tag in title_tags])

        # The last four 'a-icon-alt' spans found on the page are left out.
        star_tags = page_soup.findAll('span', {'class': 'a-icon-alt'})
        stars.extend([tag.text.split('>') for tag in star_tags[:-4]])

        rating_rows = page_soup.findAll('div', {'class': 'a-row a-size-small'})
        rating.extend([row.text.split('>') for row in rating_rows])

        price_tags = page_soup.findAll('span', {'class': 'a-price-whole'})
        list_price.extend([tag.text.split('>') for tag in price_tags])

        # The original-price text is read from a nested 'a-offscreen' span.
        crossed_tags = page_soup.findAll('span', {'class': 'a-price a-text-price'})
        original_price.extend(
            [tag.find('span', {'class': 'a-offscreen'}).text.split('>') for tag in crossed_tags]
        )

        return page_soup
        
        
    def getnextpage(a_soup):
        """Return the absolute URL of the next results page, or ``None``.

        Looks up the pagination "next" anchor in ``a_soup``.  When the anchor
        (or its ``href``) is absent, ``None`` is returned so the caller's
        ``if not url: break`` check can end the crawl, instead of the lookup
        failing with ``TypeError`` on ``None['href']``.
        """
        next_link = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
        if next_link is None:
            return None
        href = next_link.get('href')
        if not href:
            return None
        # The host is prepended to the href to form the full page URL.
        return 'http://www.amazon.in' + str(href)
            
    # Crawl the result pages one after another until no next-page URL is returned.
    while True:
        geturl = getdata(url)  # parsed page that was just scraped
        url = getnextpage(geturl)
        # Stop before recording anything when there is no further page, so
        # url_list only ever holds real URLs.
        if not url:
            break
        url_list.append(url)
        print(url)
    
       

****OUTPUT****
http://www.amazon.in/smart-watch/s?k=smart+watch&page=2
http://www.amazon.in/smart-watch/s?k=smart+watch&page=3
http://www.amazon.in/smart-watch/s?k=smart+watch&page=4
http://www.amazon.in/smart-watch/s?k=smart+watch&page=5
http://www.amazon.in/smart-watch/s?k=smart+watch&page=6
http://www.amazon.in/smart-watch/s?k=smart+watch&page=7
http://www.amazon.in/smart-watch/s?k=smart+watch&page=8
http://www.amazon.in/smart-watch/s?k=smart+watch&page=9
http://www.amazon.in/smart-watch/s?k=smart+watch&page=10
http://www.amazon.in/smart-watch/s?k=smart+watch&page=11
http://www.amazon.in/smart-watch/s?k=smart+watch&page=12
http://www.amazon.in/smart-watch/s?k=smart+watch&page=13
http://www.amazon.in/smart-watch/s?k=smart+watch&page=14
http://www.amazon.in/smart-watch/s?k=smart+watch&page=15
http://www.amazon.in/smart-watch/s?k=smart+watch&page=16
http://www.amazon.in/smart-watch/s?k=smart+watch&page=17
http://www.amazon.in/smart-watch/s?k=smart+watch&page=18
http://www.amazon.in/smart-watch/s?k=smart+watch&page=19
http://www.amazon.in/smart-watch/s?k=smart+watch&page=20


**The length is not the same for all the attributes

len(title) 306 len(stars) 286 len(rating) 286 len(list_price) 306 len(original_price) 306**

**Only when I make the length consistent am I able to create the DataFrame, but the problem is that the information is then inconsistent:**

    # Workaround: drop the last 20 entries of the three longer lists so that
    # they have the same length as stars and rating (306 -> 286).
    # Trimming only equalizes the lengths; index i of each list is still not
    # guaranteed to describe the same product.
    title = title[:-20]
    
    list_price = list_price[:-20]
    
    original_price = original_price[:-20]
    
    # Build the frame column by column from the now equal-length lists.
    df = pd.DataFrame({'Title': title, 'Stars': stars, 'Rating':rating, 'List_Price': list_price, 'Original_Price':original_price})

CodePudding user response：

Try to avoid this bunch of lists; use a more structured approach and process the data in a leaner way:

import urllib.request

# Accumulates one dict per search result across all scraped pages.
data = []


def _text_or_none(tag):
    """Return ``tag.text``, or ``None`` when the lookup found no tag."""
    return tag.text if tag is not None else None


def getdata(url):
    """Scrape one search-results page and append one record per result to ``data``.

    The page is requested with a browser-like ``User-Agent`` header and parsed
    with BeautifulSoup.  Every ``div[data-component-type="s-search-result"]``
    element yields exactly one dict with the keys ``title``, ``stars``,
    ``rating``, ``list_price`` and ``original_price``; a field whose element
    is missing inside that result is stored as ``None``, so all fields of a
    row always come from the same result container.

    Returns the parsed page so the caller can look up the next-page link.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')

    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        title = _text_or_none(e.find('h2'))

        stars_text = _text_or_none(e.find('span', {'class': 'a-icon-alt'}))
        # Keep only the part of the text before the first space.
        stars = stars_text.split(' ')[0] if stars_text is not None else None

        rating = _text_or_none(e.find('span', {'class': 'a-size-base s-underline-text'}))
        list_price = _text_or_none(e.find('span', {'class': 'a-price-whole'}))

        # The original price is read from an 'a-offscreen' span nested inside
        # the 'a-price a-text-price' span; either level may be absent.
        price_box = e.find('span', {'class': 'a-price a-text-price'})
        if price_box is not None:
            original_price = _text_or_none(price_box.find('span', {'class': 'a-offscreen'}))
        else:
            original_price = None

        data.append({
            'title': title,
            'stars': stars,
            'rating': rating,
            'list_price': list_price,
            'original_price': original_price,
        })

    return a_soup

Simply create your DataFrame from your list of dicts:

pd.DataFrame(data)

Output

title	stars	rating	list_price	original_price
Fire-Boltt Thunder Bluetooth Calling Full Touch 1.32inch Amoled LCD Smartwatch with SpO2, Heart Rate & Sleep Monitoring, 30 Sports Modes (Gold Black)			4,999	₹12,999
Fire-Boltt Beast SpO2 1.69” Industry’s Largest Display Size Full Touch Smart Watch with Blood Oxygen Monitoring, Heart Rate Monitor, Multiple Watch Faces & Long Battery Life (Black)	3.9	9,990	2,499	₹7,999
Noise ColorFit Pulse Smartwatch with 1.4" Full Touch HD Display, SpO2, Heart Rate, Sleep Monitors & 10-Day Battery - Deep Wine	4	32,619	2,499	₹4,999
Noise ColorFit Pulse Spo2 Smart Watch with 10 days battery life, 60 Watch Faces, 1.4" Full Touch HD Display Smartwatch, 24*7 Heart Rate Monitor Smart Band, Sleep Monitoring Smart Watches for Men and Women & IP68 Waterproof (Jet Black)	4	32,619	2,499	₹4,999
Noise ColorFit Ultra Bezel-Less Smart Watch with 1.75" HD TruView Display, 60 Sports Modes, SpO2, Heart Rate, Stress, REM & Sleep Monitor, Calls & SMS Quick Reply, Stock Market Info (Gunmetal Grey)	4.1	22,634	2,999	₹5,999
Noise ColorFit Ultra Smart Watch with 1.75" HD Display, Aluminium Alloy Body, 60 Sports Modes, Spo2, Lightweight, Stock Market Info, Calls & SMS Reply (Lush Olive)	4.1	22,634	3,499	₹6,400
boAt Flash Edition Smartwatch with Activity Tracker,Multiple Sports Modes,Full Touch 1.3" Screen,Sleep Monitor,Gesture, Camera & Music Control,IP68 Dust,Sweat & Splash Resistance(Lightning Black)	4.1	13,714	2,499	₹6,990

CodePudding user response：

Change your strategy to maintain consistent information. Don't extract all titles, all stars, all ratings, ... in a page. I think you should extract data for each item:

data = []

def get_data(url):
    """Skeleton: scrape one results page item by item into ``data``."""
    # Placeholder: presumably fetch `url` and parse it into `a_soup` here.
    ...

    for item in a_soup.find_all('div', {'class': 's-result-item'}):
        # Skip result containers that carry the 's-widget' class.
        if 's-widget' in item['class']:
            continue
        # extract information for each item
        title = ...
        stars = ...
        rating = ...
        price = ...
        original = ...
        data.append({'Title': title, 'Stars': stars, 'Rating': rating,
                     'List_Price': price, 'Original_Price': original})


df = pd.DataFrame(data)