Home > Software engineering >  How do I extract the Discount (% off ) value?
How do I extract the Discount (% off ) value?

Time:04-09

I am getting the desired results but I'm not sure how to extract the percentage value from the listing as it doesn't have a class.

from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib

data = []


def getdata(url):
    """Fetch one search-results page and record every listing in ``data``.

    One dict per search-result card is appended to the module-level ``data``
    list. The parsed page is returned so the caller can look up the
    next-page link.
    """
    # ``import urllib`` alone does not load the ``request`` submodule.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')

    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        # find() returns None for a missing element, so ``.text`` raises
        # AttributeError; that is the only failure treated as "not present".
        try:
            title = e.find('h2').text
        except AttributeError:
            title = None

        try:
            sponsored = e.find('span', {'class': 'a-color-secondary'}).text
        except AttributeError:
            sponsored = None

        try:
            limited_deal = e.find('span', {'class': 'a-badge-label-inner a-text-ellipsis'}).find('span', {'class': 'a-badge-text'}).text
        except AttributeError:
            limited_deal = None

        # Extraction for this field has not been written yet; define it so
        # the dict below does not raise NameError.
        list_price = None

        data.append({
            'title': title,
            'list_price': list_price,
            'sponsored': sponsored,
            'limited_deal': limited_deal,
        })

    return a_soup

def getnextpage(a_soup):
    """Return the absolute URL of the next results page, or ``None``.

    ``a_soup`` is a parsed search page. ``None`` is returned when the page
    has no "next" pagination link or that link has no ``href``.
    """
    try:
        page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
    except (TypeError, KeyError):
        # TypeError: find() returned None; KeyError: the link has no href.
        return None
    # The href is site-relative, so prefix it with the host.
    return 'http://www.amazon.in' + str(page)


keywords = ['earphones']

for k in keywords:
    # Start at the first results page for this keyword.
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        # Scrape the current page, then follow its "next" link.
        geturl = getdata(url)
        url = getnextpage(geturl)

        # No next-page link means the last page has been processed.
        if not url:
            break
        print(url)

How do I get the discount (% off)? I have not written any code for that yet; the rest of the results are showing up correctly.

The value is highlighted in the screenshot.

CodePudding user response:

You can get the discount percentage from the `span` that immediately follows the `.a-letter-space` element:

from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib

data = []


def getdata(url):
    """Fetch one search-results page and record every listing in ``data``.

    One dict per search-result card is appended to the module-level ``data``
    list, and each discount label found (e.g. ``(70% off)``) is printed.
    The parsed page is returned so the caller can look up the next-page link.
    """
    # ``import urllib`` alone does not load the ``request`` submodule.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')

    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        # find()/select_one() return None for a missing element, so ``.text``
        # raises AttributeError; only that is treated as "not present".
        try:
            title = e.find('h2').text
        except AttributeError:
            title = None

        try:
            sponsored = e.find('span', {'class': 'a-color-secondary'}).text
        except AttributeError:
            sponsored = None

        try:
            limited_deal = e.find('span', {'class': 'a-badge-label-inner a-text-ellipsis'}).find('span', {'class': 'a-badge-text'}).text
        except AttributeError:
            limited_deal = None

        # '+' is the CSS adjacent-sibling combinator: the discount label is
        # the span right after the empty ``.a-letter-space`` spacer.
        try:
            discount = e.select_one('.a-letter-space + span').text
        except AttributeError:
            # Reset only the discount; a missing discount must not wipe out
            # an already extracted limited_deal value.
            discount = None
        else:
            print(discount)

        data.append({
            'title': title,
            'discount': discount,
            'sponsored': sponsored,
            'limited_deal': limited_deal,
        })

    return a_soup

def getnextpage(a_soup):
    """Return the absolute URL of the next results page, or ``None``.

    ``a_soup`` is a parsed search page. ``None`` is returned when the page
    has no "next" pagination link or that link has no ``href``.
    """
    try:
        page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
    except (TypeError, KeyError):
        # TypeError: find() returned None; KeyError: the link has no href.
        return None
    # The href is site-relative, so prefix it with the host.
    return 'http://www.amazon.in' + str(page)


keywords = ['earphones']

for k in keywords:
    # Start at the first results page for this keyword.
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        # Scrape the current page, then follow its "next" link.
        geturl = getdata(url)
        url = getnextpage(geturl)

        # No next-page link means the last page has been processed.
        if not url:
            break

Output:

(70% off)
(56% off)
(70% off)
(70% off)
(63% off)
(25% off)
(53% off)
(50% off)
(63% off)
(43% off)
(57% off)
(62% off)
(50% off)
(60% off)
(69% off)
(50% off)
(41% off)
(60% off)
(70% off)

... so on

If you need only the digits:

from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib

data = []


def getdata(url):
    """Fetch one search-results page and record every listing in ``data``.

    One dict per search-result card is appended to the module-level ``data``
    list, and the numeric part of each discount label found (``70`` for
    ``(70% off)``) is printed. The parsed page is returned so the caller can
    look up the next-page link.
    """
    # ``import urllib`` alone does not load the ``request`` submodule.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')

    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        # find()/select_one() return None for a missing element, so ``.text``
        # raises AttributeError; only that is treated as "not present".
        try:
            title = e.find('h2').text
        except AttributeError:
            title = None

        try:
            sponsored = e.find('span', {'class': 'a-color-secondary'}).text
        except AttributeError:
            sponsored = None

        try:
            limited_deal = e.find('span', {'class': 'a-badge-label-inner a-text-ellipsis'}).find('span', {'class': 'a-badge-text'}).text
        except AttributeError:
            limited_deal = None

        # '+' is the CSS adjacent-sibling combinator: the discount label is
        # the span right after the empty ``.a-letter-space`` spacer.
        try:
            # "(70% off)" -> "(70" -> "70"
            discount = e.select_one('.a-letter-space + span').text.split('%')[0].replace('(', '')
        except AttributeError:
            # Reset only the discount; a missing discount must not wipe out
            # an already extracted limited_deal value.
            discount = None
        else:
            print(discount)

        data.append({
            'title': title,
            'discount': discount,
            'sponsored': sponsored,
            'limited_deal': limited_deal,
        })

    return a_soup

def getnextpage(a_soup):
    """Return the absolute URL of the next results page, or ``None``.

    ``a_soup`` is a parsed search page. ``None`` is returned when the page
    has no "next" pagination link or that link has no ``href``.
    """
    try:
        page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
    except (TypeError, KeyError):
        # TypeError: find() returned None; KeyError: the link has no href.
        return None
    # The href is site-relative, so prefix it with the host.
    return 'http://www.amazon.in' + str(page)


keywords = ['earphones']

for k in keywords:
    # Start at the first results page for this keyword.
    url = 'https://www.amazon.in/s?k=' + k
    while True:
        # Scrape the current page, then follow its "next" link.
        geturl = getdata(url)
        url = getnextpage(geturl)

        # No next-page link means the last page has been processed.
        if not url:
            break

Output:

70
56
70
70
63
25
53
50
63
50
57
62
60
69
50
43
60
70
41
61
53
61
57
53
61
70
70
60
75
57
75
18
62
61
38
60
80
71
70
60
81
47
70
53
57
62
53
64
57
37
80
42
83
55
53
78
63

CodePudding user response:

You could use the CSS selector `.a-letter-space + span` with BeautifulSoup's `select` method and loop through the results to extract the discount text.

Here is a sample code:

import requests
from bs4 import BeautifulSoup

def extract_discount(discount_str):
    """
    Extract the discount number from an unformatted discount label
    like: (70% off)

    ``discount_str`` may be either an element exposing the label through a
    ``.text`` attribute or the label as a plain string.
    Returns: number extracted as string. Ex: 70
    """
    # Fall back to the argument itself when it is already a plain string.
    text = getattr(discount_str, 'text', discount_str)
    # "(70% off)" -> "(70" -> "70"; strip stray whitespace around the digits.
    return text.split('%')[0].replace('(', '').strip()

# using googlebot's user agent
headers = {
    'User-Agent': 'Mozilla/5.0 AppleWebKit/27.0.1453 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/27.0.1453'
}
# A timeout keeps the script from hanging forever on a stalled connection.
res = requests.get('https://www.amazon.in/s?k=earphones', headers=headers, timeout=10)
# Fail loudly on an error/blocked response instead of parsing an error page.
res.raise_for_status()

soup = BeautifulSoup(res.content, 'html.parser')

# '+' is the CSS adjacent-sibling combinator: select the span that directly
# follows each ``.a-letter-space`` spacer element.
for discount in soup.select('.a-letter-space + span'):
    # just remove extract_discount if you just want (70% off)
    print(extract_discount(discount))
  • Related