Home > other >  Prices webscraping using BeautifulSoup
Prices webscraping using BeautifulSoup

Time:02-19

Goal: I'm trying to scrape prices. Expected output: two columns: 1) productName (OK) and 2) price (not OK: I get NaN). I tried the following:

import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import time
# Asker's attempt: fetch the product page, collect the <h1> elements and the
# <span class="rs-unit"> elements, and print them as a two-column DataFrame
# together with the elapsed time.
urllib3.disable_warnings()
t0 = time.time() 

# NOTE(review): `urlopen` is called here, but none of the imports shown in
# this snippet bring that name into scope.
page_proximus = urlopen("https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html")
soup = BeautifulSoup(page_proximus, 'html.parser')

# Empty two-column frame that the scraped results are appended to at the end.
scrap_list=pd.DataFrame(columns =['Item_name','Item_price'])

# NOTE(review): the next line is not valid Python -- a string literal is
# followed directly by `str(page_list)` with no operator between them, and
# `page_list` is not defined anywhere in this snippet.
url = 'https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html'  str(page_list)
# `req` and `res` only alias the urllib3 module and its `request` attribute;
# nothing is called, so these two lines do not issue an HTTP request.
req = urllib3
res = req.request
# Parses `page_proximus` a second time; `soup` is not read again below.
soup = BeautifulSoup(page_proximus, 'html.parser')

# Download the same URL again, decode it as UTF-8 and parse it; `bs` is the
# tree that the lookups below actually use.
html = urlopen('https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html').read().decode("utf-8")
bs = BeautifulSoup(html, 'html.parser')
# Product name: every <h1> element, passed to a frame with one column label.
scrap_name = bs.find_all(["h1"])
product_name=pd.DataFrame(scrap_name,columns =['Item_name'])
     
# Price: every <span class="rs-unit"> element, passed to a frame with one
# column label.
# NOTE(review): the question reports NaN for this column and the answer says
# the data is in <meta> tags, so this lookup presumably matches nothing on the
# fetched HTML -- confirm against the page source.
scrap_price = bs.find_all ("span",{'class': 'rs-unit'})
product_price=pd.DataFrame(scrap_price,columns =['Item_price'])

# Place the two columns side by side (axis=1) and append them to scrap_list.
# NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat is the
# documented replacement.
scrap_list=scrap_list.append(pd.concat([product_name['Item_name'], product_price['Item_price']],
                                  axis=1))
# Report the elapsed wall-clock seconds, then the collected table.
t1 = time.time()
r=t1-t0            
print(r)
print(scrap_list)

CodePudding user response:

The data is within the <meta> tags.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Fetch the product page and read the device name and price from its <meta>
# tags, timing the whole fetch-and-parse step.
t0 = time.time()

# A timeout keeps the script from hanging indefinitely on an unresponsive
# server, and raise_for_status() surfaces HTTP errors (4xx/5xx) instead of
# silently parsing an error page.
page_proximus = requests.get(
    "https://www.proximus.be/fr/id_cr_apple-iphone-13-256gb-pink/particuliers/equipement/boutique/apple-iphone-13-256gb-pink.html",
    timeout=30,
)
page_proximus.raise_for_status()
soup = BeautifulSoup(page_proximus.text, 'html.parser')

# One row per og:description <meta> tag: the name and price come from the
# 'content' attribute of the device_model / device_price <meta> tags looked
# up beneath it.
rows = [
    {
        'Item_name': meta.find('meta', {'name': 'device_model'})['content'],
        'Item_price': meta.find('meta', {'name': 'device_price'})['content'],
    }
    for meta in soup.find_all('meta', {'property': 'og:description'})
]

t1 = time.time()
r = t1 - t0
print(r)  # elapsed wall-clock seconds
df = pd.DataFrame(rows)
print(df)

Output:

              Item_name Item_price
0  iPhone 13 256GB Pink    1029,99
  • Related