I am trying to scrape data such as name, price, features, discount and ratings from the Flipkart website. The scraping itself works, but when I try to convert the results to a DataFrame using pandas I get the error ValueError: Length of values (1670) does not match length of index (1440). I am searching for mobiles; the first page has a total of 24 results, and an additional 5 results appear at the bottom. I checked, and the name field picks up 24 items, but the price field picks up 29, because it also includes the 5 additional items. What should I do so that the price field does not pick up the items at the bottom? Below is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Initial fetch of the first results page. Both `res` and `soup` are rebound
# on every iteration of the page loop below.
res = requests.get("https://www.flipkart.com/search?q=samsung mobiles&sid=tyy,4io&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_1_2_na_na_ps&otracker1=AS_QueryStore_OrganicAutoSuggest_1_2_na_na_ps&as-pos=1&as-type=RECENT&suggestionId=samsung mobiles|Mobiles&equestId=346f99ae-2791-4d89-b63a-2e4af06e0a63")
soup = BeautifulSoup(res.content, 'html.parser')

# One accumulator list per field. Each list is filled by its own find_all()
# call, so nothing here forces the lists to end up with the same length.
product = []
price = []
actual_price = []
features = []
discount = []
rating = []
stars = []

lst = ['samsung', 'redmi', 'realme', 'vivo', 'motto']
n_pages = 0
for page in range(1, 15):
    n_pages += 1
    # NOTE(review): `{lst}` interpolates the repr of the whole list into the
    # query string, not a single brand -- confirm this is intended.
    res = requests.get(f"https://www.flipkart.com/search?q={lst} mobiles&sid=tyy,4io&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_1_2_na_na_ps&otracker1=AS_QueryStore_OrganicAutoSuggest_1_2_na_na_ps&as-pos=1&as-type=RECENT&suggestionId=samsung mobiles|Mobiles&equestId=346f99ae-2791-4d89-b63a-2e4af06e0a63&page={page}")
    soup = BeautifulSoup(res.content, 'html.parser')
    mobile_data = soup.find_all("div", class_="_1YokD2")
    for data in mobile_data:
        # Every field is collected from the whole container independently of
        # the others; an element that matches a class outside a product card,
        # or a card missing a field, shifts that list relative to the rest.
        for i in data.find_all("div", class_="_4rR01T"):
            product.append(i.text)
        for i in data.find_all("div", class_="_30jeq3"):
            price.append(i.text)
        for i in data.find_all("div", class_="_3I9_wc"):
            actual_price.append(i.text)
        for i in data.find_all("ul", class_="_1xgFaf"):
            features.append(i.text)
        for i in data.find_all("span", class_="_3Ay6Sb"):
            discount.append(i.text)
        for i in data.find_all("span", class_="_2_R_DZ"):
            rating.append(i.text)
        for i in data.find_all("div", class_="_3LWZlK"):
            stars.append(i.text)

# Column-by-column assignment: the first column fixes the index length, and
# pandas raises ValueError for any later list of a different length.
df = pd.DataFrame()
df['product'] = product
df['price'] = price
df['actual_price'] = actual_price
df['features'] = features
df['discount'] = discount
df['rating'] = rating
df['stars'] = stars
This is the error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [15], in <cell line: 3>()
1 df['product']=product
2 df['price']=price
----> 3 df['actual_price']=actual_price
4 df['features']=features
5 df['discount']=discount
File d:\datascience-main\datascience-main\ds\lib\site-packages\pandas\core\frame.py:3655, in DataFrame.__setitem__(self, key, value)
3652 self._setitem_array([key], value)
3653 else:
3654 # set column
-> 3655 self._set_item(key, value)
File d:\datascience-main\datascience-main\ds\lib\site-packages\pandas\core\frame.py:3832, in DataFrame._set_item(self, key, value)
3822 def _set_item(self, key, value) -> None:
3823 """
3824 Add series to DataFrame in specified column.
3825
(...)
3830 ensure homogeneity.
3831 """
-> 3832 value = self._sanitize_column(value)
3834 if (
3835 key in self.columns
3836 and value.ndim == 1
3837 and not is_extension_array_dtype(value)
3838 ):
3839 # broadcast across multiple columns if necessary
3840 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File d:\datascience-main\datascience-main\ds\lib\site-packages\pandas\core\frame.py:4535, in DataFrame._sanitize_column(self, value)
4532 return _reindex_for_setitem(value, self.index)
4534 if is_list_like(value):
-> 4535 com.require_length_match(value, self.index)
4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)
File d:\datascience-main\datascience-main\ds\lib\site-packages\pandas\core\common.py:557, in require_length_match(data, index)
553 """
554 Check the length of data matches the length of the index.
555 """
556 if len(data) != len(index):
--> 557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
ValueError: Length of values (1233) does not match length of index (1344)
CodePudding user response:
Try to avoid this bunch of separate lists; in 90% of cases they lead to errors because of differing lengths. Instead, iterate over each row, scrape all the information needed, and store it in a single list of dicts. Also try to avoid selecting by dynamic class names and rely more on the HTML structure.
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
# One dict per product card, so every row carries all of its own fields and
# the columns cannot drift out of alignment.
data = []
n_pages = 0
for page in range(1, 15):
    n_pages += 1
    # `lst` is the brand list defined in the question's script.
    res = requests.get(f"https://www.flipkart.com/search?q={lst} mobiles&sid=tyy,4io&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_1_2_na_na_ps&otracker1=AS_QueryStore_OrganicAutoSuggest_1_2_na_na_ps&as-pos=1&as-type=RECENT&suggestionId=samsung mobiles|Mobiles&equestId=346f99ae-2791-4d89-b63a-2e4af06e0a63&page={page}")
    soup = BeautifulSoup(res.content, 'html.parser')
    # Select the <a> elements whose text contains "Add to Compare"; each
    # match is treated as one result row.
    mobile_data = soup.select('a:-soup-contains("Add to Compare")')
    for e in mobile_data:
        # Text fragments of the price container, in document order.
        # NOTE(review): presumably prices[0] is the currency-prefixed first
        # price and prices[2] the second price -- verify against live markup.
        prices = list(e.select_one('div.row div.col div.col > div').stripped_strings)
        # Optional elements: look each one up once and fall back to None.
        discount_tag = e.select_one('div.row div.col div.col span')
        stars_tag = e.select_one('[id^="productRating"]')
        ratings_tag = e.select_one('span:-soup-contains("Ratings")')
        data.append({
            'title': e.select_one('div.row div.col > div').text,
            # [1:] drops the leading character of the first fragment.
            'price_actual': prices[0][1:],
            # prices[2] exists only when there are more than two fragments;
            # otherwise reuse the first price.
            'price': prices[2] if len(prices) > 2 else prices[0][1:],
            'features': list(e.ul.stripped_strings),
            'discount': discount_tag.text.split('%')[0] if discount_tag else None,
            'stars': stars_tag.text if stars_tag else None,
            'ratings': ratings_tag.text.split()[0] if ratings_tag else None,
        })
pd.DataFrame(data)
Output
title | price_actual | price | features | discount | stars | ratings | |
---|---|---|---|---|---|---|---|
0 | realme C25_Y (Metal Grey, 128 GB) | 11,999 | 13,999 | ['4 GB RAM | 128 GB ROM', '16.51 cm (6.5 inch) HD Display', '50MP 2MP 2MP | 8MP Front Camera', '5000 mAh LiPo Battery', 'Unisoc T618 Processor', '1 Year Domestic Warranty'] | 14 | 4.4 | 36,858 |
1 | realme C25_Y (Metal Grey, 64 GB) | 10,999 | 12,999 | ['4 GB RAM | 64 GB ROM', '16.51 cm (6.5 inch) HD Display', '50MP 2MP 2MP | 8MP Front Camera', '5000 mAh LiPo Battery', 'Unisoc T618 Processor', '1 Year Domestic Warranty'] | 15 | 4.4 | 36,858 |
2 | realme C31 (Dark Green, 32 GB) | 9,299 | 10,999 | ['3 GB RAM | 32 GB ROM | Expandable Upto 1 TB', '16.56 cm (6.52 inch) HD Display', '13MP 2MP 0.3MP | 5MP Front Camera', '5000 mAh Battery', 'Unisoc T612 Processor', '1 Year Warranty for Phone and 6 Months Warranty for In-Box Accessories'] | 15 | 4.6 | 29,060 |
3 | realme C31 (Light Silver, 32 GB) | 9,299 | 10,999 | ['3 GB RAM | 32 GB ROM | Expandable Upto 1 TB', '16.56 cm (6.52 inch) HD Display', '13MP 2MP 0.3MP | 5MP Front Camera', '5000 mAh Battery', 'Unisoc T612 Processor', '1 Year Warranty for Phone and 6 Months Warranty for In-Box Accessories'] | 15 | 4.6 | 29,060 |
4 | realme C31 (Dark Green, 64 GB) | 9,999 | 11,999 | ['4 GB RAM | 64 GB ROM | Expandable Upto 1 TB', '16.56 cm (6.52 inch) HD Display', '13MP 2MP 0.3MP | 5MP Front Camera', '5000 mAh Battery', 'Unisoc T612 Processor', '1 Year Warranty for Phone and 6 Months Warranty for In-Box Accessories'] | 16 | 4.4 | 23,910 |
...