I'm using the code below to scrape a table element from a URL (www.sfda.gov.sa/en/cosmetics-list), but the result comes back empty.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.sfda.gov.sa/en/cosmetics-list"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.find('table', attrs={'class': 'table table-striped display'})
table_rows = table.find_all('tr')

rows = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [cell.text.strip() for cell in td if cell.text.strip()]
    if row:
        rows.append(row)

df = pd.DataFrame(rows, columns=["ProductName", "Category", "Country", "Company"])
print(df)
When I run the code above, I don't get any data.
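One quick way to confirm what's happening is to inspect the raw HTML that requests receives; if the table rows are injected by JavaScript, they will be absent from the static response. A minimal check along these lines (this snippet is illustrative, not from the original post):

import requests

res = requests.get("https://www.sfda.gov.sa/en/cosmetics-list")
print(res.status_code)
# A count of zero (or only an empty table shell) suggests the rows are
# rendered client-side and must be fetched from the underlying API instead.
print(res.text.count("<tr"))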
CodePudding user response:
The data is loaded via XHR, so you should call that endpoint directly to get your information:

import requests
import pandas as pd

url = 'https://www.sfda.gov.sa/GetCosmetics.php?page=1'
pd.DataFrame(requests.get(url).json()['results'])
Example

Loop over the number of pages with range() and collect all of the data:
import requests
import pandas as pd

data = []
for i in range(1, 5):
    url = f'https://www.sfda.gov.sa/GetCosmetics.php?page={i}'
    data.extend(requests.get(url).json()['results'])

pd.DataFrame(data)
Output

 | id | cosmatics_Id | productNotificationsId | productNumber | status | productArName | productEnName | brandName | catArabic | catEnglish | counrtyAr | counrtyEn | manufactureType | packageVolume | unitAr | unitEn | barcode | manufacturearabicname | manufactureenglishname | listedNameAr | listedNameEn | imageUrl | batchNumber | country_of_manufacturing_English | country_of_manufacturing_Arabic | productCreationDate | productexpireddate | subCategory1 | subCategoryAR | storageCircumstances | protectionInstructions | usageInstructions | notes | mainCommercialRecordNumber | manufacturingLicenseNumber
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 549105 | 58472 | 10518 | 2020-011019101291-245945 | Active | ليتسيا كوبيبا | Litsea cubeba oil | MOKSHA LIFE STYLE | منتجات العناية بالبشرة | Skin products | الهند | India | Foreign | 250 | ملي لتر | Milliliter (ml) | 0 | موكشا لايف ستايل برودكت | Moksha lifestyle products | مؤسسة شجور الارض للتجارة | shojoor alearth trading | | | India | الهند | 2020-09-28T09:40:46 | 2025-10-05T09:40:46 | Perfumes | العطور | room temperature | تاريخ انتهاء الصلاحية | الاستعمال الخارجي | | 7016000957 | FR555666
... (rows 1–8 omitted) ...
9 | 84386 | 58481 | 4031 | 2016-0120132-048982 | Active | جودي ثيرابي سيستيم للشعر بالبروتين | Judy protein & Silk hair therapy system | Judy | منتجات العناية بالشعر وفروة الرأس | Hair and scalp products | الولايات المتحدة | United States | Foreign | 1000 | ملي لتر | Milliliter (ml) | 641243925950 | معامل ناتيورال كوزماتيك | natural cosmetic labs USA Inc., | شركه بيت جودي الدوليه للتجارة | bait gody for trading co. | | | United States | الولايات المتحدة | 2016-12-25T14:40:44 | 2027-01-01T14:40:44 | Hair styling products | منتجات تصفيف الشعر | | | | | 7007289163 | FR555666
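Note that range(1, 5) in the example above only fetches the first four pages. The JSON payload also appears to include a pageCount field (the concurrent answer below reads it), so, if that holds, the loop can size itself instead of using a hardcoded bound. A minimal sketch under that assumption:

import requests
import pandas as pd

base = 'https://www.sfda.gov.sa/GetCosmetics.php'
# pageCount is assumed to be present in the endpoint's JSON payload,
# as the concurrent answer below relies on the same field.
page_count = requests.get(f'{base}?page=1').json()['pageCount']

data = []
for i in range(1, page_count + 1):
    data.extend(requests.get(f'{base}?page={i}').json()['results'])

df = pd.DataFrame(data)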
CodePudding user response:
You can use concurrent.futures to scrape the pages concurrently and, once all pages are complete, concat the results into a single dataframe:
import concurrent.futures
import json
import os

import pandas as pd
import requests


class Scrape:
    def __init__(self):
        self.root_url = "https://www.sfda.gov.sa/GetCosmetics.php?"
        self.pages = self.get_page_count()
        self.processors = os.cpu_count()

    def get_page_count(self) -> int:
        # The endpoint reports the total number of pages in its JSON payload.
        return self.get_data(url=self.root_url).get("pageCount")

    @staticmethod
    def get_data(url: str) -> dict:
        with requests.Session() as session:
            response = session.get(url, timeout=30)
            response.raise_for_status()  # raise on HTTP errors instead of parsing a bad body
            return json.loads(response.text)

    def process_pages(self) -> pd.DataFrame:
        page_range = list(range(1, self.pages + 1))
        with concurrent.futures.ProcessPoolExecutor(max_workers=self.processors) as executor:
            # Fetch and parse each page in its own process, then stitch the
            # per-page dataframes together.
            return pd.concat(executor.map(self.parse_data, page_range)).reset_index(drop=True)

    def parse_data(self, page: int) -> pd.DataFrame:
        url = f"{self.root_url}page={page}"
        data = self.get_data(url=url)
        return pd.json_normalize(data=data, record_path="results")


if __name__ == "__main__":
    final_df = Scrape().process_pages()
    print(final_df)
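Since the work here is network-bound rather than CPU-bound, a ThreadPoolExecutor is usually a lighter-weight fit than ProcessPoolExecutor (no process spawning, no pickling dataframes between processes). A minimal variant that reuses the Scrape class above; process_pages_threaded and the max_workers value are illustrative choices, not part of the original answer:

import concurrent.futures

import pandas as pd

def process_pages_threaded(scraper) -> pd.DataFrame:
    # Threads suit I/O-bound HTTP work: the GIL is released while waiting
    # on the network, so requests overlap without extra processes.
    pages = range(1, scraper.pages + 1)
    # max_workers=8 is an arbitrary illustrative choice.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        return pd.concat(executor.map(scraper.parse_data, pages)).reset_index(drop=True)

final_df = process_pages_threaded(Scrape())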