I'm using the code below to scrape a table element from a URL (www.sfda.gov.sa/en/cosmetics-list), but the result comes back empty.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.sfda.gov.sa/en/cosmetics-list"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
table = soup.find('table', attrs={'class': 'table table-striped display'})
table_rows = table.find_all('tr')

rows = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [cell.text.strip() for cell in td if cell.text.strip()]
    if row:
        rows.append(row)

df = pd.DataFrame(rows, columns=["ProductName", "Category", "Country", "Company"])
print(df)
When I run the code above, I don't get any data.
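One quick way to confirm what's happening is to inspect the raw HTML that requests receives; if the table rows are injected by JavaScript, they will be absent from the static response. A minimal check along these lines (this snippet is illustrative, not from the original post):

import requests

res = requests.get("https://www.sfda.gov.sa/en/cosmetics-list")
print(res.status_code)
# A count of zero (or only an empty table shell) suggests the rows are
# rendered client-side and must be fetched from the underlying API instead.
print(res.text.count("<tr"))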
CodePudding user response:
The data is loaded via XHR, so you should call that endpoint directly to get your information:

import requests
import pandas as pd

url = 'https://www.sfda.gov.sa/GetCosmetics.php?page=1'
pd.DataFrame(requests.get(url).json()['results'])
Example

Loop over the number of pages with range() and collect all of the data:
import requests
import pandas as pd

data = []
for i in range(1, 5):
    url = f'https://www.sfda.gov.sa/GetCosmetics.php?page={i}'
    data.extend(requests.get(url).json()['results'])

pd.DataFrame(data)
Output

 | id | cosmatics_Id | productNotificationsId | productNumber | status | productArName | productEnName | brandName | catArabic | catEnglish | counrtyAr | counrtyEn | manufactureType | packageVolume | unitAr | unitEn | barcode | manufacturearabicname | manufactureenglishname | listedNameAr | listedNameEn | imageUrl | batchNumber | country_of_manufacturing_English | country_of_manufacturing_Arabic | productCreationDate | productexpireddate | subCategory1 | subCategoryAR | storageCircumstances | protectionInstructions | usageInstructions | notes | mainCommercialRecordNumber | manufacturingLicenseNumber
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 549105 | 58472 | 10518 | 2020-011019101291-245945 | Active | ليتسيا كوبيبا | Litsea cubeba oil | MOKSHA LIFE STYLE | منتجات العناية بالبشرة | Skin products | الهند | India | Foreign | 250 | ملي لتر | Milliliter (ml) | 0 | موكشا لايف ستايل برودكت | Moksha lifestyle products | مؤسسة شجور الارض للتجارة | shojoor alearth trading | | | India | الهند | 2020-09-28T09:40:46 | 2025-10-05T09:40:46 | Perfumes | العطور | room temperature | تاريخ انتهاء الصلاحية | الاستعمال الخارجي | | 7016000957 | FR555666
... (rows 1–8 omitted) ...
9 | 84386 | 58481 | 4031 | 2016-0120132-048982 | Active | جودي ثيرابي سيستيم للشعر بالبروتين | Judy protein & Silk hair therapy system | Judy | منتجات العناية بالشعر وفروة الرأس | Hair and scalp products | الولايات المتحدة | United States | Foreign | 1000 | ملي لتر | Milliliter (ml) | 641243925950 | معامل ناتيورال كوزماتيك | natural cosmetic labs USA Inc., | شركه بيت جودي الدوليه للتجارة | bait gody for trading co. | | | United States | الولايات المتحدة | 2016-12-25T14:40:44 | 2027-01-01T14:40:44 | Hair styling products | منتجات تصفيف الشعر | | | | | 7007289163 | FR555666
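Note that range(1, 5) in the example above only fetches the first four pages. The JSON payload also appears to include a pageCount field (the concurrent answer below reads it), so, if that holds, the loop can size itself instead of using a hardcoded bound. A minimal sketch under that assumption:

import requests
import pandas as pd

base = 'https://www.sfda.gov.sa/GetCosmetics.php'
# pageCount is assumed to be present in the endpoint's JSON payload,
# as the concurrent answer below relies on the same field.
page_count = requests.get(f'{base}?page=1').json()['pageCount']

data = []
for i in range(1, page_count + 1):
    data.extend(requests.get(f'{base}?page={i}').json()['results'])

df = pd.DataFrame(data)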
CodePudding user response:
You can use concurrent.futures to scrape the pages concurrently and, once all pages are complete, concat the results into a single dataframe:
import concurrent.futures
import json
import os

import pandas as pd
import requests


class Scrape:
    def __init__(self):
        self.root_url = "https://www.sfda.gov.sa/GetCosmetics.php?"
        self.pages = self.get_page_count()
        self.processors = os.cpu_count()

    def get_page_count(self) -> int:
        # The endpoint reports the total number of pages in its JSON payload.
        return self.get_data(url=self.root_url).get("pageCount")

    @staticmethod
    def get_data(url: str) -> dict:
        with requests.Session() as session:
            response = session.get(url, timeout=30)
            response.raise_for_status()  # raise on HTTP errors instead of parsing a bad body
            return json.loads(response.text)

    def process_pages(self) -> pd.DataFrame:
        page_range = list(range(1, self.pages + 1))
        with concurrent.futures.ProcessPoolExecutor(max_workers=self.processors) as executor:
            # Fetch and parse each page in its own process, then stitch the
            # per-page dataframes together.
            return pd.concat(executor.map(self.parse_data, page_range)).reset_index(drop=True)

    def parse_data(self, page: int) -> pd.DataFrame:
        url = f"{self.root_url}page={page}"
        data = self.get_data(url=url)
        return pd.json_normalize(data=data, record_path="results")


if __name__ == "__main__":
    final_df = Scrape().process_pages()
    print(final_df)
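Since the work here is network-bound rather than CPU-bound, a ThreadPoolExecutor is usually a lighter-weight fit than ProcessPoolExecutor (no process spawning, no pickling dataframes between processes). A minimal variant that reuses the Scrape class above; process_pages_threaded and the max_workers value are illustrative choices, not part of the original answer:

import concurrent.futures

import pandas as pd

def process_pages_threaded(scraper) -> pd.DataFrame:
    # Threads suit I/O-bound HTTP work: the GIL is released while waiting
    # on the network, so requests overlap without extra processes.
    pages = range(1, scraper.pages + 1)
    # max_workers=8 is an arbitrary illustrative choice.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        return pd.concat(executor.map(scraper.parse_data, pages)).reset_index(drop=True)

final_df = process_pages_threaded(Scrape())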