Home > database >  Extracting Carbon offset projects from website using beautiful soup and getting nothing
Extracting Carbon offset projects from website using beautiful soup and getting nothing

Time:04-18

I'm trying to extract the data from this website ('https://alliedoffsets.com/#/profile/2'). It has many such projects and I want to get the values of Estimated Average Wholesale Price and Estimated Annual Emission Reduction. When I try to print the page using Beautiful Soup, it is not giving those tags and is returning empty values. I know it could be a basic thing, but I'm stuck. Maybe the data is getting populated on the website using JavaScript, but I cannot figure out a way to handle that.

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://alliedoffsets.com/#/profile/1'
response = requests.get(url)
page_html = response.content
soup = BeautifulSoup(page_html, 'html.parser')

# Look for the screen-reader-only table header. This prints None because the
# page content is rendered client-side by JavaScript, so that markup is not
# present in the raw HTML returned by requests.
tab = soup.find("thead", {"class": "sr-only"})
print(tab)

CodePudding user response:

The data you see is loaded via JavaScript from an external URL. To load the data using the requests/json modules, you can use this example:

import json
import requests

# The numbers shown on the profile page are fetched from this backing API.
api_url = "https://carbon-registry.herokuapp.com/1.0/provider/1"
params = {
    "embedded": '{"provider_capital_types":1,"provider_capital_types.capital_type":1,"provider_countries":1,"provider_countries.country":1,"contacts":1,"contacts.office":1,"provider_currencies":1,"provider_currencies.currency":1,"provider_languages":1,"provider_languages.language":1,"offices":1,"offices.country":1,"provider_sectors":1,"provider_sectors.sector":1,"provider_social_medias":1,"provider_social_medias.social_media":1,"provider_provider_types":1,"provider_provider_types.provider_type":1,"provider_stats":1,"provider_stats.stat":1,"provider_descriptions":1,"provider_descriptions.description":1,"relationships":1,"relationships.description":1,"provider_statuses":1,"provider_statuses.status":1}'
}
headers = {"Authorization": "Bearer 8hCH4MuPCa5t6ra8wtAz8xOQfJdjLvDVZk07ib60TZ"}

response = requests.get(api_url, headers=headers, params=params)
data = response.json()

# uncomment to print all data:
# print(json.dumps(data, indent=4))

# Index each stat entry by its human-readable name for easy lookup.
stats = {}
for entry in data["provider_stats"]:
    stats[entry["stat"]["name"]] = entry

print(f"{stats['Estimated Direct Price']['value']=}")
print(f"{stats['Estimated Annual Emission Reduction']['value']=}")

Prints:

stats['Estimated Direct Price']['value']=5.0630778182036105
stats['Estimated Annual Emission Reduction']['value']=11603

CodePudding user response:

The web page is rendered in JavaScript so the HTML elements cannot be extracted directly using BeautifulSoup. Selenium can be used to extract the rendered HTML then search for elements by ID, class, XPath, etc.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import re

url = 'https://alliedoffsets.com/#/profile/1'

# Download/locate a matching chromedriver and launch Chrome.
s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)

# web driver goes to page
driver.get(url)

# use WebDriverWait to wait until page is rendered

# find Estimated Average Wholesale Price
elt = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'direct-price-panel'))
    )
# extract just the price from the text
# BUG FIX: the regex was missing the '+' quantifier ('\S ' matched a single
# non-space character followed by a literal space); '\S+' captures the whole
# '$5.06'-style token shown in the expected output.
print(re.sub(r'.*(\$\S+).*', r'\1', elt.text))

# find Estimated Annual Emission Reduction
elt = driver.find_element(By.XPATH, "//*[strong[contains(., 'Estimated Annual Emission Reduction')]]")
print(elt.text.split(":")[1])

Output:

 $5.06
 11603 tCO2

CodePudding user response:

The website is dynamic and under Cloudflare protection, so you can follow the next example, using Selenium with bs4, to grab the right data.

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import time

url = 'https://alliedoffsets.com/#/profile/1'

# Chrome options aimed at reducing automation fingerprints, to get past the
# site's Cloudflare protection.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

driver.get(url)
# Crude wait for the JavaScript-rendered content to appear; an explicit
# WebDriverWait would be more robust than a fixed sleep.
time.sleep(5)

soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()

Price = soup.select_one('p#direct-price-panel').contents[1].strip().replace('/tCO2e', '')
Reduction = soup.select('.panel')[-1].contents[1].strip().replace('tCO2', '')
# BUG FIX: the original lines were missing the '+' string-concatenation
# operator (a string literal immediately followed by a call is a SyntaxError).
print('Estimated Average Wholesale Price: ' + str(Price))
print('Estimated Annual Emission Reduction: ' + str(Reduction))

Output:

Estimated Average Wholesale Price: $5.06
Estimated Annual Emission Reduction: 11603 
  • Related