Home > Software design >  Extracting JSON from HTML with BeautifulSoup
Extracting JSON from HTML with BeautifulSoup

Time:08-14

I've tried everything I can think of for the past few hours, but I can't extract one specific value from the HTML below. I want to grab the "sessionCartId", but I can't figure out how.

This is what I have tried so far:

# Parse the response and take the third <script type="text/javascript"> tag (index 2).
sessioncartid = BeautifulSoup(response.text, "html.parser").findAll("script", {"type":"text/javascript"})[2]
# NOTE(review): the tag's text is JavaScript source (CDATA comment markers and
# `var ACC = ...` statements precede the object literal), not a JSON document,
# so json.loads() on the whole text raises json.JSONDecodeError.
data = json.loads(sessioncartid.text)
        print(data)

^^ This gives me the correct script tag, but I can't parse it as JSON or get the sessionCartId from it.

<script type="text/javascript">
/*<![CDATA[*/

var ACC = {config: {}};
ACC.config.contextPath = "";
ACC.config.encodedContextPath = "/de/web";
ACC.config.commonResourcePath = "/_ui/20220811221438/responsive/common";
ACC.config.themeResourcePath = "/_ui/20220811221438/responsive/theme-gh";
ACC.config.siteResourcePath = "/_ui/20220811221438/responsive/site-ghstore";
ACC.config.rootPath = "/_ui/20220811221438/responsive";
ACC.config.CSRFToken = "81b0156a-5a78-4969-b52e-e5080473fb83";
ACC.pwdStrengthVeryWeak = 'password.strength.veryweak';
ACC.pwdStrengthWeak = 'password.strength.weak';
ACC.pwdStrengthMedium = 'password.strength.medium';
ACC.pwdStrengthStrong = 'password.strength.strong';
ACC.pwdStrengthVeryStrong = 'password.strength.verystrong';
ACC.pwdStrengthUnsafePwd = 'password.strength.unsafepwd';
ACC.pwdStrengthTooShortPwd = 'password.strength.tooshortpwd';
ACC.pwdStrengthMinCharText = 'password.strength.minchartext';
ACC.accessibilityLoading = 'aria.pickupinstore.loading';
ACC.accessibilityStoresLoaded = 'aria.pickupinstore.storesloaded';
ACC.config.googleApiKey = "";
ACC.config.googleApiVersion = "3.7";


ACC.autocompleteUrl = '/de/web/search/autocompleteSecure';


ACC.config.loginUrl = '/de/web/login';


ACC.config.authenticationStatusUrl = '/de/web/authentication/status';


/*]]>*/

var OCC = 


{

    "token": "1799248c-8de0-4199-b5fe-1d610452010a",
    
    "currentUser": "[email protected]",
"sessionCartGuid": "2323121232323",
"sessionCartId": "121212123435324",
    "sessionLanguageIso": "de",
    "sessionCountryIso": "DE",
    "urlPosCode": "web",
    "isASM": false,
    "intermediaryID": "",
    "isASMCustomerEmulated": false,
    "siteId": "ghstore",
    "OCCBaseUrl": "/ghcommercewebservices/v2/ghstore",
    "availablePointsOfService": "BUD,FRA,DTM,HAM,GRZ,HAJ,SZG,VIE,WEB,BER",
    "primaryPointOfSevice": "WEB",
    "clientChannel": "web-eu"
};
</script>

CodePudding user response:

This is how you can extract that dictionary:

from bs4 import BeautifulSoup
import json
import re

# Sample page fragment: one <script> tag that assigns a JSON-compatible object
# literal to the JavaScript variable `OCC`.
html = '''
<script type="text/javascript">
/*<![CDATA[*/

var ACC = {config: {}};
ACC.config.contextPath = "";
ACC.config.encodedContextPath = "/de/web";
ACC.config.commonResourcePath = "/_ui/20220811221438/responsive/common";
ACC.config.themeResourcePath = "/_ui/20220811221438/responsive/theme-gh";
ACC.config.siteResourcePath = "/_ui/20220811221438/responsive/site-ghstore";
ACC.config.rootPath = "/_ui/20220811221438/responsive";
ACC.config.CSRFToken = "81b0156a-5a78-4969-b52e-e5080473fb83";
ACC.pwdStrengthVeryWeak = 'password.strength.veryweak';
ACC.pwdStrengthWeak = 'password.strength.weak';
ACC.pwdStrengthMedium = 'password.strength.medium';
ACC.pwdStrengthStrong = 'password.strength.strong';
ACC.pwdStrengthVeryStrong = 'password.strength.verystrong';
ACC.pwdStrengthUnsafePwd = 'password.strength.unsafepwd';
ACC.pwdStrengthTooShortPwd = 'password.strength.tooshortpwd';
ACC.pwdStrengthMinCharText = 'password.strength.minchartext';
ACC.accessibilityLoading = 'aria.pickupinstore.loading';
ACC.accessibilityStoresLoaded = 'aria.pickupinstore.storesloaded';
ACC.config.googleApiKey = "";
ACC.config.googleApiVersion = "3.7";


ACC.autocompleteUrl = '/de/web/search/autocompleteSecure';


ACC.config.loginUrl = '/de/web/login';


ACC.config.authenticationStatusUrl = '/de/web/authentication/status';


/*]]>*/

var OCC = 


{

    "token": "1799248c-8de0-4199-b5fe-1d610452010a",
    
    "currentUser": "[email protected]",
"sessionCartGuid": "2323121232323",
"sessionCartId": "121212123435324",
    "sessionLanguageIso": "de",
    "sessionCountryIso": "DE",
    "urlPosCode": "web",
    "isASM": false,
    "intermediaryID": "",
    "isASMCustomerEmulated": false,
    "siteId": "ghstore",
    "OCCBaseUrl": "/ghcommercewebservices/v2/ghstore",
    "availablePointsOfService": "BUD,FRA,DTM,HAM,GRZ,HAJ,SZG,VIE,WEB,BER",
    "primaryPointOfSevice": "WEB",
    "clientChannel": "web-eu"
};
</script>
'''
soup = BeautifulSoup(html, 'html.parser')

# Locate the <script> whose text mentions the OCC payload. find() applies the
# `string` filter; select_one() only evaluates the CSS selector, so it would
# return the first <script> on the page regardless of its content.
info = soup.find('script', string=re.compile('sessionCartGuid'))
if info is None:
    raise ValueError("no <script> tag containing 'sessionCartGuid' was found")

# Keep only what follows the assignment: the object literal plus a trailing ';'.
_, marker, tail = info.string.partition('var OCC =')
if not marker:
    raise ValueError("'var OCC =' assignment not found in the script text")

# raw_decode() parses exactly one JSON value and ignores whatever comes after
# it, so a ';' inside a string value cannot cut the payload short. It does not
# skip leading whitespace, hence the lstrip().
json_obj, _ = json.JSONDecoder().raw_decode(tail.lstrip())

print(json_obj['token'])
print(json_obj['currentUser'])
print(json_obj['sessionCartId'])

Result:

1799248c-8de0-4199-b5fe-1d610452010a
[email protected]
121212123435324

BeautifulSoup docs: https://beautiful-soup-4.readthedocs.io/en/latest/index.html

  • Related