Home > OS >  Extract html script variables into JSON
Extract html script variables into JSON

Time:06-11

Here is a section of an HTML page that I want to extract data from.

<script id="P4EPconfig" type="text/javascript">
  REA = REA || {};

    REA.propertyId = "2879292";
    REA.state = "Vic";
    REA.suburb = "Gladstone Park";
    REA.channel = "property";
    REA.suburbForAds = "gladstonepark";
    REA.rawSuburb = "gladstone park";
    REA.postcode = "3043";
    REA.fullSuburb = "Gladstone Park, Vic 3043";
      REA.marketFlags = [];
    REA.buildingType = "house";
    REA.longStreetAddress = "1 Adam Court";
    REA.longStreetAddressWithSuburb = "1 Adam Court, Gladstone Park, Vic 3043";
    REA.lat = "-37.687113";
    REA.lon = "144.899982";
    REA.allImages = [{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/2ecfc132e00792c90e1a5eb569d249672e0e9d9ec60b364d1f482fc2477b66b6/main.jpg","caption":"Listed November 2010"},{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/ad424079d8c83822637f96cab86994ef0bb6fe6abb4b5a442e48b0df5e41c69b/image2.jpg","caption":"Listed November 2010"},{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/539aa0167f2d1e04a4ab47cddc90f6cf2aa007c5bc0dbc925487742966596db7/image3.jpg","caption":"Listed November 2010"},{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/dc017c3e0e9d28471cdc0586d3e1b01e5860fee6863847f26a832775b5751f35/image4.jpg","caption":"Listed November 2010"},{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/35d03effe995ca4ca78b4846b137dc6d85259fc41d0f70b6acc5fd38a4714d09/image5.jpg","caption":"Listed November 2010"}];
    REA.propertyType = "house";
    REA.pluralPropertyType = "houses";
    REA.bedrooms = "3";
    REA.bathrooms = "2";
    REA.carSpaces = "2";
    REA.yearBuilt = "";
    REA.landArea = "551";
    REA.landAreaDisplay = "551 m²";
    REA.floorArea = "";
    REA.floorAreaDisplay = "-";
    REA.rawBedrooms = "3";
    REA.rawBathrooms = "2";
    REA.offMarket = true;
    REA.avmData = {"confidence":"high","range":{"text":"$710,000 - $850,000","min":710000,"max":850000},"value":"NzgwOTQ0","lastUpdated":"30 May, 2022"};
    REA.powerProfile = null;
    REA.propertyListing = null;
    REA.findAgentsURI ="https://www.realestate.com.au/find-agent/gladstone-park-vic-3043";
    REA.propertyMarketTrends = {"propertyType":"house","bedrooms":"3","medianSoldPrice":733000.0,"medianRentalPrice":400.0,"annualGrowth":0.057,"soldProperties":88,"rentalProperties":68,"soldDataIngestDateDisplay":"3 June 2022","rentDataIngestDateDisplay":"5 June 2022","trends":{"medianSoldPrice":[{"bedrooms":"ALL","yearly":[{"intervalStart":"2013-01-01","intervalEnd":"2013-12-31","value":392500.0,"count":99},{"intervalStart":"2014-01-01","intervalEnd":"2014-12-31","value":400000.0,"count":127},{"intervalStart":"2015-01-01","intervalEnd":"2015-12-31","value":460000.0,"count":142},{"intervalStart":"2016-01-01","intervalEnd":"2016-12-31","value":525000.0,"count":119},{"intervalStart":"2017-01-01","intervalEnd":"2017-12-31","value":645000.0,"count":105},{"intervalStart":"2018-01-01","intervalEnd":"2018-12-31","value":630000.0,"count":104},{"intervalStart":"2019-01-01","intervalEnd":"2019-12-31","value":640000.0,"count":93},{"intervalStart":"2020-01-01","intervalEnd":"2020-12-31","value":640000.0,"count":85},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":725000.0,"count":144}],"monthly":[{"intervalStart":"2020-07-01","intervalEnd":"2021-06-30","value":692500.0,"count":118},{"intervalStart":"2020-08-01","intervalEnd":"2021-07-31","value":700000.0,"count":119},{"intervalStart":"2020-09-01","intervalEnd":"2021-08-31","value":700500.0,"count":120},{"intervalStart":"2020-10-01","intervalEnd":"2021-09-30","value":705000.0,"count":127},{"intervalStart":"2020-11-01","intervalEnd":"2021-10-31","value":713000.0,"count":130},{"intervalStart":"2020-12-01","intervalEnd":"2021-11-30","value":720000.0,"count":139},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":725000.0,"count":144},{"intervalStart":"2021-02-01","intervalEnd":"2022-01-31","value":726000.0,"count":143},{"intervalStart":"2021-03-01","intervalEnd":"2022-02-28","value":732000.0,"count":143},{"intervalStart":"2021-04-01","intervalEnd":"2022-03-31","value":736000.0,"count":138},{"intervalS
tart":"2021-05-01","intervalEnd":"2022-04-30","value":740000.0,"count":139},{"intervalStart":"2021-06-01","intervalEnd":"2022-05-31","value":745000.0,"count":119}]},{"bedrooms":"3","yearly":[{"intervalStart":"2013-01-01","intervalEnd":"2013-12-31","value":385750.0,"count":74},{"intervalStart":"2014-01-01","intervalEnd":"2014-12-31","value":398000.0,"count":106},{"intervalStart":"2015-01-01","intervalEnd":"2015-12-31","value":455000.0,"count":109},{"intervalStart":"2016-01-01","intervalEnd":"2016-12-31","value":520000.0,"count":93},{"intervalStart":"2017-01-01","intervalEnd":"2017-12-31","value":635000.0,"count":84},{"intervalStart":"2018-01-01","intervalEnd":"2018-12-31","value":620000.0,"count":80},{"intervalStart":"2019-01-01","intervalEnd":"2019-12-31","value":625000.0,"count":67},{"intervalStart":"2020-01-01","intervalEnd":"2020-12-31","value":631000.0,"count":60},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":714000.0,"count":102}],"monthly":[{"intervalStart":"2020-07-01","intervalEnd":"2021-06-30","value":677000.0,"count":81},{"intervalStart":"2020-08-01","intervalEnd":"2021-07-31","value":680000.0,"count":83},{"intervalStart":"2020-09-01","intervalEnd":"2021-08-31","value":685000.0,"count":85},{"intervalStart":"2020-10-01","intervalEnd":"2021-09-30","value":700000.0,"count":89},{"intervalStart":"2020-11-01","intervalEnd":"2021-10-31","value":705000.0,"count":89},{"intervalStart":"2020-12-01","intervalEnd":"2021-11-30","value":710250.0,"count":98},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":714000.0,"count":102},{"intervalStart":"2021-02-01","intervalEnd":"2022-01-31","value":715000.0,"count":103},{"intervalStart":"2021-03-01","intervalEnd":"2022-02-28","value":720000.0,"count":102},{"intervalStart":"2021-04-01","intervalEnd":"2022-03-31","value":725500.0,"count":98},{"intervalStart":"2021-05-01","intervalEnd":"2022-04-30","value":732000.0,"count":103},{"intervalStart":"2021-06-01","intervalEnd":"2022-05-31","value":73300
0.0,"count":88}]}],"medianRentalPrice":[{"bedrooms":"ALL","yearly":[{"intervalStart":"2012-01-01","intervalEnd":"2012-12-31","value":330.0,"count":63},{"intervalStart":"2013-01-01","intervalEnd":"2013-12-31","value":330.0,"count":43},{"intervalStart":"2014-01-01","intervalEnd":"2014-12-31","value":350.0,"count":104},{"intervalStart":"2015-01-01","intervalEnd":"2015-12-31","value":350.0,"count":116},{"intervalStart":"2016-01-01","intervalEnd":"2016-12-31","value":360.0,"count":79},{"intervalStart":"2017-01-01","intervalEnd":"2017-12-31","value":390.5,"count":66},{"intervalStart":"2018-01-01","intervalEnd":"2018-12-31","value":400.0,"count":86},{"intervalStart":"2019-01-01","intervalEnd":"2019-12-31","value":400.0,"count":100},{"intervalStart":"2020-01-01","intervalEnd":"2020-12-31","value":400.5,"count":92},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":410.0,"count":105}],"monthly":[{"intervalStart":"2020-06-01","intervalEnd":"2021-05-31","value":400.0,"count":105},{"intervalStart":"2020-07-01","intervalEnd":"2021-06-30","value":400.0,"count":102},{"intervalStart":"2020-08-01","intervalEnd":"2021-07-31","value":400.0,"count":103},{"intervalStart":"2020-09-01","intervalEnd":"2021-08-31","value":400.5,"count":106},{"intervalStart":"2020-10-01","intervalEnd":"2021-09-30","value":400.0,"count":109},{"intervalStart":"2020-11-01","intervalEnd":"2021-10-31","value":410.0,"count":110},{"intervalStart":"2020-12-01","intervalEnd":"2021-11-30","value":410.0,"count":110},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":410.0,"count":105},{"intervalStart":"2021-02-01","intervalEnd":"2022-01-31","value":410.0,"count":104},{"intervalStart":"2021-03-01","intervalEnd":"2022-02-28","value":410.0,"count":97},{"intervalStart":"2021-04-01","intervalEnd":"2022-03-31","value":410.0,"count":98},{"intervalStart":"2021-05-01","intervalEnd":"2022-04-30","value":415.0,"count":104}]},{"bedrooms":"3","yearly":[{"intervalStart":"2012-01-01","intervalEnd":"2012-12
-31","value":330.0,"count":46},{"intervalStart":"2013-01-01","intervalEnd":"2013-12-31","value":320.0,"count":32},{"intervalStart":"2014-01-01","intervalEnd":"2014-12-31","value":337.5,"count":68},{"intervalStart":"2015-01-01","intervalEnd":"2015-12-31","value":340.0,"count":94},{"intervalStart":"2016-01-01","intervalEnd":"2016-12-31","value":360.0,"count":59},{"intervalStart":"2017-01-01","intervalEnd":"2017-12-31","value":380.0,"count":43},{"intervalStart":"2018-01-01","intervalEnd":"2018-12-31","value":387.5,"count":60},{"intervalStart":"2019-01-01","intervalEnd":"2019-12-31","value":400.0,"count":66},{"intervalStart":"2020-01-01","intervalEnd":"2020-12-31","value":400.0,"count":62},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":400.0,"count":72}],"monthly":[{"intervalStart":"2020-06-01","intervalEnd":"2021-05-31","value":393.0,"count":70},{"intervalStart":"2020-07-01","intervalEnd":"2021-06-30","value":395.0,"count":67},{"intervalStart":"2020-08-01","intervalEnd":"2021-07-31","value":397.5,"count":70},{"intervalStart":"2020-09-01","intervalEnd":"2021-08-31","value":397.5,"count":70},{"intervalStart":"2020-10-01","intervalEnd":"2021-09-30","value":395.0,"count":73},{"intervalStart":"2020-11-01","intervalEnd":"2021-10-31","value":400.0,"count":73},{"intervalStart":"2020-12-01","intervalEnd":"2021-11-30","value":400.0,"count":76},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":400.0,"count":72},{"intervalStart":"2021-02-01","intervalEnd":"2022-01-31","value":397.5,"count":70},{"intervalStart":"2021-03-01","intervalEnd":"2022-02-28","value":400.0,"count":68},{"intervalStart":"2021-04-01","intervalEnd":"2022-03-31","value":400.0,"count":69},{"intervalStart":"2021-05-01","intervalEnd":"2022-04-30","value":400.0,"count":72}]}]}};
    REA.leadGen = {"actionUrl":"https://property.value.realestate.com.au","data":{"listingCompany":{"id":"JR…
</script>

What would be a good method for formatting all these variables into a JSON format as shown below:

{
  "propertyId": "2879292",
  "state": "Vic",
  "suburb": "Gladstone Park"
  .
  .
  .
  "propertyMarketTrends": {...}
}

I could do it with regular expressions, but that may be a little tedious. Is there an easier way of parsing this data structure into JSON?

CodePudding user response:

If html_string contains the data from your question you can try this example to parse it (but beware, there are many corner cases):

import re
import json

# Build a dict from every `REA.<name> = <value>` assignment in html_string.
#
# The regex only locates the left-hand side; the value itself is read by
# JSONDecoder.raw_decode, which decodes exactly one JSON value starting at the
# given offset and ignores whatever follows it. Compared with capturing the
# value with `(.*);` this:
#   * handles values that span several lines (pretty-printed objects/arrays),
#   * is not confused by trailing code after the value on the same line,
#   * does not require the statement to end with `;`.
decoder = json.JSONDecoder()

d = {}
for assignment in re.finditer(r"REA\.([\w.]+)\s*=\s*", html_string):
    try:
        # m.end() is the first non-whitespace character after `=`;
        # raw_decode does not skip leading whitespace itself.
        value, _ = decoder.raw_decode(html_string, assignment.end())
    except json.JSONDecodeError:
        # The right-hand side is not a JSON literal (for example a JavaScript
        # expression or a truncated value); skip it instead of aborting the
        # whole extraction.
        continue
    # A later assignment to the same name overwrites an earlier one.
    d[assignment.group(1)] = value

print(d)

Prints:

{
    "propertyId": "2879292",
    "state": "Vic",
    "suburb": "Gladstone Park",
    "channel": "property",
    "suburbForAds": "gladstonepark",
    "rawSuburb": "gladstone park",
    "postcode": "3043",
    "fullSuburb": "Gladstone Park, Vic 3043",
    "marketFlags": [],
    "buildingType": "house",
    "longStreetAddress": "1 Adam Court",
    "longStreetAddressWithSuburb": "1 Adam Court, Gladstone Park, Vic 3043",
    "lat": "-37.687113",
    "lon": "144.899982",
    "allImages": [
        {
            "name": "photo",
            "server": "https://i2.au.reastatic.net",
            "uri": "/2ecfc132e00792c90e1a5eb569d249672e0e9d9ec60b364d1f482fc2477b66b6/main.jpg",
            "caption": "Listed November 2010",
        },
        {
            "name": "photo",
            "server": "https://i2.au.reastatic.net",
            "uri": "/ad424079d8c83822637f96cab86994ef0bb6fe6abb4b5a442e48b0df5e41c69b/image2.jpg",
            "caption": "Listed November 2010",
        },

...and so on.
  • Related