Here is a section of an HTML page that I'm interested in extracting data from.
<script id="P4EPconfig" type="text/javascript">
REA = REA || {};
REA.propertyId = "2879292";
REA.state = "Vic";
REA.suburb = "Gladstone Park";
REA.channel = "property";
REA.suburbForAds = "gladstonepark";
REA.rawSuburb = "gladstone park";
REA.postcode = "3043";
REA.fullSuburb = "Gladstone Park, Vic 3043";
REA.marketFlags = [];
REA.buildingType = "house";
REA.longStreetAddress = "1 Adam Court";
REA.longStreetAddressWithSuburb = "1 Adam Court, Gladstone Park, Vic 3043";
REA.lat = "-37.687113";
REA.lon = "144.899982";
REA.allImages = [{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/2ecfc132e00792c90e1a5eb569d249672e0e9d9ec60b364d1f482fc2477b66b6/main.jpg","caption":"Listed November 2010"},{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/ad424079d8c83822637f96cab86994ef0bb6fe6abb4b5a442e48b0df5e41c69b/image2.jpg","caption":"Listed November 2010"},{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/539aa0167f2d1e04a4ab47cddc90f6cf2aa007c5bc0dbc925487742966596db7/image3.jpg","caption":"Listed November 2010"},{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/dc017c3e0e9d28471cdc0586d3e1b01e5860fee6863847f26a832775b5751f35/image4.jpg","caption":"Listed November 2010"},{"name":"photo","server":"https://i2.au.reastatic.net","uri":"/35d03effe995ca4ca78b4846b137dc6d85259fc41d0f70b6acc5fd38a4714d09/image5.jpg","caption":"Listed November 2010"}];
REA.propertyType = "house";
REA.pluralPropertyType = "houses";
REA.bedrooms = "3";
REA.bathrooms = "2";
REA.carSpaces = "2";
REA.yearBuilt = "";
REA.landArea = "551";
REA.landAreaDisplay = "551 m²";
REA.floorArea = "";
REA.floorAreaDisplay = "-";
REA.rawBedrooms = "3";
REA.rawBathrooms = "2";
REA.offMarket = true;
REA.avmData = {"confidence":"high","range":{"text":"$710,000 - $850,000","min":710000,"max":850000},"value":"NzgwOTQ0","lastUpdated":"30 May, 2022"};
REA.powerProfile = null;
REA.propertyListing = null;
REA.findAgentsURI ="https://www.realestate.com.au/find-agent/gladstone-park-vic-3043";
REA.propertyMarketTrends = {"propertyType":"house","bedrooms":"3","medianSoldPrice":733000.0,"medianRentalPrice":400.0,"annualGrowth":0.057,"soldProperties":88,"rentalProperties":68,"soldDataIngestDateDisplay":"3 June 2022","rentDataIngestDateDisplay":"5 June 2022","trends":{"medianSoldPrice":[{"bedrooms":"ALL","yearly":[{"intervalStart":"2013-01-01","intervalEnd":"2013-12-31","value":392500.0,"count":99},{"intervalStart":"2014-01-01","intervalEnd":"2014-12-31","value":400000.0,"count":127},{"intervalStart":"2015-01-01","intervalEnd":"2015-12-31","value":460000.0,"count":142},{"intervalStart":"2016-01-01","intervalEnd":"2016-12-31","value":525000.0,"count":119},{"intervalStart":"2017-01-01","intervalEnd":"2017-12-31","value":645000.0,"count":105},{"intervalStart":"2018-01-01","intervalEnd":"2018-12-31","value":630000.0,"count":104},{"intervalStart":"2019-01-01","intervalEnd":"2019-12-31","value":640000.0,"count":93},{"intervalStart":"2020-01-01","intervalEnd":"2020-12-31","value":640000.0,"count":85},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":725000.0,"count":144}],"monthly":[{"intervalStart":"2020-07-01","intervalEnd":"2021-06-30","value":692500.0,"count":118},{"intervalStart":"2020-08-01","intervalEnd":"2021-07-31","value":700000.0,"count":119},{"intervalStart":"2020-09-01","intervalEnd":"2021-08-31","value":700500.0,"count":120},{"intervalStart":"2020-10-01","intervalEnd":"2021-09-30","value":705000.0,"count":127},{"intervalStart":"2020-11-01","intervalEnd":"2021-10-31","value":713000.0,"count":130},{"intervalStart":"2020-12-01","intervalEnd":"2021-11-30","value":720000.0,"count":139},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":725000.0,"count":144},{"intervalStart":"2021-02-01","intervalEnd":"2022-01-31","value":726000.0,"count":143},{"intervalStart":"2021-03-01","intervalEnd":"2022-02-28","value":732000.0,"count":143},{"intervalStart":"2021-04-01","intervalEnd":"2022-03-31","value":736000.0,"count":138},{"intervalStart
":"2021-05-01","intervalEnd":"2022-04-30","value":740000.0,"count":139},{"intervalStart":"2021-06-01","intervalEnd":"2022-05-31","value":745000.0,"count":119}]},{"bedrooms":"3","yearly":[{"intervalStart":"2013-01-01","intervalEnd":"2013-12-31","value":385750.0,"count":74},{"intervalStart":"2014-01-01","intervalEnd":"2014-12-31","value":398000.0,"count":106},{"intervalStart":"2015-01-01","intervalEnd":"2015-12-31","value":455000.0,"count":109},{"intervalStart":"2016-01-01","intervalEnd":"2016-12-31","value":520000.0,"count":93},{"intervalStart":"2017-01-01","intervalEnd":"2017-12-31","value":635000.0,"count":84},{"intervalStart":"2018-01-01","intervalEnd":"2018-12-31","value":620000.0,"count":80},{"intervalStart":"2019-01-01","intervalEnd":"2019-12-31","value":625000.0,"count":67},{"intervalStart":"2020-01-01","intervalEnd":"2020-12-31","value":631000.0,"count":60},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":714000.0,"count":102}],"monthly":[{"intervalStart":"2020-07-01","intervalEnd":"2021-06-30","value":677000.0,"count":81},{"intervalStart":"2020-08-01","intervalEnd":"2021-07-31","value":680000.0,"count":83},{"intervalStart":"2020-09-01","intervalEnd":"2021-08-31","value":685000.0,"count":85},{"intervalStart":"2020-10-01","intervalEnd":"2021-09-30","value":700000.0,"count":89},{"intervalStart":"2020-11-01","intervalEnd":"2021-10-31","value":705000.0,"count":89},{"intervalStart":"2020-12-01","intervalEnd":"2021-11-30","value":710250.0,"count":98},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":714000.0,"count":102},{"intervalStart":"2021-02-01","intervalEnd":"2022-01-31","value":715000.0,"count":103},{"intervalStart":"2021-03-01","intervalEnd":"2022-02-28","value":720000.0,"count":102},{"intervalStart":"2021-04-01","intervalEnd":"2022-03-31","value":725500.0,"count":98},{"intervalStart":"2021-05-01","intervalEnd":"2022-04-30","value":732000.0,"count":103},{"intervalStart":"2021-06-01","intervalEnd":"2022-05-31","value":733000.0,
"count":88}]}],"medianRentalPrice":[{"bedrooms":"ALL","yearly":[{"intervalStart":"2012-01-01","intervalEnd":"2012-12-31","value":330.0,"count":63},{"intervalStart":"2013-01-01","intervalEnd":"2013-12-31","value":330.0,"count":43},{"intervalStart":"2014-01-01","intervalEnd":"2014-12-31","value":350.0,"count":104},{"intervalStart":"2015-01-01","intervalEnd":"2015-12-31","value":350.0,"count":116},{"intervalStart":"2016-01-01","intervalEnd":"2016-12-31","value":360.0,"count":79},{"intervalStart":"2017-01-01","intervalEnd":"2017-12-31","value":390.5,"count":66},{"intervalStart":"2018-01-01","intervalEnd":"2018-12-31","value":400.0,"count":86},{"intervalStart":"2019-01-01","intervalEnd":"2019-12-31","value":400.0,"count":100},{"intervalStart":"2020-01-01","intervalEnd":"2020-12-31","value":400.5,"count":92},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":410.0,"count":105}],"monthly":[{"intervalStart":"2020-06-01","intervalEnd":"2021-05-31","value":400.0,"count":105},{"intervalStart":"2020-07-01","intervalEnd":"2021-06-30","value":400.0,"count":102},{"intervalStart":"2020-08-01","intervalEnd":"2021-07-31","value":400.0,"count":103},{"intervalStart":"2020-09-01","intervalEnd":"2021-08-31","value":400.5,"count":106},{"intervalStart":"2020-10-01","intervalEnd":"2021-09-30","value":400.0,"count":109},{"intervalStart":"2020-11-01","intervalEnd":"2021-10-31","value":410.0,"count":110},{"intervalStart":"2020-12-01","intervalEnd":"2021-11-30","value":410.0,"count":110},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":410.0,"count":105},{"intervalStart":"2021-02-01","intervalEnd":"2022-01-31","value":410.0,"count":104},{"intervalStart":"2021-03-01","intervalEnd":"2022-02-28","value":410.0,"count":97},{"intervalStart":"2021-04-01","intervalEnd":"2022-03-31","value":410.0,"count":98},{"intervalStart":"2021-05-01","intervalEnd":"2022-04-30","value":415.0,"count":104}]},{"bedrooms":"3","yearly":[{"intervalStart":"2012-01-01","intervalEnd":"2012-12-31"
,"value":330.0,"count":46},{"intervalStart":"2013-01-01","intervalEnd":"2013-12-31","value":320.0,"count":32},{"intervalStart":"2014-01-01","intervalEnd":"2014-12-31","value":337.5,"count":68},{"intervalStart":"2015-01-01","intervalEnd":"2015-12-31","value":340.0,"count":94},{"intervalStart":"2016-01-01","intervalEnd":"2016-12-31","value":360.0,"count":59},{"intervalStart":"2017-01-01","intervalEnd":"2017-12-31","value":380.0,"count":43},{"intervalStart":"2018-01-01","intervalEnd":"2018-12-31","value":387.5,"count":60},{"intervalStart":"2019-01-01","intervalEnd":"2019-12-31","value":400.0,"count":66},{"intervalStart":"2020-01-01","intervalEnd":"2020-12-31","value":400.0,"count":62},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":400.0,"count":72}],"monthly":[{"intervalStart":"2020-06-01","intervalEnd":"2021-05-31","value":393.0,"count":70},{"intervalStart":"2020-07-01","intervalEnd":"2021-06-30","value":395.0,"count":67},{"intervalStart":"2020-08-01","intervalEnd":"2021-07-31","value":397.5,"count":70},{"intervalStart":"2020-09-01","intervalEnd":"2021-08-31","value":397.5,"count":70},{"intervalStart":"2020-10-01","intervalEnd":"2021-09-30","value":395.0,"count":73},{"intervalStart":"2020-11-01","intervalEnd":"2021-10-31","value":400.0,"count":73},{"intervalStart":"2020-12-01","intervalEnd":"2021-11-30","value":400.0,"count":76},{"intervalStart":"2021-01-01","intervalEnd":"2021-12-31","value":400.0,"count":72},{"intervalStart":"2021-02-01","intervalEnd":"2022-01-31","value":397.5,"count":70},{"intervalStart":"2021-03-01","intervalEnd":"2022-02-28","value":400.0,"count":68},{"intervalStart":"2021-04-01","intervalEnd":"2022-03-31","value":400.0,"count":69},{"intervalStart":"2021-05-01","intervalEnd":"2022-04-30","value":400.0,"count":72}]}]}};
REA.leadGen = {"actionUrl":"https://property.value.realestate.com.au","data":{"listingCompany":{"id":"JR…
</script>
What would be a good method for formatting all these variables into a JSON format as shown below:
{
"propertyId": "2879292",
"state": "Vic",
"suburb": "Gladstone Park"
.
.
.
"propertyMarketTrends": {...}
}
I can think of doing it with regular expressions, but that may be a little tedious. Is there an easier way to parse this data structure into JSON?
CodePudding user response:
If html_string contains the data from your question, you can try this example to parse it (but beware, there are many corner cases):
import re
import json
# Extract every `REA.<name> = <value>;` assignment from the script text held in
# `html_string` and collect the decoded values in a dict keyed by <name>.
#
# The pattern is anchored to whole lines (re.MULTILINE) so that only statements
# that start a line with `REA.` and end it with `;` are picked up; text that
# merely contains "REA." somewhere inside another statement is ignored.
# The value group is lazy, but because the match must end at `;` + end-of-line,
# semicolons that occur inside a value on the same line are still kept.
assignment_re = re.compile(
    r"^[ \t]*REA\.(\w+)\s*=\s*(.*?);\s*$",
    re.MULTILINE,
)

d = {}
for key, raw_value in assignment_re.findall(html_string):
    try:
        # The right-hand sides in the sample (strings, arrays, objects, true,
        # null) are valid JSON literals, so json.loads can decode them.
        d[key] = json.loads(raw_value)
    except json.JSONDecodeError:
        # A right-hand side that is a JavaScript expression rather than a JSON
        # literal cannot be decoded; skip it instead of aborting the whole
        # extraction on the first such line.
        continue

# Serialize with json.dumps so the output is real JSON (double quotes,
# true/false/null) rather than the Python dict repr that print(d) would show.
print(json.dumps(d, indent=4))
Prints:
{
"propertyId": "2879292",
"state": "Vic",
"suburb": "Gladstone Park",
"channel": "property",
"suburbForAds": "gladstonepark",
"rawSuburb": "gladstone park",
"postcode": "3043",
"fullSuburb": "Gladstone Park, Vic 3043",
"marketFlags": [],
"buildingType": "house",
"longStreetAddress": "1 Adam Court",
"longStreetAddressWithSuburb": "1 Adam Court, Gladstone Park, Vic 3043",
"lat": "-37.687113",
"lon": "144.899982",
"allImages": [
{
"name": "photo",
"server": "https://i2.au.reastatic.net",
"uri": "/2ecfc132e00792c90e1a5eb569d249672e0e9d9ec60b364d1f482fc2477b66b6/main.jpg",
"caption": "Listed November 2010",
},
{
"name": "photo",
"server": "https://i2.au.reastatic.net",
"uri": "/ad424079d8c83822637f96cab86994ef0bb6fe6abb4b5a442e48b0df5e41c69b/image2.jpg",
"caption": "Listed November 2010",
},
...and so on.