Parsing json that doesn't seem to be properly formatted

I'm trying to do some work with real estate data and, after failing on my own, managed to borrow some code that pulls part of the data. Unfortunately I have no idea how to parse the rest, as the JSON formatting is very confusing to me. This is not my area of expertise, so if anyone has ideas on how to approach this I would greatly appreciate it. If needed I can post the entire JSON, but it's very long (there's a small snippet at the end of this post showing how I've been peeking at its structure).

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import pprint

#-------------------------------------------------------------------------------------------------------------------------#
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'upgrade-insecure-requests': '1'
}

#-------------------------------------------------------------------------------------------------------------------------#
def get_soup(address):
    page_request = requests.get(address, headers=HEADERS)
    return BeautifulSoup(page_request.text, "lxml")

#-------------------------------------------------------------------------------------------------------------------------#
def fetch_content(soup, verbose=False):
    # The page embeds its data as JSON inside a <script id="hdpApolloPreloadedData"> tag.
    item = soup.select_one("script#hdpApolloPreloadedData").text
    # 'apiCache' is itself a JSON-encoded string, so it has to be decoded a second time.
    d = json.loads(item)['apiCache']
    return json.loads(d)

#-------------------------------------------------------------------------------------------------------------------------#
def process_fetched_content(raw_dictionary=None):
    if raw_dictionary is not None:
        # Keep only the cache entries whose key starts with 'VariantQuery{"zpid":'.
        keys = [k for k in raw_dictionary.keys() if k.startswith('VariantQuery{"zpid":')]
        # Map each zpid (pulled out of the key) to that entry's nested 'property' dictionary.
        property_info = dict((k.split(':')[-1].replace('}',''), raw_dictionary.get(k).get('property', None)) for k in keys)
        return property_info
    else:
        return None

#-------------------------------------------------------------------------------------------------------------------------#
if __name__ == "__main__":

    link = 'https://www.zillow.com/homedetails/2408-Comstock-Ct-Naperville-IL-60564/5367006_zpid/'
    soup = get_soup(link)
    results = process_fetched_content(raw_dictionary = fetch_content(soup, verbose=False))
    pprint.pprint(results)

Sidenote: I know Zillow doesn't take kindly to scraping, but I'm not trying to pull data at any large scale, so I'm not too concerned.
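
In case it helps, this is roughly how I've been peeking at the decoded blob without pasting the whole thing (just a quick sketch that reuses get_soup and fetch_content from above; the key names are whatever Zillow happens to return):

import json

link = 'https://www.zillow.com/homedetails/2408-Comstock-Ct-Naperville-IL-60564/5367006_zpid/'
data = fetch_content(get_soup(link))

# Top-level keys of the decoded apiCache -- the 'VariantQuery{"zpid":...' entries are
# the ones process_fetched_content() picks out.
for key in data:
    print(key)

# Pretty-print a truncated slice so the nesting is visible without dumping everything.
print(json.dumps(data, indent=2)[:2000])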

CodePudding user response:

I believe you can (unless I understood the question horribly wrong).

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import pprint

# -------------------------------------------------------------------------------------------------------------------------#
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome 61.0.3163.100 Safari/537.36",
    "Accept": "text/html,application/xhtml xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "upgrade-insecure-requests": "1",
}

# -------------------------------------------------------------------------------------------------------------------------#
def get_soup(address):
    page_request = requests.get(address, headers=HEADERS)
    return BeautifulSoup(page_request.text, "lxml")


# -------------------------------------------------------------------------------------------------------------------------#
def fetch_content(soup, verbose=False):
    item = soup.select_one("script#hdpApolloPreloadedData").text
    d = json.loads(item)["apiCache"]
    return json.loads(d)


# -------------------------------------------------------------------------------------------------------------------------#
def process_fetched_content(raw_dictionary=None):
    if raw_dictionary is not None:
        keys = [
            k for k in raw_dictionary.keys() if k.startswith('VariantQuery{"zpid":')
        ]
        property_info = dict(
            (
                k.split(":")[-1].replace("}", ""),
                raw_dictionary.get(k).get("property", None),
            )
            for k in keys
        )
        return raw_dictionary, property_info
    else:
        return None


# -------------------------------------------------------------------------------------------------------------------------#
if __name__ == "__main__":

    link = "https://www.zillow.com/homedetails/2408-Comstock-Ct-Naperville-IL-60564/5367006_zpid/"
    soup = get_soup(link)
    raw, results = process_fetched_content(raw_dictionary=fetch_content(soup, verbose=False))
    
    # Traverse through results
    for value in results.values():
        for inner_key, inner_value in value.items():
            print(f'{inner_key}: {inner_value}')

    # Traverse through raw dictionary
    
    for key, value in raw.items():
        print(f'{key}:')
        for inner_key, inner_value in value.items():
            print(f'\t{inner_key}:')
            try:
                for inner_2_key, inner_2_value in inner_value.items():
                    print(f'\t\t{inner_2_key}:')
                    try:
                        for inner_3_key, inner_3_value in inner_2_value.items():
                            print(f'\t\t\t{inner_3_key}:')
                            try:
                                for inner_4_value in inner_3_value:
                                    for inner_4_1_key, inner_4_1_value in inner_4_value.items():
                                        print(f'\t\t\t\t{inner_4_1_key}: {inner_4_1_value}')
                            except (AttributeError, TypeError):
                                for inner_4_key, inner_4_value in inner_3_value.items():
                                    print(f'\t\t\t\t{inner_4_key}: {inner_4_value}')        
                    except (AttributeError, TypeError):
                        print(f'\t\t\t{inner_2_value}')
            except (AttributeError, TypeError):
                print(f'\t\t{inner_value}')

Try this. The output is way too long, but it seems readable now...
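
If the stacked try/except blocks get hard to follow, a small recursive walk gives the same kind of indented dump more compactly. This is just a sketch; it assumes the same raw dictionary returned by process_fetched_content above and treats anything that isn't a dict or list as a leaf value:

def walk(node, depth=0):
    # Print nested dicts/lists with one tab per level; scalars are printed as leaves.
    indent = '\t' * depth
    if isinstance(node, dict):
        for key, value in node.items():
            print(f'{indent}{key}:')
            walk(value, depth + 1)
    elif isinstance(node, list):
        for item in node:
            walk(item, depth + 1)
    else:
        print(f'{indent}{node}')

# Same entry point as the nested loops above.
walk(raw)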
