Home > Blockchain >  how to get contents of script (window.__INITIAL_STATE__) in html using python
how to get contents of script (window.__INITIAL_STATE__) in html using python

Time:07-11

I'm crawling data with Python, but I have two problems.

  1. The data mixes escape sequences (they look like bytes) with ordinary strings, for example:

    "https:\u002F\u002Finvest.zum.com\u002Finternal\u002Findex\u002F1"

  2. The data I got cannot be converted to JSON:

import re
import json
import requests

# Download the page and keep its HTML as text.
url = "https://zum.com/"
html_doc = requests.get(url).text
# re.search returns a Match object (or None when nothing matches), not the
# captured text. The lazy group (.*?) also ends at the first ";" after the
# marker, which is not necessarily the end of the JSON object.
data = re.search("window.__INITIAL_STATE__=(.*?);", html_doc)

# TypeError: the JSON object must be str, bytes or bytearray, not Match
# (json.loads is handed the Match itself rather than the captured string)
data = json.loads(data)
<script>window.__INITIAL_STATE__={"address":null,"fetchedCommonResponse":true,"isDarkTheme":false,"headerStore":{"tickers":{"items":[{"type":"DOMESTIC_INDEX","id":1,"name":"코스피","landingUrl":"https:\u002F\u002Finvest.zum.com\u002Finternal\u002Findex\u002F1","price":2350.61,"priceChange":16.34,"rateOfChange":0.7},{"type":"DOMESTIC_INDEX","id":2,"name":"코스피200","landingUrl":"https:\u002F\u002Finvest.zum.com\u002Finternal\u002Findex\u002F2","price":310.15,"priceChange":1.84,"rateOfChange":0.6},{"type":"DOMESTIC_INDEX","id":3,"name":"코스닥","landingUrl":"https:\u002F\u002Finvest.zum.com\u002Finternal\u002Findex\u002F3","price":766.48,"priceChange":8.51,"rateOfChange":1.12},{"type":"OVERSEAS_INDEX","id":2,"name":"나스닥 종합","landingUrl":"https:\u002F\u002Finvest.zum.com\u002Fworld\u002Findex\u002F2","price":11635.31,"priceChange":13.96,"rateOfChange":0.12},{"type":"OVERSEAS_INDEX","id":1,"name":"다우 산업","landingUrl":"https:\u002F\u002Finvest.zum.com\u002Fworld\u002Findex\u002F1","price":31338.15,"priceChange":-46.4,"rateOfChange":-0.15},{"type":"OVERSEAS_INDEX","id":11,"name":"닛케이225","landingUrl":"https:\u002F\u002Finvest.zum.com\u002Fworld\u002Findex\u002F11","price":26598.16,"priceChange":107.63,"rateOfChange":0.41},{"type":"OVERSEAS_INDEX","id":10,"name":"상해종합","landingUrl":"https:\u002F\u002Finvest.zum.com\u002Fworld\u002Findex\u002F10","price":3356.7,"priceChange":-7.7,"rateOfChange":-0.23},{"type":"OVERSEAS_INDEX","id":7,"name":"독일 DAX","landingUrl":"https:\u002F\u002Finvest.zum.com\u002Fworld\u002Findex\u002F7","price":13015.23,"priceChange":172.01,"rateOfChange":1.34},{"type":"COIN","id":290,"name":"비트코인","landingUrl":"https:\u002F\u002Fcoin.zum.com\u002Fprice?cm=more","price":25761000,"priceChange":-881000,"rateOfChange":-3.31}]},"leftDoodle":{"items":[{"idx":88,"doodleType":"doodle_default_type1","target":"_self","title":"ZUM 투자","alt":"투자 일정 
알려줌","url":"https:\u002F\u002Finvest.zum.com\u002Fdomestic","imageUrl":"https:\u002F\u002Fpost3.zumst.com\u002Flegocms\u002F2022\u002F07\u002F05\u002F10\u002F4430a35e167b4ec0bb475b613d3cc251.gif","darkImageUrl":"https:\u002F\u002Fpost3.zumst.com\u002Flegocms\u002F2022\u002F07\u002F05\u002F10\u002F6546d29d8f4040a3914974ebacc4a774.gif","order":1}]},"rightDoodle":{"items":[]},"gnb":{"gnbItems":[{"idx":377,"title":"투자","target":"_self","url":"https:\u002F\u002Finvest.zum.com\u002F","order":1,"emphasis":"COLOR"},{"idx":378,"title":"국내증시","target":"_self","url":"https:\u002F\u002Finvest.zum.com\u002Finternal","order":2,"emphasis":"COLOR_HOT"},{"idx":379,"title":"해외증시","target":"_self","url":"https:\u002F\u002Finvest.zum.com\u002Fworld","order":3,"emphasis":"COLOR"},{"idx":380,"title":"가상화폐","target":"_self","url":"https:\u002F\u002Fcoin.zum.com","order":4,"emphasis":"COLOR"},{"idx":381,"title":"뉴스","target":"_self","url":"https:\u002F\u002Fnews.zum.com","order":5,"emphasis":""},{"idx":382,"title":"연예","target":"_self","url":"https:\u002F\u002Fnews.zum.com\u002Ffront?c=06","order":6,"emphasis":""},{"idx":383,"title":"TV","target":"_self","url":"https:\u002F\u002Ftv.zum.com","order":7,"emphasis":""},{"idx":384,"title":"허브","target":"_self","url":"https:\u002F\u002Fhub.zum.com","order":8,"emphasis":""},{"idx":385,"title":"쇼핑","target":"_self","url":"https:\u002F\u002Fshopping.zum.com","order":9,"emphasis":""}],"gnbLayerItems":[{"idx":558,"title":"미디어","url":"http:\u002F\u002Fzum.com","target":"_blank","order":0,"items":[{"idx":564,"title":"뉴스","url":"https:\u002F\u002Fnews.zum.com","extUrl":"","imageUrl":"","target":"_self","order":0,"items":[]},{"idx":565,"title":"TV 
뉴스","url":"https:\u002F\u002Fnews.zum.com\u002Ftv?cm=news_lnb","extUrl":"","imageUrl":"","target":"_self","order":1,"items":[]},{"idx":566,"title":"스포츠","url":"https:\u002F\u002Fnews.zum.com\u002Ffront?c=05","extUrl":"","imageUrl":"","target":"_self","order":2,"items":[]},{"idx":567,"title":"연예","url":"https:\u002F\u002Fnews.zum.com\u002Ffront?c=06","extUrl":"","imageUrl":"","target":"_self","order":3,"items":[]}]},{"idx":559,"title":"엔터테인먼트","url":"http:\u002F\u002Fzum.com","target":"_blank","order":1,"items":[{"idx":568,"title":"TV 인기클립","url":"https:\u002F\u002Ftv.zum.com\u002Franking?tab=0","extUrl":"","imageUrl":"","target":"_self","order":0,"items":[]},{"idx":569,"title":"자동차 소식","url":"https:\u002F\u002Fauto.zum.com\u002Fnews\u002Fmain","extUrl":"","imageUrl":"","target":"_self","order":1,"items":[]}]},{"idx":560,"title":"문화\u002F생활","url":"http:\u002F\u002Fzum.com","target":"_blank","order":2,"items":[{"idx":570,"title":"라이프","url":"https:\u002F\u002Fhub.zum.com\u002Flife","extUrl":"","imageUrl":"","target":"_self","order":0,"items":[]},{"idx":571,"title":"이글루스 블로그","url":"http:\u002F\u002Fwww.egloos.com\u002F","extUrl":"","imageUrl":"","target":"_self","order":1,"items":[]},{"idx":572,"title":"제주여행","url":"https:\u002F\u002Ftrip.zum.com\u002Fjeju\u002F","extUrl":"","imageUrl":"","target":"_self","order":2,"items":[]},{"idx":573,"title":"쇼핑","url":"https:\u002F\u002Fshopping.zum.com\u002F","extUrl":"","imageUrl":"","target":"_self","order":3,"items":[]}]},{"idx":561,"title":"금융\u002F소비","url":"http:\u002F\u002Fzum.com","target":"_blank","order":3,"items":[{"idx":574,"title":"자동차 견적","url":"https:\u002F\u002Fauto.zum.com\u002Fnewcar\u002Festimate_sheet","extUrl":"","imageUrl":"","target":"_self","order":0,"items":[]},{"idx":575,"title":"가상화폐","url":"https:\u002F\u002Fcoin.zum.com\u002F","extUrl":"","imageUrl":"","target":"_self","order":1,"items":[]},{"idx":576,"title":"경제 
뉴스","url":"https:\u002F\u002Fnews.zum.com\u002Ffront?c=03","extUrl":"","imageUrl":"","target":"_self","order":2,"items":[]}

CodePudding user response:

Try:

import re
import json
import requests

url = "https://zum.com/"

# Finds the start of the assignment. The dots are escaped so they match
# literally, and optional whitespace around "=" is tolerated.
_STATE_ASSIGNMENT = re.compile(r"window\.__INITIAL_STATE__\s*=\s*")


def extract_initial_state(html_doc):
    """Return the object assigned to ``window.__INITIAL_STATE__`` in *html_doc*.

    The JSON value is parsed directly from the position right after the
    ``=`` sign, so the result does not depend on how the object happens to
    end (``}}``, ``]}``, ...), on ``;`` characters inside string values, or
    on line breaks inside the script. JSON escapes such as ``\\u002F`` are
    decoded by the JSON parser (``\\u002F`` becomes ``/``).

    Args:
        html_doc: HTML text of the page.

    Returns:
        The decoded JSON value (a ``dict`` for an object literal).

    Raises:
        ValueError: if the assignment is not present in *html_doc*, or if
            the text following it is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    marker = _STATE_ASSIGNMENT.search(html_doc)
    if marker is None:
        raise ValueError("window.__INITIAL_STATE__ assignment not found in HTML")
    # raw_decode parses exactly one JSON value starting at the given index
    # and ignores whatever JavaScript follows it.
    state, _end = json.JSONDecoder().raw_decode(html_doc, marker.end())
    return state


if __name__ == "__main__":
    # A timeout keeps the script from hanging forever; raise_for_status
    # turns HTTP error pages into an explicit exception instead of a
    # confusing "assignment not found" error later on.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    html_doc = response.text

    data = extract_initial_state(html_doc)
    print(data)

Prints:

{
    "address": None,
    "fetchedCommonResponse": True,
    "isDarkTheme": False,
    "headerStore": {
        "tickers": {
            "items": [
                {
                    "type": "DOMESTIC_INDEX",
                    "id": 1,
                    "name": "코스피",
                    "landingUrl": "https://invest.zum.com/internal/index/1",
                    "price": 2350.61,
                    "priceChange": 16.34,
                    "rateOfChange": 0.7,
                },

...
  • Related