Home > Blockchain >  Multipe entities in JSON to Dataframe
Multipe entities in JSON to Dataframe

Time:12-26

I have multiple JSON files. Each JSON file includes structured data in multiple entities like below:

{
  "block": {
    "type": "block",
    "number": 2000000,
    "hash": "0xc0f4906fea23cf6f3cce98cb44e8e1449e455b28d684dfa9ff65426495584de6",
    "parent_hash": "0x57ebf07eb9ed1137d41447020a25e51d30a0c272b5896571499c82c33ecb7288",
    "nonce": "0x3b05c6d5524209f1",
    "sha3_uncles": "0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347",
    "logs_bloom": "0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
    "transactions_root": "0xb31f174d27b99cdae8e746bd138a01ce60d8dd7b224f7c60845914def05ecc58",
    "state_root": "0x96dbad955b166f5119793815c36f11ffa909859bbfeb64b735cca37cbf10bef1",
    "receipts_root": "0x84aea4a7aad5c5899bd5cfc7f309cc379009d30179316a2a7baa4a2ea4a438ac",
    "miner": "0x61c808d82a3ac53231750dadc13c777b59310bd9",
    "difficulty": 49824742724615,
    "total_difficulty": 44010101827705409388,
    "size": 650,
    "extra_data": "0xe4b883e5bda9e7a59ee4bb99e9b1bc",
    "gas_limit": 4712388,
    "gas_used": 21000,
    "timestamp": 1470173578,
    "transaction_count": 1,
    "base_fee_per_gas": null,
    "item_id": "block_0xc0f4906fea23cf6f3cce98cb44e8e1449e455b28d684dfa9ff65426495584de6",
    "item_timestamp": "2016-08-02T21:32:58Z"
  },
  "transactions": [
    {
      "type": "transaction",
      "hash": "0xc55e2b90168af6972193c1f86fa4d7d7b31a29c156665d15b9cd48618b5177ef",
      "nonce": 126917,
      "transaction_index": 0,
      "from_address": "0x32be343b94f860124dc4fee278fdcbd38c102d88",
      "to_address": "0x104994f45d9d697ca104e5704a7b77d7fec3537c",
      "value": 149990000000000000000,
      "gas": 333333,
      "gas_price": 30000000000,
      "input": "0x",
      "block_timestamp": 1470173578,
      "block_number": 2000000,
      "block_hash": "0xc0f4906fea23cf6f3cce98cb44e8e1449e455b28d684dfa9ff65426495584de6",
      "max_fee_per_gas": null,
      "max_priority_fee_per_gas": null,
      "transaction_type": 0,
      "receipt_cumulative_gas_used": 21000,
      "receipt_gas_used": 21000,
      "receipt_contract_address": null,
      "receipt_root": "0x5a2a9b14d22f5bb89b188d97312f397e94417d01f9a588eddf8924aa51700f40",
      "receipt_status": null,
      "receipt_effective_gas_price": 30000000000,
      "item_id": "transaction_0xc55e2b90168af6972193c1f86fa4d7d7b31a29c156665d15b9cd48618b5177ef",
      "item_timestamp": "2016-08-02T21:32:58Z"
    }
  ],
  "logs": [],
  "token_transfers": [],
  "traces": [
    {
      "type": "trace",
      "transaction_index": 0,
      "from_address": "0x32be343b94f860124dc4fee278fdcbd38c102d88",
      "to_address": "0x104994f45d9d697ca104e5704a7b77d7fec3537c",
      "value": 149990000000000000000,
      "input": "0x",
      "output": "0x",
      "trace_type": "call",
      "call_type": "call",
      "reward_type": null,
      "gas": 312333,
      "gas_used": 0,
      "subtraces": 0,
      "trace_address": [],
      "error": null,
      "status": 1,
      "transaction_hash": "0xc55e2b90168af6972193c1f86fa4d7d7b31a29c156665d15b9cd48618b5177ef",
      "block_number": 2000000,
      "trace_id": "call_0xc55e2b90168af6972193c1f86fa4d7d7b31a29c156665d15b9cd48618b5177ef_",
      "trace_index": 0,
      "block_timestamp": 1470173578,
      "block_hash": "0xc0f4906fea23cf6f3cce98cb44e8e1449e455b28d684dfa9ff65426495584de6",
      "item_id": "trace_call_0xc55e2b90168af6972193c1f86fa4d7d7b31a29c156665d15b9cd48618b5177ef_",
      "item_timestamp": "2016-08-02T21:32:58Z"
    },
    {
      "type": "trace",
      "transaction_index": null,
      "from_address": null,
      "to_address": "0x61c808d82a3ac53231750dadc13c777b59310bd9",
      "value": 5000000000000000000,
      "input": null,
      "output": null,
      "trace_type": "reward",
      "call_type": null,
      "reward_type": "block",
      "gas": null,
      "gas_used": null,
      "subtraces": 0,
      "trace_address": [],
      "error": null,
      "status": 1,
      "transaction_hash": null,
      "block_number": 2000000,
      "trace_id": "reward_2000000_0",
      "trace_index": 1,
      "block_timestamp": 1470173578,
      "block_hash": "0xc0f4906fea23cf6f3cce98cb44e8e1449e455b28d684dfa9ff65426495584de6",
      "item_id": "trace_reward_2000000_0",
      "item_timestamp": "2016-08-02T21:32:58Z"
    }
  ]
}

I would like to transform these JSON files into dataframes by the top keys. So each dataframe would correspond to block, transactions, logs etc as they are the top entity names.

I am trying to do this with df = pd.read_json("block-data/2000000.json") This gives an error of ValueError: Value is too big!

Any suggestions on how to properly perform this at scale for billions of identical JSON files ?

CodePudding user response:

You get the error ValueError: Value is too big! because for some reason Pandas is trying to convert 149990000000000000000 into a Int64. But your additional problem is that it is unclear to Pandas what you want your final dataframe(s) to look like, as your keys contain a combination of objects and lists.

To have tighter control of your final result, you could first read in the file as a dict and then use Pandas to make the dataframes you are looking for. For example:

import pandas as pd
import json

with open('file.json', 'r') as myfile:
    data=myfile.read()

obj = json.loads(data)

df_block = pd.DataFrame([obj['block']])
df_traces = pd.DataFrame(obj['traces'])
df_transactions = pd.DataFrame(obj['transactions'])
  • Related