I'm trying to convert a large JSON file (4.35 GB) to CSV.
My initial approach was importing it, converting it to a data frame (I only need what's in features), doing some data manipulation, and exporting it to CSV.
import json
import pandas as pd

with open('Risk_of_Flooding_from_Rivers_and_Sea.json') as data_file:
    d = json.load(data_file)

# Grabbing the data in 'features'.
json_df = pd.json_normalize(d, 'features')
df = pd.DataFrame(json_df)
I've been successful at doing this with small samples of the whole dataset, but I'm unable to import the whole thing at once, even after leaving it running for 9 hours. My PC has 16 GB of RAM, and although no errors are raised, I'm assuming it's a memory issue.
Here's a small sample of the JSON data I'm using:
{
    "type": "FeatureCollection",
    "crs": {
        "type": "name",
        "properties": {
            "name": "EPSG:27700"
        }
    },
    "features": [
        {
            "type": "Feature",
            "id": 1,
            "geometry": {
                "type": "Polygon",
                "coordinates": [
                    [
                        [
                            289344.50009999985,
                            60397.26009999961
                        ],
                        [
                            289347.2400000002,
                            60400
                        ]
                    ]
                ]
            },
            "properties": {
                "OBJECTID": 1,
                "prob_4band": "Low",
                "suitability": "National to County",
                "pub_date": 1522195200000,
                "shape_Length": 112.16436096255808,
                "shape_Area": 353.4856092588217
            }
        },
        {
            "type": "Feature",
            "id": 2,
            "geometry": {
                "type": "Polygon",
                "coordinates": [
                    [
                        [
                            289250,
                            60550
                        ],
                        [
                            289200,
                            60550
                        ]
                    ]
                ]
            },
            "properties": {
                "OBJECTID": 2,
                "prob_4band": "Very Low",
                "suitability": "National to County",
                "pub_date": 1522195200000,
                "shape_Length": 985.6295076665662,
                "shape_Area": 18755.1377842949
            }
        },
I've looked into splitting up the JSON file into smaller chunks, but I've had no success in my attempts. With the code below I'm getting this error:
JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
import json
import os

with open(os.path.join('E:/Jupyter', 'Risk_of_Flooding_from_Rivers_and_Sea.json'), 'r',
          encoding='utf-8') as f1:
    ll = [json.loads(line.strip()) for line in f1.readlines()]
    print(len(ll))
    size_of_the_split = 10000
    total = len(ll) // size_of_the_split
    print(total + 1)
    for i in range(total + 1):
        json.dump(ll[i * size_of_the_split:(i + 1) * size_of_the_split], open(
            "E:/Jupyter/split" + str(i + 1) + ".json", 'w',
            encoding='utf-8'), ensure_ascii=False, indent=True)
I'm just wondering what my options are. Is the way I'm doing it the best way to do this? If it is, what can I change?
I get the smaller samples from this source, but they can't be too large.
CodePudding user response:
For splitting up the data you can use a streaming parser such as ijson, e.g.:
import ijson
import itertools
import json

chunk_size = 10_000
filename = 'Risk_of_Flooding_from_Rivers_and_Sea.json'

with open(filename, mode='rb') as file_in:
    # Stream each element of the top-level 'features' array without
    # loading the whole document into memory.
    features = ijson.items(file_in, 'features.item', use_float=True)
    chunk = list(itertools.islice(features, chunk_size))
    count = 1
    while chunk:
        with open(f'features-split-{count}.json', mode='w', encoding='utf-8') as file_out:
            json.dump(chunk, file_out, ensure_ascii=False, indent=4)
        chunk = list(itertools.islice(features, chunk_size))
        count += 1
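If the end goal is a CSV rather than smaller JSON files, the same streaming idea can skip the intermediate splits entirely. Below is a minimal sketch, not tested against the full 4.35 GB file: it assumes you only need the 'properties' fields shown in the sample (OBJECTID, prob_4band, suitability, pub_date, shape_Length, shape_Area) and writes them to a hypothetical flood_risk.csv with the standard csv module.
import csv
import ijson

filename = 'Risk_of_Flooding_from_Rivers_and_Sea.json'
# Columns assumed from the sample above; adjust to the fields you actually need.
fieldnames = ['OBJECTID', 'prob_4band', 'suitability',
              'pub_date', 'shape_Length', 'shape_Area']

with open(filename, mode='rb') as file_in, \
        open('flood_risk.csv', mode='w', newline='', encoding='utf-8') as file_out:
    writer = csv.DictWriter(file_out, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    # Stream one feature at a time, so memory use stays roughly constant.
    for feature in ijson.items(file_in, 'features.item', use_float=True):
        writer.writerow(feature['properties'])
Because nothing is accumulated in memory, this should work regardless of file size, at the cost of dropping the geometry data.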
CodePudding user response:
Using sources online, pandas df.to_csv did the trick relatively fast with a JSON file of 181 MB. I assume it will do the same with bigger files.
import wizzi_utils as wu  # pip install wizzi_utils
import pandas


def func():
    """
    source https://princekfrancis.medium.com/convert-large-json-file-into-csv-using-python-769d413b8afd
    json file from https://github.com/zemirco/sf-city-lots-json/blob/master/citylots.json
    :return:
    """
    json_path = './citylots.json'
    print('file {}: {}'.format(json_path, wu.file_or_folder_size(json_path)))

    timer_begin = wu.get_timer()
    j = wu.jt.load_json(json_path, ack=False)
    timer_end = wu.get_timer_delta(s_timer=timer_begin, with_ms=False)
    print('json loading time {}'.format(timer_end))

    timer_begin = wu.get_timer()
    df = pandas.json_normalize(j['features'])  # load json into data frame
    timer_end = wu.get_timer_delta(s_timer=timer_begin, with_ms=False)
    print('json_normalize time {}'.format(timer_end))

    csv_output_path = './citylots.csv'
    timer_begin = wu.get_timer()
    df.to_csv(csv_output_path, sep=',', encoding='utf-8')  # save as csv
    timer_end = wu.get_timer_delta(s_timer=timer_begin, with_ms=False)
    print('csv creation time {}'.format(timer_end))
    print('file {}: {}'.format(csv_output_path, wu.file_or_folder_size(csv_output_path)))
    return


def main():
    func()
    return


if __name__ == '__main__':
    # main()
    wu.main_wrapper(
        main_function=main,
        seed=42,
        ipv4=False,
        cuda_off=False,
        torch_v=False,
        tf_v=False,
        cv2_v=True,
        with_pip_list=False,
        with_profiler=False
    )
And the output:
file ./citylots.json: 180.99 MB
json loading time 0:00:05
json_normalize time 0:00:02
csv creation time 0:00:09
file ./citylots.csv: 132.63 MB
Total run time 0:00:18
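If you'd rather not pull in wizzi_utils just for the timing and size reporting, the core of this answer is plain pandas. Here is a rough equivalent of the same load / json_normalize / to_csv flow using only the standard library for timing; the json_to_csv helper name and the timing/size code are stand-ins introduced for illustration, not part of the original answer.
import json
import os
import time

import pandas


def json_to_csv(json_path: str, csv_path: str) -> None:
    # Load the whole document (works for files that fit in memory).
    start = time.perf_counter()
    with open(json_path, encoding='utf-8') as f:
        data = json.load(f)
    print('json loading time {:.1f}s'.format(time.perf_counter() - start))

    # Flatten the 'features' list into one row per feature.
    start = time.perf_counter()
    df = pandas.json_normalize(data['features'])
    print('json_normalize time {:.1f}s'.format(time.perf_counter() - start))

    # Write the CSV and report its size.
    start = time.perf_counter()
    df.to_csv(csv_path, sep=',', encoding='utf-8')
    print('csv creation time {:.1f}s'.format(time.perf_counter() - start))
    print('file {}: {:.2f} MB'.format(csv_path, os.path.getsize(csv_path) / 1024 ** 2))


if __name__ == '__main__':
    json_to_csv('./citylots.json', './citylots.csv')
Note that this still loads the entire JSON into memory, so for a file much larger than your RAM (like the 4.35 GB one in the question) a streaming approach such as the ijson answer above is the safer option.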