I have been working on a project that involves parsing a CSV file in order to turn all the data into a very specifically formatted JSON following a complex schema. I have to custom make this program as the required complexity of the JSON makes existing converters fail. I am mostly there, I have run into one final roadblock though:
I have nested dictionaries, and occasionally there must be a list within those, this list will contain further dictionaries. This is fine, I have been able to complete that, BUT now I need to find a way to add more nested dictionaries within those. Below is a simplified breakdown of the concept.
the CSV will look something like this, where the @ before a tag indicates it's a list
x.a, x.b.z, x.b.y, [email protected], [email protected], [email protected], x.d, x.e.z, x.e.y
ab, cd, ef, gh, ij, kl, mn, op, qr
this should result in the following JSON
{
"x": {
"a": "ab",
"b": {
"z": "cd",
"y": "ef"
},
"c": [
{
"z": {
"nest1": "gh",
"nest2": "ij"
}
},
{
"yy": "kl"
}
],
"d": "mn",
"e": {
"z": "op",
"y": "qr"
}
}
}
This is one issue that I haven't been able to solve, my current code can only do one dictionary after the list item, not further. I also need to be able to somehow do the following within a list of dictionaries:
"c": [
{
"z": {
"nest1": "gh"
},
"zz": {
"nest2": "ij"
}
},
{
"yy": "kl"
}
i.e. somehow add multiple nested dictionaries within the dictionary in the list. The problem with this occurs within the fact that these aren't reference-able by name, so I do not know how I could potentially indicate to do that within the CSV format.
Here is the code I have that works up to the first dictionary nested within a list:
import json
import pandas as pd
from os.path import exists
# df1 = pd.read_csv("excelTestFacilities.csv", header = 1, sep=",", keep_default_na=False, engine="python")
# df2 = pd.read_csv("excelTestFacilityContacts.csv", header = 1, sep=",", keep_default_na=False, engine="python")
# df = pd.merge(df1, df2, how = 'inner')
df = pd.read_csv("csvTestFile.csv", header = 1, sep=", ", keep_default_na=False, engine="python")
#print(df) # uncomment to see the transformation
json_data = df.to_dict(orient="records")
#print(json_data)
def unflatten_dic(dic):
"""
Unflattens a CSV list into a set of nested dictionaries
"""
ini = {}
for k,v in list(dic.items()):
node = ini
list_bool = False
*parents, key = k.split('.')
for parent in parents:
if parent[0] == '@':
list_bool = True
if list_bool:
for parent in parents:
if parent[0] == '@':
node[parent[1:]] = node = node.get(parent[1:], [])
else:
node[parent] = node = node.get(parent, {})
node.append({key : v})
else:
for parent in parents:
node[parent] = node = node.get(parent, {})
node[key] = v
return ini
def merge_lists(dic):
"""
Removes duplicates within sets
"""
for k,v in list(dic.items()):
if isinstance(v, dict):
keys = list(v.keys())
vals = list(v.values())
if all(isinstance(l, list) and len(l)==len(vals[0]) for l in vals):
dic[k] = []
val_tuple = set(zip(*vals)) # removing duplicates with set()
for t in val_tuple:
dic[k].append({subkey: t[i] for i, subkey in enumerate(keys)})
else:
merge_lists(v)
elif isinstance(v, list):
dic[k] = list(set(v)) # removing list duplicates
def clean_blanks(value):
"""
Recursively remove all None values from dictionaries and lists, and returns
the result as a new dictionary or list.
"""
if isinstance(value, list):
return [clean_blanks(x) for x in value if x != ""]
elif isinstance(value, dict):
return {
key: clean_blanks(val)
for key, val in value.items()
if val != "" and val != {}
}
else:
return value
def add_to_dict(section_added_to, section_to_add, value, reportNum):
"""
Adds a value to a given spot within a dictionary set.
section_added_to is optional for adding the set to a deeper section such as facility
section_to_add is the name that the new dictionary entry will have
value is the item to be added
reportNum is the number indicating which report to add to, starting at 0
"""
if section_added_to != '':
end_list[reportNum][section_added_to][section_to_add] = value
else:
end_list[reportNum][section_to_add] = value
def read_add_vals(filename_prefix, added_to, section):
for i in range(len(end_list)):
temp_list = []
filename = filename_prefix str(i 1) ".csv"
if not exists(filename):
continue;
temp_df = pd.read_csv(filename, header = 1, sep=",", keep_default_na=False, engine="python")
temp_json = temp_df.to_dict(orient="records")
for y in temp_json:
return_ini = unflatten_dic(y)
temp_list.append(return_ini)
add_to_dict(added_to, section, temp_list, i)
global end_list
end_list = []
for x in json_data:
return_ini = unflatten_dic(x)
end_list.append(return_ini)
#read_add_vals('excelTestPermitsFac', 'facility', 'permits');
json_data = clean_blanks(end_list)
final_json = {"year":2021, "version":"2022-02-14", "reports":json_data}
print(json.dumps(final_json, indent=4))
There is some parts of this code that are involved in other components of the overall end JSON, but I am mainly concerned with how to change unflatten_dic() Here is my current working code for changing unflatten_dic(), even though it doesn't work...
def list_get(list, list_item):
i = 0
for dict in list:
if list_item in dict:
return dict.get(list_item, {})
i = 1
return {}
def check_in_list(list, list_item):
i = 0
for dict in list:
if list_item in dict:
return i
i = 1
return -1
def unflatten_dic(dic):
"""
Unflattens a CSV list into a set of nested dictionaries
"""
ini = {}
for k,v in list(dic.items()):
node = ini
list_bool = False
*parents, key = k.split('.')
for parent in parents:
if parent[0] == '@':
list_bool = True
previous_node_list = False
if list_bool:
for parent in parents:
print(parent)
if parent[0] == '@':
node[parent[1:]] = node = node.get(parent[1:], [])
ends_with_dict = False
previous_node_list = True
else:
print("else")
if previous_node_list:
print("prev list")
i = check_in_list(node, parent)
if i >= 0:
node[i] = node = list_get(node, parent)
else:
node.append({parent : {}})
previous_node_list = False
ends_with_dict = True
else:
print("not prev list")
node[parent] = node = node.get(parent, {})
previous_node_list = False
if ends_with_dict:
node[key] = v
else:
node.append({key : v})
else:
for parent in parents:
node[parent] = node = node.get(parent, {})
node[key] = v
#print(node)
return ini
Any, even small, amount of help would be greatly appreciated.
CodePudding user response:
It is easiest to use recursion and collections.defaultdict
to group child entries on their parents (each entry is separated by the .
in the csv data):
from collections import defaultdict
def to_dict(vals, is_list = 0):
def form_child(a, b):
return b[0][0] if len(b[0]) == 1 else to_dict(b, a[0] == '@')
d = defaultdict(list)
for a, *b in vals:
d[a].append(b)
if not is_list:
return {a[a[0] == '@':]:form_child(a, b) for a, b in d.items()}
return [{a[a[0] == '@':]:form_child(a, b)} for a, b in d.items()]
import csv, json
with open('filename.csv') as f:
data = list(csv.reader(f))
r = [a.split('.') [b] for i in range(0, len(data), 2) for a, b in zip(data[i], data[i 1])]
print(json.dumps(to_dict(r), indent=4))
Output:
{
"x": {
"a": "ab",
"b": {
"z": "cd",
"y": "ef"
},
"c": [
{
"z": {
"nest1": "gh",
"nest2": "ij"
}
},
{
"yy": "kl"
}
],
"d": "mn",
"e": {
"z": "op",
"y": "qr"
}
}
}
CodePudding user response:
I managed to get it working in what seems to be all scenarios. Here is the code that I made for the unflatten_dic() function.
def unflatten_dic(dic):
"""
Unflattens a CSV list into a set of nested dictionaries
"""
ini = {}
for k,v in list(dic.items()):
node = ini
list_bool = False
*parents, key = k.split('.')
# print("parents")
# print(parents)
for parent in parents:
if parent[0] == '@':
list_bool = True
if list_bool:
for parent in parents:
if parent[0] == '@':
node[parent[1:]] = node = node.get(parent[1:], [])
elif parent.isnumeric():
# print("numeric parent")
# print("length of node")
# print(len(node))
if len(node) > int(parent):
# print("node length good")
node = node[int(parent)]
else:
node.append({})
node = node[int(parent)]
else:
node[parent] = node = node.get(parent, {})
try:
node.append({key : v})
except AttributeError:
node[key] = v
else:
for parent in parents:
node[parent] = node = node.get(parent, {})
node[key] = v
return ini
I haven't run into an issue thus far, this is based on the following rules for the CSV:
@ before any name results in that item being a list
if the section immediately after a list in the CSV is a number, that will create multiple dictionaries within the list. Here is an example
x.a, x.b.z, x.b.y, [email protected], [email protected], [email protected], x.d, x.e.z, x.e.y, [email protected], [email protected][email protected], [email protected][email protected]
ab, cd, ef, gh, , kl, mn, op, qr, st, uv, wx
12, 34, 56, 78, 90, 09, , 65, 43, 21, , 92
This will result in the following JSON after formatting
"reports": [
{
"x": {
"a": "ab",
"b": {
"z": "cd",
"y": "ef"
},
"c": [
{
"zz": "gh"
},
{
"yy": {
"l": "st",
"m": [
{
"q": "uv"
},
{
"r": "wx"
}
]
}
},
{
"zz": "kl"
}
],
"d": "mn",
"e": {
"z": "op",
"y": "qr"
}
}
},
{
"x": {
"a": "12",
"b": {
"z": "34",
"y": "56"
},
"c": [
{
"zz": "78"
},
{
"zz": "90",
"yy": {
"l": "21",
"m": [
{
"r": "92"
}
]
}
},
{
"zz": "09"
}
],
"e": {
"z": "65",
"y": "43"
}
}
}
]