I have been given a dataframe whose cells contain dictionaries, some of which are nested.
Below are examples of the two types of dictionary one will find: type 1 (test_dict_1) and type 2 (test_dict_2):
# Type 1: 'results' is a flat list with one record per question.
test_dict_1 = {
    'results': [
        {
            'key': 'q1',
            'value': ['1'],
            'end_time': '2021-01-21',
            'start_time': '2021-01-21',
            'result_type': 'multipleChoice',
        },
        {
            'key': 'q2',
            'value': ['False'],
            'end_time': '2021-01-21',
            'start_time': '2021-01-21',
            'result_type': 'multipleChoice',
        },
        {
            'key': 'q3',
            'value': ['3'],
            'end_time': '2021-01-21',
            'start_time': '2021-01-21',
            'result_type': 'multipleChoice',
        },
        {
            'key': 'q4',
            'value': ['3'],
            'end_time': '2021-01-21',
            'start_time': '2021-01-21',
            'result_type': 'multipleChoice',
        },
    ],
}
# Type 2: a single 'grouped' result whose 'value' holds the per-question
# records, wrapped in an extra list.
test_dict_2 = {
    'results': [
        {
            'key': 'survey_x',
            'value': [
                [
                    {
                        'key': 'q1',
                        'value': 2,
                        'endTime': '2021-01-21',
                        'skipped': False,
                        'startTime': '2021-01-21',
                        'resultType': 'multipleChoice',
                    },
                    {
                        'key': 'q2',
                        'value': 0,
                        'endTime': '2021-01-21',
                        'skipped': False,
                        'startTime': '2021-01-21',
                        'resultType': 'multipleChoice',
                    },
                    {
                        'key': 'q3',
                        'value': 2,
                        'endTime': '2021-01-21',
                        'skipped': False,
                        'startTime': '2021-01-21',
                        'resultType': 'multipleChoice',
                    },
                    {
                        'key': 'q4',
                        'value': 0,
                        'endTime': '2021-01-21',
                        'skipped': False,
                        'startTime': '2021-01-21',
                        'resultType': 'multipleChoice',
                    },
                ],
            ],
            'skipped': False,
            'end_time': '2021-01-21',
            'start_time': '2021-01-21',
            'result_type': 'grouped',
        },
    ],
}
My goal is to create a function that detects the type of dictionary and, for each type, creates a dataframe from the key and value entries.
Note, however, that for type 2 one has to go one level deeper; the dataframe that I want looks like this:
# Build the frame from the nested per-question records and keep only the
# 'key' and 'value' columns.
df_2 = pd.DataFrame(test_dict_2['results'][0]['value'][0])[['key', 'value']]
[Out]:
key value
0 q1 2
1 q2 0
2 q3 2
3 q4 0
The following function is able to extract the keys and values, but it doesn't detect the dictionary type, so it doesn't retrieve the desired result for type 2 dictionaries (unless they are first unpacked as in the previous operation).
def extract_keys_values(df):
    """Flatten list and dict cells of *df* into plain values.

    Every cell is reduced in two steps: a list is replaced by its first
    element, and a dict is then replaced by the entry stored under its
    'value' key. Cells of any other type are copied unchanged.

    The function does not tell the flat ("type 1") layout from the grouped
    ("type 2") layout apart; only the first element of a list is inspected.

    Args:
        df: DataFrame whose cells may hold scalars, lists or dicts.

    Returns:
        A new DataFrame built from one flattened dict per input row (the
        original index is not carried over). Empty lists and dicts that
        have no 'value' entry become None instead of raising.
    """
    flattened_rows = []
    for _, row in df.iterrows():
        flat_row = {}
        for column, cell in row.items():
            if isinstance(cell, list):
                # An empty list has no first element to take.
                cell = cell[0] if cell else None
            if isinstance(cell, dict):
                # .get() tolerates records that carry no 'value' entry.
                cell = cell.get('value')
            flat_row[column] = cell
        flattened_rows.append(flat_row)
    return pd.DataFrame(flattened_rows)
My desired goal is a function that detects the type of dictionary and adjusts the way the dataframe is created accordingly:
• If type 1, do something like the following
# Type 1: 'results' is already a flat list of per-question records.
df_1 = pd.DataFrame(test_dict_1['results'])
• If type 2, do something like the following
# Type 2: the per-question records sit inside the first result's 'value',
# wrapped in an extra list, hence the two [0] lookups.
df_2 = pd.DataFrame(test_dict_2['results'][0]['value'][0])
CodePudding user response:
By far not the prettiest solution, but this works for my messy dataframe:
def recursive_items(dictionary):
    """Yield ``(key, value)`` pairs from *dictionary*, descending into nested dicts.

    A value that is itself a dict (or a dict subclass) is not yielded; its
    own items are yielded in its place, depth-first and in insertion order.
    Values of any other type, including lists that contain dicts, are
    yielded unchanged.

    Args:
        dictionary: The (possibly nested) dict to walk.

    Yields:
        Tuples of ``(key, value)`` for every non-dict value encountered.
    """
    for key, value in dictionary.items():
        # isinstance also recurses into dict subclasses such as OrderedDict.
        if isinstance(value, dict):
            yield from recursive_items(value)
        else:
            yield key, value
def _nested_records(value):
    """Return the question records nested inside *value*, or None if there are none."""
    if not isinstance(value, list) or not value:
        return None
    # Grouped results wrap their records in an extra list: [[{...}, {...}]].
    candidates = value[0] if isinstance(value[0], list) else value
    records = [
        item for item in candidates
        if isinstance(item, dict) and 'key' in item
    ]
    return records or None


def _flatten_records(records):
    """Map every nested record's key to its answer."""
    answers = {}
    for record in records:
        answer = record.get('value')
        if isinstance(answer, list):
            # List answers keep only their first entry; an empty list is
            # stored as an empty string.
            answer = answer[0] if answer else ''
        answers[record['key']] = answer
    return answers


def _summarise_element(element):
    """Build the ``{key: payload}`` summary for one result record."""
    summary = {}
    if 'key' in element:
        payload = {'value': element['value']} if 'value' in element else {}
        # Nesting is detected from the structure of the value, not from its
        # string form, so an answer that merely contains the text "key" is
        # not mistaken for a grouped result.
        records = _nested_records(payload.get('value'))
        if records is not None:
            # A dict (rather than a set) keeps each answer attached to its
            # question key, and all nested answers are kept.
            summary[element['key']] = [_flatten_records(records)]
        else:
            summary[element['key']] = payload
    if 'text' in element:
        summary['text'] = element['text']
    return summary


def extract_keys_values(df):
    """Summarise the result dictionaries stored in column ``0`` of *df*.

    For every row, the dict in column ``0`` is walked with
    ``recursive_items``; each list value found is treated as a list of
    result records and folded into one summary dict for the row:

    * a flat record ``{'key': k, 'value': v}`` contributes ``{k: {'value': v}}``;
    * a grouped record whose value holds nested records contributes
      ``{k: [{nested_key: nested_value, ...}]}``;
    * a record's ``'text'`` entry, if present, is stored under ``'text'``.

    Cells that are not dicts, values that are not lists, and list elements
    that are not dicts are skipped, so such rows get an empty (or partial)
    summary instead of raising.

    Args:
        df: DataFrame with the raw result dictionaries in the column
            labelled ``0``. Any row index is accepted.

    Returns:
        The ``'data_key_value'`` column (one summary dict per row). The
        column is also written into *df* in place.
    """
    collected = []
    # Iterating the column itself avoids assuming a 0..n-1 integer index.
    for cell in df[0]:
        row_summary = {}
        if isinstance(cell, dict):
            for _, entries in recursive_items(cell):
                if not isinstance(entries, list):
                    continue
                for element in entries:
                    if isinstance(element, dict):
                        row_summary.update(_summarise_element(element))
        collected.append(row_summary)
    # Assigning the whole column at once stores the dicts as objects and
    # also creates the column when df has no rows.
    df['data_key_value'] = collected
    return df['data_key_value']
# extract_keys_values writes the 'data_key_value' column into df itself and
# returns that column, so this assignment stores the same values again.
df['data_key_value']=extract_keys_values(df)