The Goal
I'm attempting to combine duplicate dictionaries in a nested dict together for a large amount of dict (~ 10,000).
For my specific case, I'm mainly looking to have all the related information (e.g. Batch & Items) of an identifier (e.g. USERID) be in one dictionary, which is in a list containing similar dictionaries. As an example with a smaller size:
Input
raw = [
{'USERID': 'USERID1', 'BATCH': 'NUM1304', 'ITEMS': '105'},
{'USERID': 'USERID15', 'BATCH': 'NUM1323', 'ITEMS': '122'},
{'USERID': 'USERID1', 'BATCH': 'NUM1365', 'ITEMS': '98'},
{'USERID': 'USERID12', 'BATCH': 'NUM1365', 'ITEMS': '76'},
{'USERID': 'USERID1', 'BATCH': 'NUM1376', 'ITEMS': '55'},
{'USERID': 'USERID3', 'BATCH': 'NUM1396', 'ITEMS': '151'},
{'USERID': 'USERID7', 'BATCH': 'NUM1398', 'ITEMS': '69'},
{'USERID': 'USERID7', 'BATCH': 'NUM1398', 'ITEMS': '126'},
{'USERID': 'USERID12', 'BATCH': 'NUM1422', 'ITEMS': '76'},
{'USERID': 'USERID15', 'BATCH': 'NUM1455', 'ITEMS': '77'},
{'USERID': 'USERID1', 'BATCH': 'NUM1465', 'ITEMS': '97'}
]
Output
raw = [
{'USERID': 'USERID1', 'BATCH': ['NUM1304', 'NUM1365', 'NUM1376', 'NUM1465'], 'ITEMS': ['105', '98', '55', '97']},
{'USERID': 'USERID15', 'BATCH': ['NUM1323', 'NUM1455'], 'ITEMS': ['122', '77']},
{'USERID': 'USERID12', 'BATCH': ['NUM1365', 'NUM1422'], 'ITEMS': ['76', '76']},
{'USERID': 'USERID3', 'BATCH': ['NUM1396'], 'ITEMS': ['151']},
{'USERID': 'USERID7', 'BATCH': ['NUM1398'], 'ITEMS': ['69']}
]
What's Done
I have already completed the following:
# Converts the batch & items values to lists to allow for extending #
def Corrector(raw):
i = 0
while i != len(raw):
raw[i]['BATCH'] = [raw[i]['BATCH']]
raw[i]['ITEMS'] = [raw[i]['ITEMS']]
i = 1
# Goes through two dictionaries and combines their values together #
def DuplicateCombiner(o_dict1, o_dict2):
for key, value in o_dict2.items():
if key in o_dict1 and isinstance(value, list):
o_dict1[key].extend(value)
else:
o_dict1[key] = value
# Removes duplicates from the original nest #
def DuplicateRemover(raw):
i = 0
raw_copy = []
users = []
while i != len(raw):
if raw[i]['USERID'] not in users:
users.append(raw[i]['USERID'])
raw_copy.append(raw[i])
i = 1
return raw_copy
My Attempt
I have used the following, which is very ineffective as it loops millions of times with a larger size, but did function for the example I gave earlier. However, I am looking for something that will function with a larger size, preferably without maxing out my RAM :).
def Combiner(self):
for i in raw:
for n in raw:
if i['USER ID'] != n['USER ID']: # If they're not the same USERID
continue
if i['BATCH'][0] == n['BATCH'][0]: # If they're the same dict
continue
DuplicateCombiner(i, n)
Also, I am using python 3.8.12. Any assistance is greatly appreciated.
CodePudding user response:
Something as simple as this may suit your need:
from pprint import pprint
from collections import defaultdict
raw = [
{'USERID': 'USERID1', 'BATCH': 'NUM1304', 'ITEMS': '105'},
{'USERID': 'USERID15', 'BATCH': 'NUM1323', 'ITEMS': '122'},
{'USERID': 'USERID1', 'BATCH': 'NUM1365', 'ITEMS': '98'},
{'USERID': 'USERID12', 'BATCH': 'NUM1365', 'ITEMS': '76'},
{'USERID': 'USERID1', 'BATCH': 'NUM1376', 'ITEMS': '55'},
{'USERID': 'USERID3', 'BATCH': 'NUM1396', 'ITEMS': '151'},
{'USERID': 'USERID7', 'BATCH': 'NUM1398', 'ITEMS': '69'},
{'USERID': 'USERID7', 'BATCH': 'NUM1398', 'ITEMS': '126'},
{'USERID': 'USERID12', 'BATCH': 'NUM1422', 'ITEMS': '76'},
{'USERID': 'USERID15', 'BATCH': 'NUM1455', 'ITEMS': '77'},
{'USERID': 'USERID1', 'BATCH': 'NUM1465', 'ITEMS': '97'}
]
result = defaultdict(lambda: defaultdict(list))
for d in raw:
result[d['USERID']]['BATCH'].append(d['BATCH'])
result[d['USERID']]['ITEMS'].append(d['ITEMS'])
result = [dict(v) | {'USERID': k} for k, v in result.items()]
pprint(result)
(pprint
is only there to show the result nicer, defaultdict
helps with simple construction, you could avoid that by putting the logic in the code itself, but I don't think it's worth the trouble)
Output:
[{'BATCH': ['NUM1304', 'NUM1365', 'NUM1376', 'NUM1465'],
'ITEMS': ['105', '98', '55', '97'],
'USERID': 'USERID1'},
{'BATCH': ['NUM1323', 'NUM1455'],
'ITEMS': ['122', '77'],
'USERID': 'USERID15'},
{'BATCH': ['NUM1365', 'NUM1422'], 'ITEMS': ['76', '76'], 'USERID': 'USERID12'},
{'BATCH': ['NUM1396'], 'ITEMS': ['151'], 'USERID': 'USERID3'},
{'BATCH': ['NUM1398', 'NUM1398'], 'ITEMS': ['69', '126'], 'USERID': 'USERID7'}]
However, a list of dictionaries with list values that have to be processed in order to match up list values seems like a very poor choice for a data structure, unless there's some external reason that forces you to use this structure.
Have a look at DataFrame
in pandas
for a more flexible and likely far more performant solution.
CodePudding user response:
It's much easier if you create a new list of dictionaries instead of trying to do this in place. Unfortunately, this will be more memory intensive.
The intermediate structure should be a dictionary of dictionaries, so you can look up the dictionary by user ID without having to loop.
This is also a good place to use defaultdict
.
from collections import defaultdict
def combine_dicts(raw):
result_dict = defaultdict(lambda: {'USERID': '', BATCH: [], 'ITEMS': []})
for d in raw:
nested = result_dict[d['USERID']]
nested['USERID'] = d['USERID']
nested['BATCH'].append(d['BATCH'])
nested['ITEMS'].append(d['ITEMS'])
return list(result_dict.values())
raw = combine_dicts(raw)
CodePudding user response:
Just make use of dictionary's 'get' defaults:
processed = dict()
for dx in raw:
user = dx.get("USER_ID")
d = processed.get(user, {'USER_ID': user})
batch = d.get("BATCH", [])
batch.append(dx.get("BATCH_ID"))
d["BATCH"] = batch
items = d.get("ITEMS", [])
items.append(dx.get("ITEMS"))
d["ITEMS"] = items
processed["USER_ID"] = d
[v for _, v in processed.items()]
CodePudding user response:
Looks like everyone's answers are more or less the same.
from collections import defaultdict
def combine(data):
users = defaultdict(lambda: defaultdict(list))
for d in data:
users[d['USERID']]['BATCH'].append(d['BATCH'])
users[d['USERID']]['ITEMS'].append(d['ITEMS'])
user_lists = [
{
'USERID': user,
**dicts
}
for user, dicts in users.items()
]
return user_lists
raw = [
{'USERID': 'USERID1', 'BATCH': 'NUM1304', 'ITEMS': '105'},
{'USERID': 'USERID15', 'BATCH': 'NUM1323', 'ITEMS': '122'},
{'USERID': 'USERID1', 'BATCH': 'NUM1365', 'ITEMS': '98'},
{'USERID': 'USERID12', 'BATCH': 'NUM1365', 'ITEMS': '76'},
{'USERID': 'USERID1', 'BATCH': 'NUM1376', 'ITEMS': '55'},
{'USERID': 'USERID3', 'BATCH': 'NUM1396', 'ITEMS': '151'},
{'USERID': 'USERID7', 'BATCH': 'NUM1398', 'ITEMS': '69'},
{'USERID': 'USERID7', 'BATCH': 'NUM1398', 'ITEMS': '126'},
{'USERID': 'USERID12', 'BATCH': 'NUM1422', 'ITEMS': '76'},
{'USERID': 'USERID15', 'BATCH': 'NUM1455', 'ITEMS': '77'},
{'USERID': 'USERID1', 'BATCH': 'NUM1465', 'ITEMS': '97'}
]
output = combine(raw)
print(output)
CodePudding user response:
Instead of storing the result in a list, which is designed for sequential iteration, not lookup, use a dictionary. This will offer much faster lookup times and ease of creation both in terms of code and algorithmic complexity:
result = {}
for d in raw:
item = result.get(d['USERID'])
if item is None:
item = {'USERID': d['USERID'], 'BATCH': [], 'ITEMS': []}
result[d['USERID'] = item
item['ITEMS'].append(d['ITEMS'])
item['BATCH'].append(d['BATCH'])