Loading JSON from multiple files in one folder and putting them into one dictionary (or list)-CodePudding

I have multiple text files that have this inside them (with different values):

{"abandon": {"R1F2V0CQAYJOZZ": 2, "R3NUFWQ9SPGVJO": 1}, "abduct": {"R1F2V0CQAYJOZZ": 1, "R3376OQSHCTV1A": 1, "R14BW4EQZNVKKG": 1, "R233CMES8RCOCU": 1},

If I format it online, it becomes like this:

   "abandon":{
      "R1F2V0CQAYJOZZ":2,
      "R3NUFWQ9SPGVJO":1
   },
   "abduct":{
      "R1F2V0CQAYJOZZ":1,
      "R3376OQSHCTV1A":1,
      "R14BW4EQZNVKKG":1,
      "R233CMES8RCOCU":1
   },

What this JSON means is:

"word":{
   "Document name":"value"
},

But there are repeated words in different files. What I want to do is: read all files and store everything in one dictionary, but:

If the "word" exists in the dictionary check if the "document" is there;
If the "document exists", then increment the "value", else put the document there and the "value = 1"
If the "word" doesn't exist, store the "word", "document" and "value = 1"

EDIT:

So imagine these 2 files:

File1.txt = {"abandon": {"doc1": 2, "doc2": 1}, "abduct": {"doc1": 1, "doc2": 1, "doc8": 1}}

File2.txt = {"abandon": {"doc1": 1, "doc3": 1}, "abduct": {"doc5": 1, "doc8": 1}}

I want my dictionary to end like this:

{"abandon": {"doc1": 3, "doc2": 1, "doc3": 1}, "abduct": {"doc1": 1, "doc2": 1, "doc5": 1, "doc8": 2}}

EDIT2: it can also be a nested List

CodePudding user response：

IIUC, try:

import os
import json

# Every .txt file in the current working directory is treated as JSON input.
files = [f for f in os.listdir() if f.endswith(".txt")]
result = {}

for file in files:
    # Use a context manager so the handle is closed as soon as parsing is done.
    with open(file) as fh:
        d = json.load(fh)
    for word, docs in d.items():
        # Counts merged so far for this word (created on first sight).
        merged_docs = result.setdefault(word, {})
        for doc, count in docs.items():
            # Accumulate: a document seen in several files gets the sum of
            # its counts instead of being overwritten by the last file read.
            merged_docs[doc] = merged_docs.get(doc, 0) + count

>>> result
{'abandon': {'doc1': 3, 'doc2': 1, 'doc3': 1},
 'abduct': {'doc1': 1, 'doc2': 1, 'doc8': 2, 'doc5': 1}}

Input files:

file1.txt:

{"abandon": {"doc1": 2, "doc2": 1}, "abduct": {"doc1": 1, "doc2": 1, "doc8": 1}}

file2.txt:

{"abandon": {"doc1": 1, "doc3": 1}, "abduct": {"doc5": 1, "doc8": 1}}

CodePudding user response：

import json

files = ["input", "list", "of", "files"]
outDict = {}
for file in files:  # iterate over the files
    with open(file) as fn:
        newDict = json.load(fn)
    for word in newDict:  # iterate over each word from the file
        inWord = newDict[word]
        # setdefault stores the new empty dict in outDict when the word is
        # missing; .get(word, {}) would return a throwaway dict and every
        # count written to it would be lost
        outWord = outDict.setdefault(word, {})
        for docName in inWord:  # iterate over each document name from the file
            value = outWord.get(docName, 0)  # returns 0 if the document name isn't already in the output dictionary
            value += inWord[docName]  # add this file's count so totals match the expected merged output
            outWord[docName] = value  # update the output dictionary

CodePudding user response：

It's straight-forward to merge using .setdefault:

import json
import glob

merged = {}

for file in glob.glob('*.txt'):  # match *.txt files in current directory

    with open(file) as f:
        in_dict = json.load(f)

    # The file is closed at this point; merging only needs the parsed data.
    for word, docs in in_dict.items():
        word_counts = merged.setdefault(word, {})  # create word with empty dict value if it doesn't exist
        for doc, value in docs.items():
            word_counts.setdefault(doc, 0)  # create value of 0 for document if it doesn't exist
            word_counts[doc] += value       # add the doc's value to the running total.

print(json.dumps(merged, indent=2))

Or using defaultdict. The parameter to defaultdict must be a function that returns the default value, hence the lambda that returns a default integer dictionary:

import json
import glob
from collections import defaultdict

# Missing words yield a fresh defaultdict(int); missing documents yield 0,
# so counts can be accumulated without any existence checks.
merged = defaultdict(lambda: defaultdict(int))

for file in glob.glob('*.txt'):  # match *.txt files in current directory

    with open(file) as f:
        in_dict = json.load(f)

    # The file is closed at this point; merging only needs the parsed data.
    for word, docs in in_dict.items():
        for doc, value in docs.items():
            merged[word][doc] += value       # add the doc's value to the running total.

print(json.dumps(merged, indent=2))