I know this question might irritate the expert but I hope you can have patience with a beginner.
I am trying to loop over multiple JSON files, format each one as an array of objects, and save them all into a single new JSON file. I basically need to merge all the files into one so I can get an accurate count of the unique IDs.
so far I have this:
# First attempt: walk the tree, patch each file's raw text into a JSON array,
# and append each parsed array to update.json.
for root, subdirs, files in os.walk("./"):
for file in files:
if file.endswith('.json'):
to_queue = []
# BUG: mode "r " (trailing space) is not a valid open() mode; also `file` is a
# bare filename — opening it fails for files found in subdirectories
# (needs os.path.join(root, file)).
with open(file, "r ") as f:
print(file)
old = f.read()
f.seek(0) # rewind
# save to the old string after replace
new = old.replace('}{', '},{')
f.write(new)
# BUG: missing '+' (or f-string) between the pieces — this line is a
# SyntaxError exactly as written.
tmps = '[' str(new) ']'
json_string = json.loads(tmps)
for key in json_string:
to_queue.append(key)
# NOTE(review): f.close is referenced but never called (missing parentheses);
# it is redundant anyway inside a `with` block.
f.close
# BUG: appending a separate json.dump per input file concatenates multiple
# top-level JSON documents into update.json. json.load can only parse ONE
# document, so it raises "Extra data" — the traceback shown below.
with open('update.json', 'a') as file:
json.dump(json_string, file, indent=2)
with open('update.json') as f:
data = json.load(f)
users = set(item.get('userID') for item in data)
print(len(users))
# print(users
It loops over every single file, formats it, and saves it to update.json,
which after being saved, I can count the unique UserID
present in the update.json.
This is the idea, but when I run the code I get the following error.
Traceback (most recent call last):
File "format.py", line 26, in <module>
data = json.load(f)
File "/Users/user/opt/anaconda3/lib/python3.8/json/__init__.py", line 293, in load
return loads(fp.read(),
File "/Users/user/opt/anaconda3/lib/python3.8/json/__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "/Users/user/opt/anaconda3/lib/python3.8/json/decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 17821 column 2 (char 501079)
As I mentioned, I am a total beginner and I feel embarrassed to ask questions that might be super easy for somebody else. I just hope somebody can kindly help me understand how to get past this problem and why it happens.
Thank you so much
UPDATE:
# Updated attempt: same walk, but additionally merges every object into
# one dict (`newdictionary`) before writing.
for root, subdirs, files in os.walk("./"):
for file in files:
if file.endswith('.json'):
to_queue = []
newdictionary = {}
# BUG: mode "r " (trailing space) is not a valid open() mode, and `file`
# needs os.path.join(root, file) for entries found in subdirectories.
with open(file, "r ") as f:
print(file)
old = f.read()
f.seek(0) # rewind
# save to the old string after replace
new = old.replace('}{', '},{')
f.write(new)
# BUG: missing '+' operators — SyntaxError exactly as written.
tmps = '[' str(new) ']'
json_string = json.loads(tmps)
for key in json_string:
to_queue.append(key)
# NOTE(review): dict.update merges by key, so records sharing keys
# (e.g. "@timestamp", "userID") overwrite one another — this LOSES
# records instead of collecting them.
newdictionary.update(key)
f.close
# BUG: dumping each item separately still appends multiple top-level JSON
# documents to update.json, so json.load again fails with "Extra data"
# (second traceback below). Iterating a dict also yields its string keys,
# not the original record objects.
for key in newdictionary:
with open('update.json', 'a') as file:
json.dump(key, file, indent=2)
with open('update.json') as f:
data = json.load(f)
users = set(item.get('userID') for item in data)
print(len(users))
This is the latest code error using buran code, which I really thank for his help.
Traceback (most recent call last):
File "format.py", line 40, in <module>
data = json.load(f)
File "/Users/user/opt/anaconda3/lib/python3.8/json/__init__.py", line 293, in load
return loads(fp.read(),
File "/Users/user/opt/anaconda3/lib/python3.8/json/__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "/Users/user/opt/anaconda3/lib/python3.8/json/decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 1 column 194 (char 193)
this is the raw content of the json files:
{"@timestamp":"2021-07-30T20:28:25.769Z","name":"","deviceAction":""},{"@timestamp":"2021-07-30T20:29:10.812Z","name":"","deviceAction":""}
In my previous code I had the formatting in place to convert them into an array. Some files look like this, but others have extra fields, like this:
{
"@timestamp": "",
"userID": "",
"destinationUserName": "",
"message": "",
"name": ""
},
{
"@timestamp": "",
"userID": "",
"destinationUserName": "",
"message": "",
"name": ""
},
{
"@timestamp": "",
"userID": "",
"destinationUserName": "",
"message": "",
"name": ""
},
{
"@timestamp": "",
"userID": "",
"name": "",
"sourceUserName": "",
"deviceAction": ""
}
CodePudding user response:
import json
import os

# Name of the merged output file; also used to skip our own output when
# re-scanning the tree on a later run (otherwise the script would ingest
# its previous output and wrap it in another [...], nesting the list).
OUTPUT_FILE = 'all_data.json'

# Loop over files recursively and collect the dicts from every .json file.
all_data = []
for root, subdirs, files in os.walk("./"):
    for file in files:
        # Skip non-JSON files and the merged output from a previous run.
        if not file.endswith('.json') or file == OUTPUT_FILE:
            continue
        # os.walk yields bare filenames; build the full path so files in
        # subdirectories open correctly.
        path = os.path.join(root, file)
        with open(path, "r") as f:
            # The files are not valid JSON and need some pre-processing.
            raw_data = f.read()
        try:
            # The files are streams of objects, sometimes with the comma
            # missing between them ("}{"); insert the commas and wrap the
            # whole text in [] to form one valid JSON array.
            data = json.loads(f'[{raw_data.replace("}{", "},{")}]')
            all_data.extend(data)
        except json.decoder.JSONDecodeError:
            print(f'Error with file {path}')

# Write all merged data to a single, valid JSON file.
with open(OUTPUT_FILE, 'w') as f:
    json.dump(all_data, f, indent=4)

# Count unique users directly from the in-memory data
# (records without a userID contribute a single None entry).
users = set(item.get('userID') for item in all_data)
print(f'Number of users: {len(users)}')

# Or re-read the merged file and count the unique users from it.
with open(OUTPUT_FILE) as f:
    data = json.load(f)
users = set(item.get('userID') for item in data)
print(f'Number of users: {len(users)}')
Alternatively, instead of reading all_data.json
back at the end, you can create an empty set users
at the beginning and update it with the unique users from every file.