Suppose I have a txt file that looks like this (indentation is 4 spaces):
key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3
I want to convert it into any VALID json, like this one:
{
'key1':'value1',
'key2': {
'key2_1':'value2_1',
'key2_2':{
'key2_2_1':'value2_2_1'
},
'key2_3':['value2_3_1','value2_3_2','value2_3_3']
},
'key3':['value3_1','value3_2','value3_3']
}
I have tried this (which I got from another post):
# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line plus a deeper-indented value line.

    Every line containing ``=`` is split on the first ``=`` only. The key
    stays on its own line and the value moves to a new line directly below
    it, indented one unit deeper than the key. Lines without ``=`` are
    passed through unchanged.

    Args:
        inputString: Iterable of lines, each normally ending in a newline
            (for example the result of ``file.readlines()``). It is not
            modified.

    Returns:
        A single string with all resulting lines joined together.
    """
    indentVal = "    "  # one indentation unit (the input uses 4 spaces)
    converted = []
    for eachLine in inputString:
        if "=" not in eachLine:
            converted.append(eachLine)
            continue
        # Split once so that a value may itself contain "=".
        key, value = eachLine.split("=", 1)
        # Use the real leading whitespace rather than counting spaces across
        # the whole line, so spaces inside a value cannot skew the depth.
        leading = key[:len(key) - len(key.lstrip())]
        converted.append(key + "\n")
        # The value keeps its own line ending.
        converted.append(leading + indentVal + value)
    return "".join(converted)
# helper class for node usage
class Node:
    """One line of the indented input, linked to the lines nested beneath it.

    Attributes:
        children: Nodes built from the more deeply indented lines that
            follow this one.
        level: Number of leading whitespace characters on the source line.
        text: The source line with surrounding whitespace removed.
    """

    def __init__(self, indented_line):
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Attach the leading run of *nodes* that belongs under this node.

        Nodes are consumed from the front of *nodes* (the list is mutated).
        The first node's level defines the level of this node's direct
        children; deeper nodes are handed to the most recent child, and the
        first shallower node ends the run and is left in the list for an
        ancestor to handle.

        Args:
            nodes: List of ``Node`` objects in file order. May be empty.
        """
        if not nodes:
            # Nothing to attach (e.g. an empty input file).
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level == childlevel:
                # Direct child of this node.
                self.children.append(nodes.pop(0))
            elif node.level > childlevel:
                # Deeper than our children: belongs under the last child.
                self.children[-1].add_children(nodes)
            else:
                # Shallower than our children: leave it for an ancestor.
                # Every shallower node stops the run, including one that is
                # still deeper than this node, so no line is dropped.
                return

    def as_dict(self):
        """Return this subtree as nested dicts/lists, or the text for a leaf.

        Several children give ``{text: [child, ...]}``, a single child gives
        ``{text: child}``, and a node without children gives ``text``.
        """
        if len(self.children) > 1:
            return {self.text: [node.as_dict() for node in self.children]}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text
import json

# process our file here
# NOTE(review): `filename` is not defined in this snippet; presumably it is
# the path to the indented text file -- confirm before running.
with open(filename, 'r') as fh:
    fileContent = fh.readlines()

# convert equals signs to indentation
fileParse = convertIndentation(fileContent)

# build the node tree from the non-blank lines and drop the synthetic root key
root = Node('root')
root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
d = root.as_dict()['root']

# this variable is storing the json output
jsonOutput = json.dumps(d, indent = 4, sort_keys = False)
print(jsonOutput)
which yields the following:
[
{
"key1": "value1"
},
{
"key2": [
{
"key2_1": "value2_1"
},
{
"key2_2": {
"key2_2_1": "value2_2_1"
}
},
{
"key2_3": "value2_3_1,value2_3_2,value2_3_3"
},
]
},
{
"key3": "value3_1,value3_2,value3_3"
}
]
Yet this is still not a valid JSON file.
When I try to open the output file using 'json' module, I get this predictable message: "JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)".
with open(r'C:\Users\nigel\OneDrive\Documents\LAB\lean\sample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
    # json.load raises json.JSONDecodeError when the file is not valid JSON.
    data = json.load(read_file)
output:
JSONDecodeError Traceback (most recent call last)
Input In [2], in <cell line: 1>()
1 with open(r'C:\Users\nigel\OneDrive\Documents\LAB\lean\sample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
----> 2 data = json.load(read_file)
File ~\Anaconda3\lib\json\__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
274 def load(fp, *, cls=None, object_hook=None, parse_float=None,
275 parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
276 """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
277 a JSON document) to a Python object.
278
(...)
291 kwarg; otherwise ``JSONDecoder`` is used.
292 """
--> 293 return loads(fp.read(),
294 cls=cls, object_hook=object_hook,
295 parse_float=parse_float, parse_int=parse_int,
296 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File ~\Anaconda3\lib\json\__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
341 s = s.decode(detect_encoding(s), 'surrogatepass')
343 if (cls is None and object_hook is None and
344 parse_int is None and parse_float is None and
345 parse_constant is None and object_pairs_hook is None and not kw):
--> 346 return _default_decoder.decode(s)
347 if cls is None:
348 cls = JSONDecoder
File ~\Anaconda3\lib\json\decoder.py:337, in JSONDecoder.decode(self, s, _w)
332 def decode(self, s, _w=WHITESPACE.match):
333 """Return the Python representation of ``s`` (a ``str`` instance
334 containing a JSON document).
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):
File ~\Anaconda3\lib\json\decoder.py:353, in JSONDecoder.raw_decode(self, s, idx)
344 """Decode a JSON document from ``s`` (a ``str`` beginning with
345 a JSON document) and return a 2-tuple of the Python
346 representation and the index in ``s`` where the document ended.
(...)
350
351 """
352 try:
--> 353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
355 raise JSONDecodeError("Expecting value", s, err.value) from None
JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)
The reason is that JSON expects to find keys (strings enclosed in double quotes) when it actually finds json objects (nested dictionaries) in their places. That is it!
I truly appreciate any comments. Best,
Nigel
CodePudding user response:
Splitting The Strings Into Lists
I am assuming per your comment that you mean that you want to take your strings, for example, this line
key2_3=value2_3_1,value2_3_2,value2_3_3
and break these values up into "key2_3": ["value2_3_1", "value2_3_2", "value2_3_3"].
To do so, you'd have to make the following adjustment to the code provided to you:
def as_dict(self):
    """Return this subtree as nested dicts/lists (method of ``Node``).

    Several children give ``{text: [child, ...]}`` and a single child gives
    ``{text: child}``. A node without children returns its text split on
    commas, so a leaf is always a list, even for a single value.
    """
    if len(self.children) > 1:
        return {self.text: [node.as_dict() for node in self.children]}
    elif len(self.children) == 1:
        return {self.text: self.children[0].as_dict()}
    else:
        return self.text.split(",")  # was self.text
Dictionaries of Dictionaries Instead of Lists
To make the output a dictionary of dictionaries whose innermost values may be lists, i.e. {k1: {k2: [1, 2, 3]}} and the like, we have to make 2 changes.
- Update the as_dict method to use {} instead of [].
- Include a function to compress keys.
When I was doing this, I had a hard time outputting the correct data structure... it'd look basically like this: {k1: {k1: {k2: {k2: value}}}}. This becomes obvious when you run d = root.as_dict()['root'] instead of d = compress(root.as_dict()['root']) in the code. So the code went from
def as_dict(self):
    """Return this subtree as nested dicts/lists (method of ``Node``).

    Several children give ``{text: [child, ...]}`` and a single child gives
    ``{text: child}``. A node without children returns its text, split into
    a list only when the text contains a comma.
    """
    if len(self.children) > 1:
        return {self.text: [node.as_dict() for node in self.children]}
    elif len(self.children) == 1:
        return {self.text: self.children[0].as_dict()}
    else:
        return self.text.split(",") if "," in self.text else self.text
to
def as_dict(self):
    """Return this subtree as nested dicts (method of ``Node``).

    Several children give ``{text: {child.text: child.as_dict(), ...}}``
    and a single child gives ``{text: child.as_dict()}``. A node without
    children returns its text, split into a list only when the text
    contains a comma.

    A child that has children of its own already wraps its result in its
    own text, so in the multi-child case such a key appears twice in a row
    (``{k: {k: ...}}``).
    """
    if len(self.children) > 1:
        return {self.text: {node.text: node.as_dict() for node in self.children}}
    elif len(self.children) == 1:
        return {self.text: self.children[0].as_dict()}
    else:
        return self.text.split(",") if "," in self.text else self.text
Then I included the compress function:
# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{k: {k: value}}`` wrappers into ``{k: value}``, recursively.

    The dictionary is modified in place. Anything that is not a dict is
    returned untouched.

    Args:
        dictionary: Nested structure to compress.

    Returns:
        The same object that was passed in.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    # Iterate over a snapshot of the keys because values are reassigned below.
    for k in list(dictionary):
        v = dictionary[k]
        if not isinstance(v, dict):
            continue
        # Collapse only a pure wrapper. If the inner dict holds other keys
        # besides the duplicate, collapsing would discard them.
        if len(v) == 1 and k in v:
            dictionary[k] = v[k]
        compress(dictionary[k])
    return dictionary
Full Code
If you put the code below in a file and run it from the command line, it should work 100%. Otherwise it's probably a problem with Anaconda or your version of Python (though that doesn't really seem likely).
from io import StringIO
import json
# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{k: {k: value}}`` wrappers into ``{k: value}``, recursively.

    The dictionary is modified in place. Anything that is not a dict is
    returned untouched.

    Args:
        dictionary: Nested structure to compress.

    Returns:
        The same object that was passed in.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    # Iterate over a snapshot of the keys because values are reassigned below.
    for k in list(dictionary):
        v = dictionary[k]
        if not isinstance(v, dict):
            continue
        # Collapse only a pure wrapper. If the inner dict holds other keys
        # besides the duplicate, collapsing would discard them.
        if len(v) == 1 and k in v:
            dictionary[k] = v[k]
        compress(dictionary[k])
    return dictionary
# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite ``key=value`` lines as a key line plus a deeper-indented value line.

    Every line containing ``=`` is split on the first ``=`` only. The key
    stays on its own line and the value moves to a new line directly below
    it, indented one unit deeper than the key. Lines without ``=`` are
    passed through unchanged.

    Args:
        inputString: Iterable of lines, each normally ending in a newline
            (for example the result of ``file.readlines()``). It is not
            modified.

    Returns:
        A single string with all resulting lines joined together.
    """
    indentVal = "    "  # one indentation unit (the input uses 4 spaces)
    converted = []
    for eachLine in inputString:
        if "=" not in eachLine:
            converted.append(eachLine)
            continue
        # Split once so that a value may itself contain "=".
        key, value = eachLine.split("=", 1)
        # Use the real leading whitespace rather than counting spaces across
        # the whole line, so spaces inside a value cannot skew the depth.
        leading = key[:len(key) - len(key.lstrip())]
        converted.append(key + "\n")
        # The value keeps its own line ending.
        converted.append(leading + indentVal + value)
    return "".join(converted)
# helper class for node usage
class Node:
    """One line of the indented input, linked to the lines nested beneath it.

    Attributes:
        children: Nodes built from the more deeply indented lines that
            follow this one.
        level: Number of leading whitespace characters on the source line.
        text: The source line with surrounding whitespace removed.
    """

    def __init__(self, indented_line):
        self.children = []
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Attach the leading run of *nodes* that belongs under this node.

        Nodes are consumed from the front of *nodes* (the list is mutated).
        The first node's level defines the level of this node's direct
        children; deeper nodes are handed to the most recent child, and the
        first shallower node ends the run and is left in the list for an
        ancestor to handle.

        Args:
            nodes: List of ``Node`` objects in file order. May be empty.
        """
        if not nodes:
            # Nothing to attach (e.g. an empty input).
            return
        childlevel = nodes[0].level
        while nodes:
            node = nodes[0]
            if node.level == childlevel:
                # Direct child of this node.
                self.children.append(nodes.pop(0))
            elif node.level > childlevel:
                # Deeper than our children: belongs under the last child.
                self.children[-1].add_children(nodes)
            else:
                # Shallower than our children: leave it for an ancestor.
                # Every shallower node stops the run, including one that is
                # still deeper than this node, so no line is dropped.
                return

    def as_dict(self):
        """Return this subtree as nested dicts, or the value for a leaf.

        Several children give ``{text: {child.text: child.as_dict(), ...}}``
        and a single child gives ``{text: child.as_dict()}``. A node without
        children returns its text, split into a list only when the text
        contains a comma.

        A child that has children of its own already wraps its result in
        its own text, so in the multi-child case such a key appears twice
        in a row (``{k: {k: ...}}``).
        """
        if len(self.children) > 1:
            return {self.text: {node.text: node.as_dict() for node in self.children}}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text.split(",") if "," in self.text else self.text
if __name__ == "__main__":
    # Sample input. The 4-space indentation inside the string is significant:
    # it is what the parser uses to determine nesting.
    s = """
key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1
            key2_2_1_1=value2_2_1_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3
"""
    # StringIO stands in for a real file handle.
    fh = StringIO(s)
    fileContent = fh.readlines()

    # convert equals signs to indentation
    fileParse = convertIndentation(fileContent)

    # build the tree from the non-blank lines, then merge the doubled keys
    root = Node('root')
    root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
    d = compress(root.as_dict()['root'])

    # this variable is storing the json output
    jsonOutput = json.dumps(d, indent=4, sort_keys=False)

    # load the "file" to confirm that the output parses as JSON
    f = StringIO(jsonOutput)
    loaded = json.load(f)

    print(s)
    print(jsonOutput)
    print(loaded)