Home > OS >  From text file to JSON file with python
From text file to JSON file with python

Time:12-02

Suppose I have a txt file that looks like this (indentation is 4 spaces):

key1=value1
key2
    key2_1=value2_1
    key2_2
        key2_2_1=value2_2_1
    key2_3=value2_3_1,value2_3_2,value2_3_3
key3=value3_1,value3_2,value3_3

I want to convert it into any VALID json, like this one:

{
'key1':'value1',
'key2': {
    'key2_1':'value2_1',
    'key2_2':{
        'key2_2_1':'value2_2_1'
        },
    'key2_3':['value2_3_1','value2_3_2','value2_3_3']
    },
'key3':['value3_1','value3_2','value3_3']
}

I have tried this (which I got from another post):

# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite every ``key=value`` line as a key line plus an indented value line.

    Args:
        inputString: The lines of the input text (for example the result of
            ``readlines()``), each normally ending with a newline. Despite
            the name, this is a sequence of lines, not a single string. It is
            not modified.

    Returns:
        One string made of all resulting lines joined together. Lines without
        ``=`` are copied unchanged; a ``key=value`` line becomes the key on
        its own line followed by the value on the next line, indented four
        spaces deeper than the key.
    """
    indentVal = "    "
    converted = []
    for eachLine in inputString:
        if "=" not in eachLine:
            converted.append(eachLine)
            continue
        # Split on the first "=" only, so the value is kept verbatim even if
        # it contains further "=" characters.
        keyPart, valuePart = eachLine.split("=", 1)
        # Reuse the key's own leading whitespace so the value always lands
        # exactly one indentation step below its key.
        leading = keyPart[:len(keyPart) - len(keyPart.lstrip())]
        converted.append(keyPart + "\n")
        converted.append(leading + indentVal + valuePart)
    return "".join(converted)

# helper class for node usage
class Node:
    """One line of the indented text together with the lines nested beneath it."""

    def __init__(self, indented_line):
        """Record the line's text and its indentation depth.

        Args:
            indented_line: A single line of text; the amount of leading
                whitespace becomes the node's level.
        """
        self.children = []
        # Number of leading whitespace characters stripped by lstrip().
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Attach nodes taken from the front of ``nodes`` as descendants.

        The list is consumed in place. Processing stops as soon as a node at
        or above this node's own level is found; that node is left at the
        front of the list for the caller.

        Args:
            nodes: List of ``Node`` objects in document order. An empty list
                is accepted and leaves this node unchanged.
        """
        if not nodes:
            # Nothing to attach; without this guard nodes[0] raises IndexError.
            return
        # The first node defines the indentation of this node's direct children.
        childlevel = nodes[0].level

        while nodes:
            node = nodes.pop(0)
            if node.level == childlevel: # add node as a child
                self.children.append(node)
            elif node.level > childlevel: # add nodes as grandchildren of the last child
                nodes.insert(0, node)
                self.children[-1].add_children(nodes)
            elif node.level <= self.level: # this node is a sibling, no more children
                nodes.insert(0, node)
                return
            # A node deeper than this node but shallower than its first child
            # matches no branch above and is dropped.

    def as_dict(self):
        """Return this subtree as nested Python data.

        Returns:
            The node's text if it has no children; ``{text: child}`` for a
            single child; ``{text: [child, ...]}`` for several children, where
            each child is converted recursively.
        """
        if len(self.children) > 1:
            return {self.text: [node.as_dict() for node in self.children]}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text

# process our file here
import json  # json.dumps is used below

# `filename` must already hold the path of the text file to convert.
with open(filename, 'r') as fh:
    # Only the read needs the open file; it is closed before parsing starts.
    fileContent = fh.readlines()
# convert equals signs to indentation
fileParse = convertIndentation(fileContent)
root = Node('root')
# Blank lines are skipped; every other line becomes a node.
root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
d = root.as_dict()['root']
# this variable is storing the json output
jsonOutput = json.dumps(d, indent = 4, sort_keys = False)
print(jsonOutput)

which yields the following:

[
    {
        "key1": "value1"
    },
    {
        "key2": [
            {
                "key2_1": "value2_1"
            },
            {
                "key2_2": {
                    "key2_2_1": "value2_2_1"
                }
            },
            {
                "key2_3": "value2_3_1,value2_3_2,value2_3_3"
            },
        ]
    },
    {
        "key3": "value3_1,value3_2,value3_3"
    }
]

Yet this is still not a valid JSON file.

When I try to open the output file using 'json' module, I get this predictable message: "JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)".

with open(r'C:\Users\nigel\OneDrive\Documents\LAB\lean\sample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
    data = json.load(read_file)  # fails here with the JSONDecodeError shown in the traceback below

output:

JSONDecodeError                           Traceback (most recent call last)
Input In [2], in <cell line: 1>()
      1 with open(r'C:\Users\nigel\OneDrive\Documents\LAB\lean\sample_01.02_R00.json', 'r', encoding='utf-8') as read_file:
----> 2     data = json.load(read_file)

File ~\Anaconda3\lib\json\__init__.py:293, in load(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    274 def load(fp, *, cls=None, object_hook=None, parse_float=None,
    275         parse_int=None, parse_constant=None, object_pairs_hook=None, **kw):
    276     """Deserialize ``fp`` (a ``.read()``-supporting file-like object containing
    277     a JSON document) to a Python object.
    278 
   (...)
    291     kwarg; otherwise ``JSONDecoder`` is used.
    292     """
--> 293     return loads(fp.read(),
    294         cls=cls, object_hook=object_hook,
    295         parse_float=parse_float, parse_int=parse_int,
    296         parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)

File ~\Anaconda3\lib\json\__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
    341     s = s.decode(detect_encoding(s), 'surrogatepass')
    343 if (cls is None and object_hook is None and
    344         parse_int is None and parse_float is None and
    345         parse_constant is None and object_pairs_hook is None and not kw):
--> 346     return _default_decoder.decode(s)
    347 if cls is None:
    348     cls = JSONDecoder

File ~\Anaconda3\lib\json\decoder.py:337, in JSONDecoder.decode(self, s, _w)
    332 def decode(self, s, _w=WHITESPACE.match):
    333     """Return the Python representation of ``s`` (a ``str`` instance
    334     containing a JSON document).
    335 
    336     """
--> 337     obj, end = self.raw_decode(s, idx=_w(s, 0).end())
    338     end = _w(s, end).end()
    339     if end != len(s):

File ~\Anaconda3\lib\json\decoder.py:353, in JSONDecoder.raw_decode(self, s, idx)
    344 """Decode a JSON document from ``s`` (a ``str`` beginning with
    345 a JSON document) and return a 2-tuple of the Python
    346 representation and the index in ``s`` where the document ended.
   (...)
    350 
    351 """
    352 try:
--> 353     obj, end = self.scan_once(s, idx)
    354 except StopIteration as err:
    355     raise JSONDecodeError("Expecting value", s, err.value) from None

JSONDecodeError: Expecting property name enclosed in double quotes: line 10 column 5 (char 165)

The reason is that JSON expects to find keys (strings enclosed in double quotes) when it actually finds json objects (nested dictionaries) in their places. That is it!

I truly appreciate any comments. Best,

Nigel

CodePudding user response:

Splitting The Strings Into Lists

I am assuming per your comment that you mean that you want to take your strings, for example, this line key2_3=value2_3_1,value2_3_2,value2_3_3 and break these values up into "key2_3": ["value2_3_1", "value2_3_2", "value2_3_3"].

To do so, you'd have to make the following adjustment to the code provided to you:

def as_dict(self):
    """Return this subtree as nested Python data.

    Returns:
        For a node without children, its text, split into a list when the
        text contains commas; ``{text: child}`` for a single child;
        ``{text: [child, ...]}`` for several children.
    """
    if len(self.children) > 1:
        return {self.text: [node.as_dict() for node in self.children]}
    elif len(self.children) == 1:
        return {self.text: self.children[0].as_dict()}
    else:
        # was self.text; split only when a comma is present, so a single
        # value such as "value1" stays a string instead of becoming ["value1"]
        return self.text.split(",") if "," in self.text else self.text

Dictionaries of Dictionaries Instead of Lists

To make the output a dictionary of dictionaries whose leaf values may be lists, e.g. {k1: {k2: [1, 2, 3]}}, we have to make two changes.

  1. Update the as_dict method to use {} instead of [].
  2. Include a function to compress keys.

When I was doing this, I had a hard time producing the correct data structure: the output looked like {k1: {k1: {k2: {k2: value}}}}, with every key repeated. You can see this for yourself by replacing d = compress(root.as_dict()['root']) with d = root.as_dict()['root'] in the code, i.e. by skipping the compress step. So the code went from

def as_dict(self):
    """Return this subtree as nested Python data (children collected in a list).

    Returns:
        For a node without children, its text, split on commas when it
        contains any; ``{text: child}`` for a single child;
        ``{text: [child, ...]}`` for several children.
    """
    child_count = len(self.children)
    if child_count == 0:
        # Leaf: a comma-separated value becomes a list, anything else stays a string.
        if "," in self.text:
            return self.text.split(",")
        return self.text
    if child_count == 1:
        return {self.text: self.children[0].as_dict()}
    # Several children: a list holding each child's own dict.
    return {self.text: [child.as_dict() for child in self.children]}

to

def as_dict(self):
    """Return this subtree as nested Python data (children collected in a dict).

    Returns:
        For a node without children, its text, split on commas when it
        contains any; ``{text: child}`` for a single child;
        ``{text: {child_text: child, ...}}`` for several children.
    """
    child_count = len(self.children)
    if child_count == 0:
        # Leaf: a comma-separated value becomes a list, anything else stays a string.
        if "," in self.text:
            return self.text.split(",")
        return self.text
    if child_count == 1:
        return {self.text: self.children[0].as_dict()}
    # Several children: a dict keyed by each child's text instead of a list.
    return {self.text: {child.text: child.as_dict() for child in self.children}}

, then I included the compress function

# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{k: {k: value}}`` levels into ``{k: value}``, in place.

    Args:
        dictionary: The nested structure to tidy up. Anything that is not a
            dict is returned untouched.

    Returns:
        The same object that was passed in.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    for key in dictionary:
        inner = dictionary[key]
        if not isinstance(inner, dict):
            continue
        if key in inner:
            # The key is repeated one level down: lift that entry up.
            inner = inner.pop(key)
            dictionary[key] = inner
        # Keep going below, whether or not a level was just removed.
        compress(inner)
    return dictionary

Full Code

If you put the code below in a file and run it from the command line, it should work as is. Otherwise it's probably a problem with Anaconda or the Python version (though that doesn't really seem likely).

from io import StringIO
import json

# for merging like sub keys and values
def compress(dictionary):
    """Collapse ``{k: {k: value}}`` levels into ``{k: value}``, in place.

    Args:
        dictionary: The nested structure to tidy up. Anything that is not a
            dict is returned untouched.

    Returns:
        The same object that was passed in.
    """
    if not isinstance(dictionary, dict):
        return dictionary
    for key in dictionary:
        inner = dictionary[key]
        if not isinstance(inner, dict):
            continue
        if key in inner:
            # The key is repeated one level down: lift that entry up.
            inner = inner.pop(key)
            dictionary[key] = inner
        # Keep going below, whether or not a level was just removed.
        compress(inner)
    return dictionary

# helper method to convert equals sign to indentation for easier parsing
def convertIndentation(inputString):
    """Rewrite every ``key=value`` line as a key line plus an indented value line.

    Args:
        inputString: The lines of the input text (for example the result of
            ``readlines()``), each normally ending with a newline. Despite
            the name, this is a sequence of lines, not a single string. It is
            not modified.

    Returns:
        One string made of all resulting lines joined together. Lines without
        ``=`` are copied unchanged; a ``key=value`` line becomes the key on
        its own line followed by the value on the next line, indented four
        spaces deeper than the key.
    """
    indentVal = "    "
    converted = []
    for eachLine in inputString:
        if "=" not in eachLine:
            converted.append(eachLine)
            continue
        # Split on the first "=" only, so the value is kept verbatim even if
        # it contains further "=" characters.
        keyPart, valuePart = eachLine.split("=", 1)
        # Reuse the key's own leading whitespace so the value always lands
        # exactly one indentation step below its key.
        leading = keyPart[:len(keyPart) - len(keyPart.lstrip())]
        converted.append(keyPart + "\n")
        converted.append(leading + indentVal + valuePart)
    return "".join(converted)



# helper class for node usage
class Node:
    """One line of the indented text together with the lines nested beneath it."""

    def __init__(self, indented_line):
        """Record the line's text and its indentation depth.

        Args:
            indented_line: A single line of text; the amount of leading
                whitespace becomes the node's level.
        """
        self.children = []
        # Number of leading whitespace characters stripped by lstrip().
        self.level = len(indented_line) - len(indented_line.lstrip())
        self.text = indented_line.strip()

    def add_children(self, nodes):
        """Attach nodes taken from the front of ``nodes`` as descendants.

        The list is consumed in place. Processing stops as soon as a node at
        or above this node's own level is found; that node is left at the
        front of the list for the caller.

        Args:
            nodes: List of ``Node`` objects in document order. An empty list
                is accepted and leaves this node unchanged.
        """
        if not nodes:
            # Nothing to attach; without this guard nodes[0] raises IndexError.
            return
        # The first node defines the indentation of this node's direct children.
        childlevel = nodes[0].level
        while nodes:
            node = nodes.pop(0)
            if node.level == childlevel: # add node as a child
                self.children.append(node)
            elif node.level > childlevel: # add nodes as grandchildren of the last child
                nodes.insert(0, node)
                self.children[-1].add_children(nodes)
            elif node.level <= self.level: # this node is a sibling, no more children
                nodes.insert(0, node)
                return
            # A node deeper than this node but shallower than its first child
            # matches no branch above and is dropped.

    def as_dict(self):
        """Return this subtree as nested Python data.

        Returns:
            For a node without children, its text, split on commas when it
            contains any; ``{text: child}`` for a single child;
            ``{text: {child_text: child, ...}}`` for several children. In the
            last form each child's own dict repeats the child's text as its
            key.
        """
        if len(self.children) > 1:
            return {self.text: {node.text: node.as_dict() for node in self.children}}
        elif len(self.children) == 1:
            return {self.text: self.children[0].as_dict()}
        else:
            return self.text.split(",") if "," in self.text else self.text

# Demo: convert a sample text held in memory and check that the result loads as JSON.
if __name__ == "__main__":

    # Sample input; every line carries the indentation of this literal as well.
    s = """
        key1=value1
        key2
            key2_1=value2_1
            key2_2
                key2_2_1
                    key2_2_1_1=value2_2_1_1
            key2_3=value2_3_1,value2_3_2,value2_3_3
        key3=value3_1,value3_2,value3_3
    """

    # StringIO stands in for an opened text file.
    fh = StringIO(s)
    fileContent = fh.readlines()
    # convert equals signs to indentation
    fileParse = convertIndentation(fileContent)
    root = Node('root')
    # Blank lines are skipped; every other line becomes a node.
    root.add_children([Node(line) for line in fileParse.splitlines() if line.strip()])
    # compress() removes the repeated key levels that as_dict() produces.
    d = compress(root.as_dict()['root'])
    # this variable is storing the json output
    jsonOutput = json.dumps(d, indent=4, sort_keys=False)
    f = StringIO(jsonOutput)

    # load the "file"; json.load raises if the text is not valid JSON
    loaded = json.load(f)

    # Show the input text, the JSON text, and the object read back from it.
    print(s)
    print(jsonOutput)
    print(loaded)
  • Related