I have the following XML data as a string, which I parse into an
element tree using Python's lxml package.
UPDATE: I HAVE UPDATED THE CODE AND THE OUTPUT
Now, I have to traverse through this XML data:
<A xmlns="dfjdlfkdjflsd">
<B>
<B1>B_1</B1>
<B2>B_2</B2>
<B3>
<B31>B3_1</B31>
<B32>B3_2</B32>
<B33>
<B331>
<B3311></B3311>
</B331>
<B332>
<B3321></B3321>
</B332>
</B33>
<B34>
<B341>
<B3411></B3411>
</B341>
<B342>
<B3421></B3421>
</B342>
</B34>
<B35>
<B351>B35_1</B351>
<B352>
<B3521>B352_1</B3521>
<B3522>B352_2</B3522>
<B3523>B352_3</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
<B352>
<B3521>B352_4</B3521>
<B3522>B352_5</B3522>
<B3523>B352_6</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
<B352>
<B3521>B352_7</B3521>
<B3522>B352_8</B3522>
<B3523>B352_9</B3523>
<B3524>
<B35241>
<B352411></B352411>
<B352412></B352412>
<B352413></B352413>
</B35241>
</B3524>
</B352>
</B35>
<B36>
<B361>B36_1</B361>
<B362>B36_2</B362>
</B36>
</B3>
</B>
<C>
<C1>B_1</C1>
<C2>B_2</C2>
<C3>
<C31>C3_1</C31>
<C32>C3_2</C32>
<C33>
<C331>
<C3311></C3311>
</C331>
<C332>
<C3321></C3321>
</C332>
</C33>
</C3>
</C>
</A>
and generate an output in a specific format which will be something like this:
[{'B1': 'B_1',
'B2': 'B_2',
'B3_B31': 'B3_1',
'B3_B32': 'B3_2',
'B3_B33_B331_B3311': '-',
'B3_B33_B332_B3321': '-',
'B3_B34_B341_B3411': '-',
'B3_B34_B342_B3421': '-',
'B3_B35_B352': [
{
'B3_B35_B352_B3521': 'B352_1',
'B3_B35_B352_B3522': 'B352_2',
'B3_B35_B352_B3523': 'B352_3',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
},
{
'B3_B35_B352_B3521': 'B352_4',
'B3_B35_B352_B3522': 'B352_5',
'B3_B35_B352_B3523': 'B352_6',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
},
{
'B3_B35_B352_B3521': 'B352_7',
'B3_B35_B352_B3522': 'B352_8',
'B3_B35_B352_B3523': 'B352_9',
'B3_B35_B352_B3524_B35241_B352411': '-',
'B3_B35_B352_B3524_B35241_B352412': '-',
'B3_B35_B352_B3524_B35241_B352413': '-'
}
],
'B3_B36_B361': 'B36_1',
'B3_B36_B362': 'B36_2'},
{'C1': 'B_1',
'C2': 'B_2',
'C3_C31': 'C3_1',
'C3_C32': 'C3_2',
'C3_C33_C331_C3311': '-',
'C3_C33_C332_C3321': '-'}]
This problem is a follow-up to my earlier question, "Trying to traverse through nested xml tags but recursive function does not traverse in full depth", where I was able to traverse the nested XML tags and generate the output.
However, the one thing I am still having trouble with is handling the scenarios where duplicate XML tags are present.
With the current code I have, I am getting this output.
[{'B1': 'B_1',
'B2': 'B_2',
'B3_B31': 'B3_1',
'B3_B32': 'B3_2',
'B3_B33_B331_B3311': '-',
'B3_B33_B332_B3321': '-',
'B3_B34_B341_B3411': '-',
'B3_B34_B342_B3421': '-',
'B3_B35_B351': 'B35_1',
'B3_B35_B352_B3521_B35241': '1',
'B3_B35_B352_B3521_B35242': '2',
'B3_B35_B352_B3521_B35243': '3',
'B3_B35_B353_B3531': 'B353_1',
'B3_B36_B361': 'B36_1',
'B3_B36_B362': 'B36_2',
'duplicate': [{'B3_B35_B352_B3521_B35241': '4',
'B3_B35_B352_B3521_B35242': '5',
'B3_B35_B352_B3521_B35243': '6'},
{'B3_B35_B352_B3521_B35241': '7',
'B3_B35_B352_B3521_B35242': '8',
'B3_B35_B352_B3521_B35243': '9',
'B3_B35_B353_B3532_B35321': 'B3532_3',
'B3_B35_B353_B3532_B35322': 'B3532_4'},
{'B3_B35_B353_B3532_B35321': 'B3532_5',
'B3_B35_B353_B3532_B35322': 'B3532_6'},
{'B3_B35_B353_B3532_B35321': 'B3532_1',
'B3_B35_B353_B3532_B35322': 'B3532_2'}]},
{'C1': 'B_1',
'C2': 'B_2',
'C3_C31': 'C3_1',
'C3_C32': 'C3_2',
'C3_C33_C331_C3311': '-',
'C3_C33_C332_C3321': '-'}]
Now, if you compare the expected output and the actual output, you'll realize that the key names are different where duplicate XML tags are present. Also in the duplicate list, XML tags are mixed up with each other.
I have to use _handle_duplicates
method separately to handle the duplicate XML tags.
This is the code I am using right now
class ParseXML:
def __init__(self, xml_input):
self.main_output = []
parser = et.XMLParser(recover=True)
self.tree = et.fromstring(re.sub('\s*xmlns(:\w )?="[^"]*"', '', xml_input), parser=parser)
def parse_xml(self):
for interface in list(self.tree):
temp_output = {}
for children in interface:
temp_list = []
temp_dict = {}
for key, value in self._flatten(children):
if key in temp_output:
if key in temp_dict:
temp_list.append(temp_dict)
temp_dict = {}
temp_dict.update({key: value})
else:
temp_output.update({key: value})
temp = self._handle_duplicates(temp_output, temp_dict, temp_list) if temp_dict else temp_output
self.main_output.append(temp)
return self.main_output
def _flatten(self, node, tags=None):
if tags is None:
tags = []
children = list(node)
if not children:
if node.text is None:
yield '_'.join(tags [node.tag]), '-'
else:
yield '_'.join(tags [node.tag]), node.text
else:
for child in children:
for key_val in self._flatten(child, tags [node.tag]):
yield key_val
def _handle_duplicates(self, temp_output, temp_dict, temp_list):
temp_list.append(temp_dict)
temp = {}
for dup in temp_dict:
temp.update({dup: temp_output.pop(dup)})
temp_list.append(temp)
temp_output.update({'duplicate': temp_list})
return temp_output
if __name__ == '__main__':
    # `data` must hold the XML string; it is not defined in this snippet.
    parse = ParseXML(data)
    output = parse.parse_xml()
    pprint(output)
The current code is able to handle the duplicate XML tags, but not in the format I want. Also, it would be better to handle these duplicate XML tags in the _flatten
method, rather than having a separate method for them.
Can anyone look into this and give me some guidance on handling the duplicate XML tags?
CodePudding user response:
You can use collections.defaultdict
with recursion:
import xml.etree.ElementTree as ET, re, json
from collections import defaultdict
# Strip the default namespace declaration so element tags are plain names.
# `s_xml` must hold the XML string; it is not defined in this snippet.
t = ET.fromstring(re.sub(r'\sxmlns="[^"]*"', '', s_xml))
def get_groups(d, p=None):
    """Yield ``[key, value]`` pairs that flatten the element *d*.

    A leaf element yields one pair: the underscore-joined tag path (with a
    leading all-uppercase root tag and its underscore removed) and the
    element text, or ``'-'`` when the element has no text.

    For an element with children, each child is flattened recursively and
    children with identical flattened output are counted. Output seen once
    is yielded as is. Output seen ``n > 1`` times is yielded as a single pair
    whose key is the path of *d* and whose value is a list of ``n`` equal
    dictionaries.

    :param d: element to flatten
    :param p: tag names of the ancestors of *d* (``None`` for the start node)
    """
    path = [*(p or []), d.tag]
    key = re.sub(r'^[A-Z]+_', '', '_'.join(path))
    if not (c := list(d)):
        yield [key, '-' if d.text is None else d.text]
        return
    # JSON text serves as a hashable fingerprint of a child's flattened
    # output, so identical children fall into the same counter slot.
    m = defaultdict(int)
    for i in c:
        m[json.dumps([*get_groups(i, path)])] += 1
    for a, b in m.items():
        if b == 1:
            yield from json.loads(a)
        else:
            yield [key, [dict(json.loads(a)) for _ in range(b)]]
# Build one flattened dictionary per direct child of the parsed root.
r = [dict(get_groups(section)) for section in t]
Output:
[{'B1': 'B_1', 'B2': 'B_2', 'B3_B31': 'B3_1', 'B3_B32': 'B3_2', 'B3_B33_B331_B3311': '-', 'B3_B33_B332_B3321': '-', 'B3_B34_B341_B3411': '-', 'B3_B34_B342_B3421': '-', 'B3_B35_B351': 'B35_1', 'B3_B35': [{'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}, {'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}, {'B3_B35_B352_B3521': 'B352_1', 'B3_B35_B352_B3522': 'B352_2', 'B3_B35_B352_B3523': 'B352_3', 'B3_B35_B352_B3524_B35241_B352411': '-', 'B3_B35_B352_B3524_B35241_B352412': '-', 'B3_B35_B352_B3524_B35241_B352413': '-'}], 'B3_B36_B361': 'B36_1', 'B3_B36_B362': 'B36_2'}, {'C1': 'B_1', 'C2': 'B_2', 'C3_C31': 'C3_1', 'C3_C32': 'C3_2', 'C3_C33_C331_C3311': '-', 'C3_C33_C332_C3321': '-'}]
CodePudding user response:
Somehow, I managed to achieve what I needed. But, I am not proud of this code. I'll be more than happy if someone provides me a better, more pythonic code than this.
class ParseXML:
    """
    Parsing of an XML section from a string to a flattened dictionary

    :param xml_input: XML section from a string
    """

    def __init__(self, xml_input):
        # Parse the XML string after removing the default `xmlns` declaration.
        # `[^"]*` stops at the closing quote; a greedy `\S*` could run past it
        # when later text on the same whitespace-free run contains a quote.
        self.main_output = []
        self.tree = et.fromstring(re.sub(r'\s*xmlns="[^"]*"', '', xml_input))

    def parse_xml(self):
        """
        Parsing each XML section and returning the output

        :return: list holding one flattened dictionary per top-level section
        """
        # Loop through each top-level section
        for interface in self.tree:
            temp_output = {}
            # Loop through all the child elements of the section
            for children in interface:
                temp_dup = {}
                dup_keys = []
                dup_child = []
                for key, value in self._flatten(children):
                    if key in temp_output:
                        # The key was already stored. If it is also already
                        # part of the current duplicate group, the next
                        # repetition starts: close the current group.
                        if key in dup_keys:
                            dup_child.append(temp_dup)
                            temp_dup = {}
                            dup_keys = []
                        dup_keys.append(key)
                        temp_dup[key] = value
                        continue
                    # A new key ends a pending run of duplicates
                    if temp_dup:
                        temp_output = self._merge_duplicates(
                            dup_keys, temp_output, temp_dup, dup_child)
                        temp_dup = {}
                        dup_keys = []
                        dup_child = []
                    temp_output[key] = value
                # The duplicate run was the last thing in this child element
                if temp_dup:
                    temp_output = self._merge_duplicates(
                        dup_keys, temp_output, temp_dup, dup_child)
            self.main_output.append(temp_output)
        return self.main_output

    @staticmethod
    def common_substr(data):
        """
        Getting common xml tag path where duplicate XML tags are present

        The keys are compared tag by tag (split on ``'_'``), and comparison
        stops at the first position where they differ, so the result never
        ends in the middle of a tag name.

        :param data: flattened keys of the duplicate XML tags
        :return: underscore-joined common leading tags, ``''`` if there are none
        """
        common = []
        for parts in zip(*(key.split('_') for key in data)):
            if any(part != parts[0] for part in parts[1:]):
                break
            common.append(parts[0])
        return '_'.join(common)

    def _flatten(self, node, tags=None):
        """
        Generating flattened key/value pairs from a nested XML element

        :param node: XML element to flatten
        :param tags: tag names of the ancestors already visited
        :return: generator of ``(key, value)`` tuples, one per leaf element;
                 a leaf without text gets the value ``'-'``
        """
        path = (tags or []) + [node.tag]
        children = list(node)
        if not children:
            yield '_'.join(path), '-' if node.text is None else node.text
            return
        # Recurse into every child with the extended tag path
        for child in children:
            yield from self._flatten(child, path)

    def _merge_duplicates(self, dup_keys, temp_output, temp_dup, dup_child):
        """
        Merge duplicate values into a list stored under their common key

        :param dup_keys: List containing the duplicate keys
        :param temp_output: Dictionary holding the flattened keys and values
        :param temp_dup: Dictionary of the latest duplicate keys and values
        :param dup_child: List holding earlier dictionaries of duplicate keys and values
        :return: temp_output with the duplicates replaced by a list of
                 dictionaries in document order
        """
        common_key = self.common_substr(dup_keys)
        # Pop the first occurrence BEFORE assigning the list: when the common
        # key equals a duplicated key, assigning first would overwrite the
        # original value and make the later access fail.
        first_occurrence = {key: temp_output.pop(key) for key in dup_keys}
        temp_output[common_key] = [first_occurrence, *dup_child, temp_dup]
        return temp_output
if __name__ == '__main__':
    # `DATA` must hold the XML string; it is not defined in this snippet.
    parse = ParseXML(DATA)
    result = parse.parse_xml()
    pprint(result)
P.S. The _flatten
method was provided by @Ajax1234.