Home > Software design >  How to iterate through nested dictionaries and extract substructures and filter out certain keys?
How to iterate through nested dictionaries and extract substructures and filter out certain keys?

Time:10-21

I have nested dictionaries, which are representations of XML that I have parsed using xmltodict.

Now I want to make it possible to extract certain sub-structures and to remove every key that contains '@'.

{'rpc': {'@xmlns': 'urn:1.0',
  '@message-id': '4',
  'edit-config': {'target': {'running': None},
   'config': {'test': {'@xmlns': 'urn:2.0',
     'common': {'abc': 'forward',
      'bbc': {'geo-model': 'texas',
       'remote': [{'id': '288',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '318',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '348',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'}]}}}}}}}

I have defined a function where the user should be able to provide an input to choose which tag they want to extract the substructure from, while also removing any key with '@' in it.

def process_xml_dict(d, clean_d, start_after_tag=None, reached_tag=False):
    """Copy the non-dict values found anywhere in *d* into *clean_d*.

    Values are stored directly under their own key in *clean_d*, so the
    nesting of *d* is not reproduced and a later duplicate key overwrites
    an earlier one. Keys containing '@' are never copied. Nothing is
    returned; *clean_d* is filled in place.

    When *start_after_tag* is given, a value is copied only while
    *reached_tag* is true. The flag is switched on when a dict-valued key
    equal to *start_after_tag* is met, and it stays on for the remaining
    keys of that same dict and for everything below them.
    """
    tag_filter_off = start_after_tag == None
    for key, value in d.items():
        if not isinstance(value, dict):
            # Leaf value: keep it unless it is an '@' key or we are still
            # in front of the requested tag.
            if '@' not in key and (tag_filter_off or reached_tag):
                clean_d[key] = value
            continue
        if tag_filter_off:
            process_xml_dict(value, clean_d)
            continue
        if key == start_after_tag:
            reached_tag = True
        process_xml_dict(value, clean_d, start_after_tag, reached_tag)

But it does not work as expected. Running it without a start tag:

   # No start tag: process_xml_dict copies every non-dict value whose key
   # has no '@' into clean_d, directly under its own key.
   clean_d = dict()
   process_xml_dict(d,clean_d)
   print(clean_d)

Should output

{'rpc': {
  'edit-config': {'target': {'running': None},
   'config': {'test': {
     'common': {'abc': 'forward',
      'bbc': {'geo-model': 'texas',
       'remote': [{'id': '288',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '318',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '348',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'}]}}}}}}}

But now it outputs

{
 'running': None,
 'abc': 'forward',
 'geo-model': 'texas',
 'remote': [{'id': '288',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'},
  {'distributed-system-id': '318',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'},
  {'distributed-system-id': '348',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'}]}

And if I input

# With a start tag: values are copied into clean_d only once a dict-valued
# key equal to 'config' has been reached.
clean_d = dict()
process_xml_dict(d,clean_d,start_after_tag='config')
print(clean_d)

It should output

{'test': {'common': {'abc': 'forward',
    'bbc': {'geo-model': 'texas',
    'remote': [{'id': '288',
      'transport': 'tcp',
      'port': '8',
      'ipv4-address': '0.0.0.0'},
     {'distributed-system-id': '318',
      'transport': 'tcp',
      'port': '8',
      'ipv4-address': '0.0.0.0'},
     {'distributed-system-id': '348',
      'transport': 'tcp',
      'port': '8',
      'ipv4-address': '0.0.0.0'}]}}}}

but now it outputs

{'abc': 'forward',
 'geo-model': 'texas',
 'remote': [{'id': '288',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'},
  {'distributed-system-id': '318',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'},
  {'distributed-system-id': '348',
   'transport': 'tcp',
   'port': '8',
   'ipv4-address': '0.0.0.0'}]}

What am I doing wrong? And how should I modify my function so that it produces the expected output?

Thankful for any input.

CodePudding user response:


def clean(dictionary):
    """Return a new dict holding the items of *dictionary* whose key has no '@'.

    Only the top level is inspected; values are carried over unchanged and
    *dictionary* itself is not modified.
    """
    kept = {}
    for name, item in dictionary.items():
        if '@' in name:
            continue
        kept[name] = item
    return kept

def clean_recursively(dictionary):
    """Return a copy of *dictionary* with '@' keys removed at every level.

    Each dict is first passed through ``clean`` and its remaining values
    are then processed in turn. Lists are walked as well, so dicts stored
    inside a list (how repeated XML elements are usually represented) get
    their '@' keys removed too. Any other value is kept as it is.
    """
    def _clean_value(value):
        # Dicts are cleaned recursively, lists element by element.
        if isinstance(value, dict):
            return clean_recursively(value)
        if isinstance(value, list):
            return [_clean_value(item) for item in value]
        return value

    return {key: _clean_value(value)
            for key, value in clean(dictionary).items()}

Does this achieve the result? :) You can write it without a dictionary comprehension; I just think a dict comprehension looks fine in this particular case because it avoids deep nesting and looks 'clearer' to me.

CodePudding user response:

This will work for you

orig_dict = {'rpc': {'@xmlns': 'urn:1.0',
  '@message-id': '4',
  'edit-config': {'target': {'running': None},
   'config': {'test': {'@xmlns': 'urn:2.0',
     'common': {'abc': 'forward',
      'bbc': {'geo-model': 'texas',
       'remote': [{'id': '288',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '318',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'},
        {'distributed-system-id': '348',
         'transport': 'tcp',
         'port': '8',
         'ipv4-address': '0.0.0.0'}]}}}}}}}

def get_filter_dict(dict_):
    """Return a new nested dict built from *dict_* without its "@" keys.

    A key is dropped when it is a string starting with "@", whatever its
    value is. Nested dicts are filtered the same way, and lists are walked
    so that dicts stored inside them are filtered as well. All other
    values are kept unchanged. *dict_* is not modified.

    Args:
        dict_: The dict to filter.

    Returns:
        The filtered dict, or an empty dict when *dict_* is not a dict.
    """
    def _strip(value):
        if isinstance(value, dict):
            return {
                k: _strip(v)
                for k, v in value.items()
                # Non-string keys cannot be "@" keys, so they are kept.
                if not (isinstance(k, str) and k.startswith("@"))
            }
        if isinstance(value, list):
            return [_strip(item) for item in value]
        return value

    # The top-level contract is "dict in, dict out".
    return _strip(dict_) if isinstance(dict_, dict) else {}

def get_dict_start_after_tag(dict_, res=None, start_after_tag=None):
    """Filter *dict_* with ``get_filter_dict``, optionally starting below a tag.

    The filtered result is always written to ``res[0]``.

    Args:
        dict_: Nested dict to process.
        res: One-element list used as the output slot. When omitted, a
            fresh ``[None]`` is created for this call; a list literal as
            the default would be one object shared by every call.
        start_after_tag: Key to look for. When it is falsy, the whole of
            *dict_* is filtered. Otherwise the value stored under the
            matching key is filtered.

    Returns:
        Without *start_after_tag*: the *res* list. With *start_after_tag*:
        the filtered value that was stored in ``res[0]``, or ``None`` when
        the key was not found (``res[0]`` is then left untouched).
    """
    if res is None:
        res = [None]
    if not start_after_tag:
        res[0] = get_filter_dict(dict_)
        return res

    found = None
    for k, v in dict_.items():
        if k == start_after_tag:
            res[0] = get_filter_dict(v)
            return res[0]
        if isinstance(v, dict):
            # Hand a nested match back to the caller instead of dropping
            # it. The loop keeps going, so a later match overwrites
            # res[0] and becomes the return value.
            nested = get_dict_start_after_tag(v, res, start_after_tag)
            if nested is not None:
                found = nested
    return found

# res is a one-element list used as the output slot: each call below stores
# its result in res[0], and the return value of the call is not used.
res = [None]
get_dict_start_after_tag(orig_dict, res)
# res[0] is now:
# {'rpc': {'edit-config': {'target': {'running': None}, 'config': {'test': {'common': {'abc': 'forward', 'bbc': {'geo-model': 'texas', 'remote': [{'id': '288', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}, {'distributed-system-id': '318', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}, {'distributed-system-id': '348', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}]}}}}}}}
get_dict_start_after_tag(orig_dict, res, "config")
# res[0] is now:
# {'test': {'common': {'abc': 'forward', 'bbc': {'geo-model': 'texas', 'remote': [{'id': '288', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}, {'distributed-system-id': '318', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}, {'distributed-system-id': '348', 'transport': 'tcp', 'port': '8', 'ipv4-address': '0.0.0.0'}]}}}}

# Prints the result of the second call (the sub-structure below "config").
print(res[0])

        
  • Related