Dynamically/Recursively Normalize Data while Iterating through a Python Dictionary-CodePudding

I have a dictionary that looks like:

# Sample nested document; string leaves sit at varying depths.
my_dict = {
    "_id": "12powensjdm683ma23",
    "data": {
        "account": "FUNDING",
        "form": {
            "accounts": {
                "credit": {
                    "name": "JOBEN BETETI BORGES",
                    "account_number": "YYYYYYYY",
                    "address": {
                        "line3": "0.",
                        "line2": "rial, Santo André, SP, BR,09080-50",
                        "line1": "1600 APTO.51 TORRE 3 Avenida Indust",
                    },
                },
                "receiving": {
                    "name": "THE BANK OF NEW YORK MELLON",
                    "aba_number": "XXXXXXXX",
                    "address": {
                        "line1": "NEW YORK, NY, US",
                        "line2": "",
                        "line3": "",
                    },
                },
            },
        },
    },
}

I am trying to create a function that will loop through each key value pair and determine if the value has "offending" values like the é in the municipality Santo André.

'line2': 'rial, Santo André, SP, BR,09080-50',

If it does, then we will normalize the offending character to a plain e, so the value would look like this:

'line2': 'rial, Santo Andre, SP, BR,09080-50',

In addition to this, I would like this to be dynamic, so that the code doesn't have to explicitly look for my_dict['data']['form']['accounts']['credit'] or my_dict['line2']. It should just loop through each key value pair, and if that value is "offending", then update it (I have other dictionaries that I need to update in a similar way, but their keys, lengths and depths are varying).

I think I really just need a way to loop through every level of a dictionary that can have an arbitrary number of nested levels.

I initially thought of recursively running every value through the dynamic_input() function I already have, but dynamic_input() currently only accepts a string value, whereas during the recursion a value might itself be another dict.

def dynamic_input(input):
    ''' Clean the given dynamic input of any offending formats or characters.

        Whitespace runs are collapsed to single spaces, single and double
        quotes are removed, and non-ASCII characters are stripped. Characters
        with a canonical decomposition (e.g. 'é') keep their ASCII base
        letter; any other non-ASCII character is dropped entirely.

        Parameters: str(input) -- must be a str; other types are not handled
        Returns: str(cleaned_input) -- no leading, trailing or doubled spaces
    '''
    # remove excessive spaces; doing this first lets split() turn Unicode
    # whitespace (e.g. a non-breaking space) into a plain space before the
    # non-ASCII strip below could delete it and glue two words together
    input = " ".join(input.split())

    # remove undesired characters
    input = re.sub("\\n|\\r|\\t|'|\"", '', input)

    # special character conversion: NFD splits 'é' into 'e' plus a combining
    # accent, and the accent is then removed as a non-ASCII character
    input = unicodedata.normalize('NFD', input)
    input = re.sub(r'[^\x00-\x7f]', '', input)

    # the removals above can leave doubled or leading/trailing spaces
    # ("a ' b" would become "a  b"), so collapse whitespace once more
    return " ".join(input.split())

CodePudding user response：

Since I cannot comment yet, I will post the answer I've found here:

# Same sample document as in the question, indented so that each nesting
# level is visible.
my_dict = {
    "_id": "12powensjdm683ma23",
    "data": {
        "account": "FUNDING",
        "form": {
            "accounts": {
                "credit": {
                    "name": "JOBEN BETETI BORGES",
                    "account_number": "YYYYYYYY",
                    "address": {
                        "line3": "0.",
                        "line2": "rial, Santo André, SP, BR,09080-50",
                        "line1": "1600 APTO.51 TORRE 3 Avenida Indust",
                    },
                },
                "receiving": {
                    "name": "THE BANK OF NEW YORK MELLON",
                    "aba_number": "XXXXXXXX",
                    "address": {
                        "line1": "NEW YORK, NY, US",
                        "line2": "",
                        "line3": "",
                    },
                },
            },
        },
    },
}


def nested_dict_pairs_iterator(dict_obj):
    ''' Lazily walk a nested dictionary and yield one tuple per leaf value.

        Each yielded tuple holds the keys leading to the leaf, outermost
        first, followed by the leaf value itself. Any value that is a dict
        is descended into; every other value counts as a leaf. An empty
        nested dict therefore yields nothing.
    '''
    for name, entry in dict_obj.items():
        if not isinstance(entry, dict):
            # Leaf: emit the key together with its value
            yield (name, entry)
            continue
        # Nested dict: prefix every path found below with the current key
        yield from (
            (name, *tail) for tail in nested_dict_pairs_iterator(entry)
        )

# Print one tuple per leaf value of my_dict: the keys leading to the leaf, followed by the leaf value
for pair in nested_dict_pairs_iterator(my_dict):
    print(pair)

output:

('_id', '12powensjdm683ma23')
('data', 'account', 'FUNDING')
('data', 'form', 'accounts', 'credit', 'name', 'JOBEN BETETI BORGES')
('data', 'form', 'accounts', 'credit', 'account_number', 'YYYYYYYY')
('data', 'form', 'accounts', 'credit', 'address', 'line3', '0.')
('data', 'form', 'accounts', 'credit', 'address', 'line2', 'rial, Santo André, SP, BR,09080-50')
('data', 'form', 'accounts', 'credit', 'address', 'line1', '1600 APTO.51 TORRE 3 Avenida Indust')
('data', 'form', 'accounts', 'receiving', 'name', 'THE BANK OF NEW YORK MELLON')
('data', 'form', 'accounts', 'receiving', 'aba_number', 'XXXXXXXX')
('data', 'form', 'accounts', 'receiving', 'address', 'line1', 'NEW YORK, NY, US')
('data', 'form', 'accounts', 'receiving', 'address', 'line2', '')
('data', 'form', 'accounts', 'receiving', 'address', 'line3', '')

You can implement the replacement of your unaccepted characters inside this function.

Hope this helps!

CodePudding user response：

You can define a simple function that recursively applies a given function to every value of a dict. Here's how I'd define one and how you could use it:

from typing import Callable
import collections.abc

def dict_rec_map(d: dict, func: Callable[[str], str]):
    """Apply ``func`` to every non-mapping value of ``d``, at any depth.

    Values that are mappings are processed recursively; every other value
    is replaced by ``func(value)``. The update happens in place: ``d`` (and
    each nested mapping) is modified and the same ``d`` object is returned.
    """
    for key, current in d.items():
        is_nested = isinstance(current, collections.abc.Mapping)
        # Only existing keys are reassigned, so the dict keeps its size
        # while it is being iterated.
        d[key] = dict_rec_map(current, func) if is_nested else func(current)
    return d

# dict_rec_map modifies my_dict in place and returns that same dict, so `updated` and `my_dict` refer to one object
updated = dict_rec_map(my_dict, dynamic_input)

Using your data and function, this returned to me:

{
    "_id": "12powensjdm683ma23",
    "data": {
        "account": "FUNDING",
        "form": {
            "accounts": {
                "credit": {
                    "account_number": "YYYYYYYY",
                    "address": {
                        "line1": "1600 APTO.51 TORRE 3 Avenida Indust",
                        "line2": "rial, Santo Andre, SP, BR,09080-50",
                        "line3": "0."
                    },
                    "name": "JOBEN BETETI BORGES"
                },
                "receiving": {
                    "aba_number": "XXXXXXXX",
                    "address": {
                        "line1": "NEW YORK, NY, US",
                        "line2": "",
                        "line3": ""
                    },
                    "name": "THE BANK OF NEW YORK MELLON"
                }
            }
        }
    }
}

Hope that helps.