Flatten nested dictionaries while checking content

Question

I have a dictionary like this:

# Example input, as loaded from the JSON file. Top-level keys are section
# names. Each section holds the fields 'range', 'template' and 'nomenclature';
# any other key is a nested subsection with the same layout.
source = {

    'Section 1' : {
        'range'       : [0, 200],
        'template'    : 'ID-LOA-XXX',
        'nomenclature': True
    },

    'Section 2' : {
        'range'       : [201, 800],
        'template'    : 'ID-EPI-XXX',
        'nomenclature': False,
        # Subsections may themselves contain further subsections.
        'Subsection 1' : {
            'range'       : [0, 400],
            'template'    : 'ID-EPI-S1-XXX',
            'nomenclature': False,
            'Subsubsection 1' : {
                'range'       : [0, 400],
                'template'    : 'ID-EPI-S12-XXX',
                'nomenclature': False
            }
        },
        'Subsection 2' : {
            'range'       : [0, 400],
            'template'    : 'ID-EPI-S2-XXX',
            'nomenclature': False
        }
    }, 

    # etc.

}

which is loaded from a JSON file. I would like to 'flatten' that to the following dictionary:

# Desired output: every section and subsection becomes a top-level entry that
# keeps its own required fields and gains a 'location' key. The location is
# the chain of enclosing section names, rooted at './', and always ends with
# a trailing '/'.
target = {

    'Section 1' : {
        'range'       : [0, 200],
        'template'    : 'ID-LOA-XXX',
        'nomenclature': True,
        'location'    : './Section 1/'
    },

    'Section 2' : {
        'range'       : [201, 800],
        'template'    : 'ID-EPI-XXX',
        'nomenclature': False,
        'location'    : './Section 2/'
    },

    'Subsection 1' : {
        'range'       : [0, 400],
        'template'    : 'ID-EPI-S1-XXX',
        'nomenclature': False,
        'location'    : './Section 2/Subsection 1/'
    },

    'Subsubsection 1' : {
        'range'       : [0, 400],
        'template'    : 'ID-EPI-S12-XXX',
        'nomenclature': False,
        'location'    : './Section 2/Subsection 1/Subsubsection 1/'
    },

    'Subsection 2' : {
        'range'       : [0, 400],
        'template'    : 'ID-EPI-S2-XXX',
        'nomenclature': False,
        'location'    : './Section 2/Subsection 2/'
    },

    # etc.

}

I may be able to change how the original JSON file is generated, but I'd prefer not to go there.

The JSON file in words: each section contains at least three keys, and may contain other keys. Those other keys are interpreted as subsections contained in the current section, and each one is a dict with the same properties. This pattern may in principle recurse infinitely deep.

I would also like to perform some assertions:

whether all required fields are present ('range', 'template' and 'nomenclature')
that the values of the required fields pass certain assertions

So far I've only managed to do these checks:

import json

# json.load returns unicode objects on Python 2 and str objects on Python 3;
# accept whichever text type the running interpreter provides.
try:
    _STRING_TYPES = (basestring,)  # Python 2
except NameError:
    _STRING_TYPES = (str,)  # Python 3

# Maps each required field name to a predicate that returns True when the
# field's value is acceptable.
key_requirements = {
    # Must be a real boolean.
    "nomenclature": lambda x: isinstance(x, bool),
    # Must be a string that contains at least one "X".
    "template": lambda x: isinstance(x, _STRING_TYPES) and "X" in x,
    # Must be a two-element list of integers with the second strictly larger
    # than the first. bool is a subclass of int, so it is excluded explicitly;
    # otherwise [False, True] would be accepted as a range.
    "range": lambda x: (
        isinstance(x, list)
        and len(x) == 2
        and all(isinstance(y, int) and not isinstance(y, bool) for y in x)
        and x[1] > x[0]
    ),
}

def checkSection(section):
    """Recursively validate one section of the nested configuration.

    A section must be a dict containing every key of ``key_requirements``,
    and each of those values must satisfy its predicate. Every other key is
    treated as a nested subsection and validated the same way.

    Args:
        section: The dict describing a single section.

    Raises:
        ValueError: If ``section`` is not a dict, a required key is missing,
            or a required value fails its predicate.
    """
    if not isinstance(section, dict):
        raise ValueError("section must be a dict, got %r" % (section,))

    # A missing key can only be detected by looking for it explicitly;
    # iterating over the keys that are present never reveals an absent one.
    for required in key_requirements:
        if required not in section:
            raise ValueError("required key %r is missing" % (required,))

    for key, value in section.items():
        if key not in key_requirements:
            # Any key that is not a required field names a subsection.
            checkSection(value)
        elif not key_requirements[key](value):
            raise ValueError("invalid value %r for key %r" % (value, key))

# Validate every top-level section; nested subsections are handled by the
# recursion inside checkSection.
for key in source:  # source = json.load(open(myJsonFile))
    checkSection(source[key])

But at the moment, no amount of coffee has enabled me to come up with an efficient, elegant, pythonic way to weave the desired conversion into this scheme...

Any suggestions or ideas?

Abhijit · Accepted Answer · 2014-07-14T11:57:29.133

The problem requires a recursive traversal. Unless you want to use a third-party library (and yes, there are solutions that do), you will need a simple home-grown recursive traversal.

Note: Path semantics may differ from yours, as I am on Windows.

Implementation

def flatten(source):
    """Flatten nested section dicts into a single-level dict.

    Every dict-valued entry, at any depth, becomes a top-level entry of the
    result keyed by its own name. Each entry keeps its non-dict values and
    gains a 'location' key built with ``os.path.join`` from '.' and the names
    of its enclosing sections, so the separator is platform dependent.

    Section names are used as keys of the flat result, so a name that occurs
    more than once overwrites the earlier entry.

    Args:
        source: Dict mapping section names to section dicts.

    Returns:
        A new dict mapping every section and subsection name to a dict of
        its scalar fields plus 'location'.

    Raises:
        ValueError: If ``source`` has a non-dict value at the top level,
            since such a value belongs to no section.
    """
    import os  # local import keeps this snippet self-contained

    target = {}

    def helper(src, path='.', last_key=None):
        # last_key is None only for the root call; compare against None so
        # that a falsy section name such as '' still gets its own entry.
        if last_key is not None:
            target[last_key] = {'location': path}
        for key, value in src.items():
            if isinstance(value, dict):
                helper(value, os.path.join(path, key), key)
            elif last_key is None:
                raise ValueError(
                    "top-level key %r is not a section (got %r)" % (key, value))
            else:
                target[last_key][key] = value

    helper(source)
    return target

Output

>>> pprint.pprint(source)
{'Section 1': {'nomenclature': True,
               'range': [0, 200],
               'template': 'ID-LOA-XXX'},
 'Section 2': {'Subsection 1': {'Subsubsection 1': {'nomenclature': False,
                                                    'range': [0, 400],
                                                    'template': 'ID-EPI-S12-XXX'},
                                'nomenclature': False,
                                'range': [0, 400],
                                'template': 'ID-EPI-S1-XXX'},
               'Subsection 2': {'nomenclature': False,
                                'range': [0, 400],
                                'template': 'ID-EPI-S2-XXX'},
               'nomenclature': False,
               'range': [201, 800],
               'template': 'ID-EPI-XXX'}}
>>> pprint.pprint(flatten(source))
{'Section 1': {'location': '\\Section 1',
               'nomenclature': True,
               'range': [0, 200],
               'template': 'ID-LOA-XXX'},
 'Section 2': {'location': '\\Section 2',
               'nomenclature': False,
               'range': [201, 800],
               'template': 'ID-EPI-XXX'},
 'Subsection 1': {'location': '\\Section 2\\Subsection 1',
                  'nomenclature': False,
                  'range': [0, 400],
                  'template': 'ID-EPI-S1-XXX'},
 'Subsection 2': {'location': '\\Section 2\\Subsection 2',
                  'nomenclature': False,
                  'range': [0, 400],
                  'template': 'ID-EPI-S2-XXX'},
 'Subsubsection 1': {'location': '\\Section 2\\Subsection 1\\Subsubsection 1',
                     'nomenclature': False,
                     'range': [0, 400],
                     'template': 'ID-EPI-S12-XXX'}}

@RodyOldenhuis: Fixed, some errors crept in as I overlooked. — Abhijit, Jul 14 '14 at 12:01
Yup, that's the one. Much more elegant than mine. Thanks a bunch! — Rody Oldenhuis, Jul 14 '14 at 12:04

score 1 · Answer 2 · answered Jul 14 '14 at 08:55

1

this works for your case:

# Flatten exactly one level of nesting: each top-level section keeps its
# non-dict values, and each dict found directly inside a section is hoisted
# to the top level of `output` under its own key.
output = {}
for key, value in source.items():
    item = {}
    for nested_key, nested_value in value.items():
        if isinstance(nested_value, dict):
            # Shallow copy, so `output` does not share the subsection dict
            # with `source`.
            output[nested_key] = dict(nested_value)
        else:
            item[nested_key] = nested_value
    output[key] = item

answered Jul 14 '14 at 08:55

fceruti

2,405
1
19
28

...except for the fact that *"This pattern may in principle recurse infinitely deep."*, and the extra `location` keys. But that can be reworked :) – Rody Oldenhuis Jul 14 '14 at 09:03

score 0 · Answer 3 · answered Jul 14 '14 at 10:26

I ended up with this solution:

import os

# json.load returns unicode objects on Python 2 and str objects on Python 3;
# accept whichever text type the running interpreter provides.
try:
    _STRING_TYPES = (basestring,)  # Python 2
except NameError:
    _STRING_TYPES = (str,)  # Python 3

# Maps each required field name to a predicate that returns True when the
# field's value is acceptable.
key_requirements = {
    "nomenclature": lambda x: isinstance(x, bool),
    "template": lambda x: isinstance(x, _STRING_TYPES) and "X" in x,
    # bool is a subclass of int, so it is excluded explicitly.
    "range": lambda x: (
        isinstance(x, list)
        and len(x) == 2
        and all(isinstance(y, int) and not isinstance(y, bool) for y in x)
        and x[1] > x[0]
    ),
}


def checkAndFlattenData(data):
    """Validate nested section data and flatten it into one level.

    Every section, at any depth, must be a dict that contains all keys of
    ``key_requirements`` with values accepted by the matching predicate. Any
    other key names a nested subsection. Each section becomes a top-level
    entry of the result holding its required fields plus a 'location' built
    with ``os.path.join`` from '.' and the names of its enclosing sections.

    Section names are used as keys of the flat result, so a name that occurs
    more than once overwrites the earlier entry.

    Args:
        data: Dict mapping top-level section names to section dicts.

    Returns:
        A new dict mapping every section and subsection name to a dict with
        the required fields and 'location'.

    Raises:
        KeyError: If a section lacks one of the required keys.
        ValueError: If a section or subsection is not a dict, or a required
            value fails its predicate.
    """
    flat = {}

    def check_section(name, content, parent_location):
        # A non-required key must name a subsection, and subsections are dicts.
        if not isinstance(content, dict):
            raise ValueError(
                "section %r must be a dict, got %r" % (name, content))

        # Passing the parent's location down builds each path exactly once,
        # instead of re-prefixing every descendant at every level on the way
        # back up.
        location = os.path.join(parent_location, name)
        entry = {'location': location}
        for required, is_valid in key_requirements.items():
            if required not in content:
                raise KeyError(
                    "section %r is missing required key %r" % (name, required))
            if not is_valid(content[required]):
                raise ValueError(
                    "section %r has an invalid value for %r: %r"
                    % (name, required, content[required]))
            entry[required] = content[required]
        flat[name] = entry

        for key, value in content.items():
            if key not in key_requirements:
                check_section(key, value, location)

    for name, content in data.items():
        check_section(name, content, '.')

    return flat


# Run the check-and-flatten pass over the nested `source` dictionary.
target = checkAndFlattenData(source)

But I can't escape the feeling that this can all be done a bit more pythonically (and/or more efficiently)... If anyone has any suggestions, don't hesitate to copy-paste this and make improvements in an independent answer, so I can accept that.

Flatten nested dictionaries while checking content

3 Answers3