Nested json to csv - generic approach

Question

I am very new to Python and I am struggling with converting nested json file into cvs. To do so I started with loading the json and then transformed it in a way that prints out nice output with json_normalize, then using pandas package I output the normalised parts into cvs.

My example json:

[{
 "_id": {
   "id": "123"
 },
 "device": {
   "browser": "Safari",
   "category": "d",
   "os": "Mac"
 },
 "exID": {
   "$oid": "123"
 },
 "extreme": false,
 "geo": {
   "city": "London",
   "country": "United Kingdom",
   "countryCode": "UK",
   "ip": "00.000.000.0"
 },
 "viewed": {
   "$date": "2011-02-12"
 },
 "attributes": [{
   "name": "gender",
   "numeric": 0,
   "value": 0
 }, {
   "name": "email",
   "value": false
 }],
 "change": [{
   "id": {
     "$id": "1231"
   },
   "seen": [{
     "$date": "2011-02-12"
   }]
 }]
}, {
 "_id": {
   "id": "456"
 },
 "device": {
   "browser": "Chrome 47",
   "category": "d",
   "os": "Windows"
 },
 "exID": {
   "$oid": "345"
 },
 "extreme": false,
 "geo": {
   "city": "Berlin",
   "country": "Germany",
   "countryCode": "DE",
   "ip": "00.000.000.0"
 },
 "viewed": {
   "$date": "2011-05-12"
 },
 "attributes": [{
   "name": "gender",
   "numeric": 1,
   "value": 1
 }, {
   "name": "email",
   "value": true
 }],
 "change": [{
   "id": {
     "$id": "1231"
   },
   "seen": [{
     "$date": "2011-02-12"
   }]
 }]
}]

With following code (here I exclude the nested parts):

import json
from pandas.io.json import json_normalize


def loading_file():
    #File path
    file_path = #file path here

    #Loading json file
    json_data = open(file_path)
    data = json.load(json_data)
    return data

#Storing avaliable keys
def data_keys(data):
    keys = {}
    for i in data:
        for k in i.keys():
            keys[k] = 1

    keys = keys.keys()

#Excluding nested arrays from keys - hard coded -> IMPROVE
    new_keys = [x for x in keys if
    x != 'attributes' and
    x != 'change']

    return new_keys

#Excluding nested arrays from json dictionary
def new_data(data, keys):
    new_data = []
    for i in range(0, len(data)):
        x = {k:v for (k,v) in data[i].items() if k in keys }
        new_data.append(x)
    return new_data

 def csv_out(data):
     data.to_csv('out.csv',encoding='utf-8')

def main():
     data_file = loading_file()
     keys = data_keys(data_file)
     table = new_data(data_file, keys)
     csv_out(json_normalize(table))

main()

My current output looks something like this:

| _id.id | device.browser | device.category | device.os |  ... | viewed.$date |
|--------|----------------|-----------------|-----------|------|--------------|
| 123    | Safari         | d               | Mac       | ...  | 2011-02-12   |
| 456    | Chrome 47      | d               | Windows   | ...  | 2011-05-12   |
|        |                |                 |           |      |              |

My problem is that I would like to include the nested arrays into the cvs, so I have to flatten them. I cannot figure out how to make it generic so I do not use dictionary keys (numeric, id, name) and values while creating table. I have to make it generalisable because the number of keys in attributes and change. Therefore, I would like to have output like this:

| _id.id | device.browser | ... | attributes_gender_numeric | attributes_gender_value | attributes_email_value | change_id | change_seen |
|--------|----------------|-----|---------------------------|-------------------------|------------------------|-----------|-------------|
| 123    | Safari         | ... | 0                         | 0                       | false                  | 1231      | 2011-02-12  |
| 456    | Chrome 47      | ... | 1                         | 1                       | true                   | 1231      | 2011-02-12  |
|        |                |     |                           |                         |                        |           |             |

Thank you in advance! Any tips how to improve my code and make it more efficient are very welcome.

score 6 · Accepted Answer · answered Jun 09 '16 at 15:53

Thanks to the great blog post by Amir Ziai which you can find here I managed to output my data in form of a flat table. With the following function:

#Function that recursively extracts values out of the object into a flattened dictionary
def flatten_json(data):
    flat = [] #list of flat dictionaries
    def flatten(y):
        out = {}

        def flatten2(x, name=''):
            if type(x) is dict:
                for a in x:
                    if a == "name": 
                            flatten2(x["value"], name + x[a] + '_')
                    else:  
                        flatten2(x[a], name + a + '_')
            elif type(x) is list:
                for a in x:
                    flatten2(a, name + '_')
            else:
                out[name[:-1]] = x

        flatten2(y)
        return out

#Loop needed to flatten multiple objects
    for i in range(len(data)):
        flat.append(flatten(data[i]).copy())

    return json_normalize(flat)

I am aware of the fact that it is not perfectly generalisable, due to name-value if statement. However, if this exemption for creating the name-value dictionaries is deleted, the code can be used with other embedded arrays.

score 0 · Answer 2 · answered Jun 26 '18 at 17:30

I had a task to turn a json with nested key and values into a csv file a couple of weeks ago. For this task it was necessary to handle the nested keys properly to concatenate the to be used as unique headers for the values. The result was the code bellow, which can also be found here.

def get_flat_json(json_data, header_string, header, row):
    """Parse json files with nested key-vales into flat lists using nested column labeling"""
    for root_key, root_value in json_data.items():
        if isinstance(root_value, dict):
            get_flat_json(root_value, header_string + '_' + str(root_key), header, row)
        elif isinstance(root_value, list):
            for value_index in range(len(root_value)):
                for nested_key, nested_value in root_value[value_index].items():
                    header[0].append((header_string +
                                      '_' + str(root_key) +
                                      '_' + str(nested_key) +
                                      '_' + str(value_index)).strip('_'))
                    if nested_value is None:
                        nested_value = ''
                    row[0].append(str(nested_value))
        else:
            if root_value is None:
                root_value = ''
            header[0].append((header_string + '_' + str(root_key)).strip('_'))
            row[0].append(root_value)
    return header, row

This is a more generalized approach based on An Economist answer to this question.

score -1 · Answer 3 · answered Mar 11 '21 at 12:40

The following code worked on a messy json file with mixed dictionaries and lists within each other 7 layers deep:

    import csv, json, os
    def parse_json(data):
        a_dict_accum = {}
        for key, val in data.items():
            print("key, val = ", key, val)
            output.writerow([key])
            output.writerow([val])
            if isinstance(val, dict):
                for a_key, a_val in val.items():
                    print("a_key, a_val = ", a_key, a_val)
                    output.writerow([a_key])
                    output.writerow([a_val])
                    a_dict_accum.update({a_key:a_val})
                print("a_dict_accum = ", a_dict_accum)
                parse_json(a_dict_accum)
            elif isinstance(val, list):
                print("val_list = ", val)
                for a_list in val:
                     print("a_list = ", a_list)
                     output.writerow([a_list])
                     if isinstance(a_list, dict):
                         for a_key, a_val in a_list.items():
                             print("a_key, a_val = ", a_key, a_val)
                             output.writerow([a_key])
                             output.writerow([a_val])    
                             a_dict_accum.update({a_key:a_val})
                         print("a_dict_accum = ", a_dict_accum)
                         parse_json(a_dict_accum)
    os.chdir('C://Users/Robert/viirs/20200217_output')
    fileInput = 'night_lights_points.json'
    fileOutput = 'night_lights_points.csv'
    inputFile = open(fileInput) #open json file
    outputFile = open(fileOutput, 'w', newline='') #load csv file
    data = json.load(inputFile) #load json content
    output = csv.writer(outputFile) #create a csv.writer
    output = parse_json(data)
    inputFile.close() #close the input file
    outputFile.close() #close the output file

Nested json to csv - generic approach

3 Answers3

Linked