2

I have been trying this for 2 days now, to load this jsonl nested file into a dataframe but unable to succeed. I want to load this into a flattened data frame as I want to perform some join and aggregations after inserting into a database.

pasting first few lines of the file

{"metadata": {"timestamp": "2022-02-19T02:55:54", "collection_id": "a8b7c401-fafd-4e4c-924a-5935526722fd", "session_id": "452eb9e8-e090-4a08-b836-d23d05b1f400", "profile_id": "21636369-8b52-4b4a-97b7-50923ceb3ffd"}, "behaviour": {"mobile": {"swipe": [{"timestamp": 0, "x": 0.4230440650862826, "y": -1.1079966897549942}, {"timestamp": 13, "x": -0.6336283415951725, "y": -0.5487586450523316}, {"timestamp": 24, "x": -0.7667096573026922, "y": -0.17038917226708414}, {"timestamp": 27, "x": -0.2492178616850527, "y": -0.07701361360512053}, {"timestamp": 36, "x": -0.4191304177115883, "y": -0.3397303353601901}, {"timestamp": 30, "x": 0.2320275902893194, "y": 0.4445459453058296}, {"timestamp": 48, "x": 1.4065844499882725, "y": -0.6494710011790842}, {"timestamp": 84, "x": 0.6409924815827414, "y": 1.084704106728884}, {"timestamp": 112, "x": -0.12560819944452944, "y": 0.3951936918404698}, {"timestamp": 72, "x": -1.2564732537980146, "y": 0.5162185447218673}, {"timestamp": 140, "x": -0.11225893044809712, "y": -1.0759589427931768}, {"timestamp": 99, "x": 0.4055014359919277, "y": -0.8092838008674887}, {"timestamp": 108, "x": 0.5696685733474918, "y": -0.026973393894283243}, {"timestamp": 104, "x": -0.721861152315641, "y": 0.9688713713582879}, {"timestamp": 112, "x": -0.517704263457985, "y": 0.11542994879900434}, {"timestamp": 75, "x": -1.605496286221394, "y": -0.14112093019514352}, {"timestamp": 128, "x": -0.6291370576878567, "y": -1.6601324818355463}, {"timestamp": 255, "x": 0.40843440157409044, "y": 0.12849855301903662}, {"timestamp": 126, "x": -0.41855530399404134, "y": 0.9280432396602861}, {"timestamp": 190, "x": 0.8272200887537559, "y": -2.8837656534871665}, {"timestamp": 220, "x": -0.46418914345820755, "y": 1.461671361359334}, {"timestamp": 315, "x": 1.4982587078714356, "y": -0.02798356032708405}, {"timestamp": 154, "x": 0.8495531080382587, "y": -0.21588769612222933}, {"timestamp": 230, "x": -0.7817274589055877, "y": 0.10459280883004704}, {"timestamp": 288, "x": -0.6199621973185518, "y": 0.005292257129691979}, {"timestamp": 250, "x": 0.30746861940044845, "y": 1.0241480367845581}, {"timestamp": 312, "x": -0.06819093424015772, "y": 0.5494263659551994}, {"timestamp": 162, "x": 2.0174773870865987, "y": -0.37867315639172483}, {"timestamp": 392, "x": -0.6091712707845026, "y": 0.9772389197482354}, {"timestamp": 348, "x": -1.2571532898911881, "y": 0.5790041974234325}, {"timestamp": 210, "x": -0.10627207941819672, "y": 0.5625704328169008}, {"timestamp": 434, "x": 0.7250857141705704, "y": 0.04705329465119688}, {"timestamp": 448, "x": -2.778612588025202, "y": -0.5212241988490687}, {"timestamp": 330, "x": 0.7224408299657998, "y": 1.7002044390937856}, {"timestamp": 170, "x": -1.016443134380015, "y": 0.08434308734071132}, {"timestamp": 385, "x": 0.0740135245013153, "y": -0.6937385355902548}, {"timestamp": 288, "x": 0.6873913431881746, "y": 2.0759013166848317}, {"timestamp": 481, "x": 0.561396110432751, "y": 0.49573900401513177}, {"timestamp": 456, "x": -0.1773451988572836, "y": -1.759401045101997}, {"timestamp": 195, "x": -0.3695919636514994, "y": -0.9902143983832665}, {"timestamp": 560, "x": -0.04585535029678639, "y": -1.7036174960872914}, {"timestamp": 205, "x": 0.5111813501633438, "y": 0.47367897298849226}, {"timestamp": 546, "x": -1.9309270698843783, "y": 0.3806555477944176}, {"timestamp": 559, "x": 0.4629684313263635, "y": 0.9452816436897316}, {"timestamp": 616, "x": 0.8893217401839395, "y": 0.4248136684216903}, {"timestamp": 495, "x": -0.316562829967066, "y": 0.6125479349850482}, {"timestamp": 414, "x": 0.8445187708427919, "y": 0.9843145760818928}, {"timestamp": 564, "x": 0.691084533796371, "y": 0.7790856500722466}, {"timestamp": 336, "x": -1.4869830646379347, "y": -1.3299596471768933}, {"timestamp": 686, "x": 0.175807248613561, "y": 0.8389891374495156}, {"timestamp": 700, "x": -0.05181686823382315, "y": -0.6072622860567705}, {"timestamp": 408, "x": 0.9032138095285404, "y": -0.7372126870214335}, {"timestamp": 572, "x": 0.9090357577884669, "y": -1.0933486637672398}, {"timestamp": 371, "x": -0.21968857950449133, "y": 1.2297936396937472}, {"timestamp": 594, "x": -0.903683811769815, "y": -0.962349247335407}, {"timestamp": 770, "x": 1.363974698178015, "y": -2.45398664951111}]}}}
{"metadata": {"timestamp": "2022-01-20T11:58:31", "collection_id": "b29d1647-684e-4c5f-856a-87fbabdfcd7e", "session_id": "43dbf234-6207-4ba8-a32f-64bccb8948be", "profile_id": "21636369-8b52-4b4a-97b7-50923ceb3ffd"}, "behaviour": {"mobile": {"pin": [{"timestamp": 0, "x": -1.635364533608917, "y": -0.9233169601939333}, {"timestamp": 6, "x": -0.6138268672129017, "y": -0.7333714325660339}, {"timestamp": 18, "x": 1.5371807147417926, "y": -0.1772820460807428}, {"timestamp": 21, "x": 0.9848563399999479, "y": -0.47868567228881614}]}}}
{"metadata": {"timestamp": "2022-01-04T02:15:37", "collection_id": "781aa808-074f-4f1f-af27-667a490a55ea", "session_id": "de8877cb-3e8e-4713-8403-e4fea7cd0a38", "profile_id": "6018366c-f658-47a7-9ed3-4fe53a096533"}, "behaviour": {"mobile": {"keystrokes": [{"timestamp": 0, "key_hash": -1.2626154136500727}, {"timestamp": 8, "key_hash": 0.9900211973859506}, {"timestamp": 30, "key_hash": -1.5075769024975958}, {"timestamp": 45, "key_hash": -1.0967796122154305}, {"timestamp": 24, "key_hash": -0.3627313749623099}, {"timestamp": 40, "key_hash": 1.0158181665717492}, {"timestamp": 84, "key_hash": 0.4517722807573615}, {"timestamp": 63, "key_hash": 0.47838264462494967}, {"timestamp": 96, "key_hash": 0.15833055712400226}, {"timestamp": 63, "key_hash": 0.09400350314454811}, {"timestamp": 60, "key_hash": -0.3262757188314575}, {"timestamp": 66, "key_hash": -0.1975736686989061}, {"timestamp": 168, "key_hash": 2.0745969522246765}, {"timestamp": 182, "key_hash": 0.6457012666514507}]}}}

I have tried the following code snippet to parse the json using json_normalize()

    collections=(f'../test/input/collections.jsonl')

    collections_data = [json.loads(line) for line in open(collections, 'r')]
    collections_df = pd.json_normalize(collections_data)
    print(collections_df)

but it only normalizes the 'metadata' part of the json, not 'behaviour'. resulting df is like this

     metadata.timestamp                metadata.collection_id  ...                               behaviour.mobile.pin                        behaviour.mobile.keystrokes
0   2022-02-19T02:55:54  a8b7c401-fafd-4e4c-924a-5935526722fd  ...                                                NaN                                                NaN
1   2022-01-20T11:58:31  b29d1647-684e-4c5f-856a-87fbabdfcd7e  ...  [{'timestamp': 0, 'x': -1.635364533608917, 'y'...                                                NaN
2   2022-01-04T02:15:37  781aa808-074f-4f1f-af27-667a490a55ea  ...                                                NaN  [{'timestamp': 0, 'key_hash': -1.2626154136500...
3   2022-04-14T15:07:29  ba103592-f361-4cc8-86ed-4f03bde3702a  ...                                                NaN  [{'timestamp': 0, 'key_hash': 0.05626071817640...

My output schema should be like

['metadata.timestamp','metadata.collection_id','metadata.session_id','metadata.profile_id','behaviour.mobile.swipe.timestamp','behaviour.mobile.swipe.x','behaviour.mobile.swipe.y','behaviour.mobile.pin.timestamp','behaviour.mobile.pin.x','behaviour.mobile.pin.y','behaviour.mobile.keystrokes.timestamp','behaviour.mobile.keystrokes.key_hash']

I have also tried to use the 'record_path' argument while calling json_normalize() but cant figure out what 'key' to pass.

Any help will be a appreciated a lot

  • You may have to reorganize the data yourself before feeding it into pandas. That's not hard. The automated megatools provided by pandas are great when your flow fits their model, but they only handle limited situations. – Tim Roberts Aug 30 '22 at 17:38
  • @TimRoberts can you shed some light or link some articles on how I should approach this, given I'm a newbie when it comes to json loading – Hassaan Murtaza Aug 30 '22 at 18:12
  • The question shows 3 dicts that you want to add, but you say it is one json file ? please clarify which it is.... – D.L Aug 30 '22 at 18:15
  • @D.L its a jsonl file not a json file, each line has a separate dictionary. And within each dictionary are further nested dictionaries – Hassaan Murtaza Aug 30 '22 at 18:44
  • @D.L I just pasted the first 3 lines, meaning the first 3 dictionaries – Hassaan Murtaza Aug 30 '22 at 19:23

1 Answers1

1
import pandas as pd
import json


collections=(f'collections.jsonl')
collections_data = [json.loads(line) for line in open(collections, 'r')]
collections_df = pd.json_normalize(collections_data)

def flatten_pandas(df_):
    #The same as flatten but for pandas

    have_list = df_.columns[df_.applymap(lambda x: isinstance(x, list)).any()].tolist()
    have_dict = df_.columns[df_.applymap(lambda x: isinstance(x, dict)).any()].tolist()
    have_nested = len(have_list) + len(have_dict)
    
    while have_nested!=0:
        if len(have_list)!=0:
            for _ in have_list:
                df_ = df_.explode(_)
                
        elif have_dict !=0:
            df_ = pd.json_normalize(json.loads(df_.to_json(force_ascii=False, orient="records")), sep=".")
        
        have_list = df_.columns[df_.applymap(lambda x: isinstance(x, list)).any()].tolist()
        have_dict = df_.columns[df_.applymap(lambda x: isinstance(x, dict)).any()].tolist()
        have_nested = len(have_list) + len(have_dict)
        
    return df_


flattend = flatten_pandas(collections_df)

flattend.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 15 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   metadata.timestamp                     74 non-null     object 
 1   metadata.collection_id                 74 non-null     object 
 2   metadata.session_id                    74 non-null     object 
 3   metadata.profile_id                    74 non-null     object 
 4   behaviour.mobile.pin                   0 non-null      float64
 5   behaviour.mobile.keystrokes            0 non-null      float64
 6   behaviour.mobile.swipe.timestamp       56 non-null     float64
 7   behaviour.mobile.swipe.x               56 non-null     float64
 8   behaviour.mobile.swipe.y               56 non-null     float64
 9   behaviour.mobile.swipe                 0 non-null      float64
 10  behaviour.mobile.pin.timestamp         4 non-null      float64
 11  behaviour.mobile.pin.x                 4 non-null      float64
 12  behaviour.mobile.pin.y                 4 non-null      float64
 13  behaviour.mobile.keystrokes.timestamp  14 non-null     float64
 14  behaviour.mobile.keystrokes.key_hash   14 non-null     float64
dtypes: float64(11), object(4)
memory usage: 8.8+ KB