i have JSON data on user profiles that i want to eventually analyze with SPSS. Currently i imported the data in Google Refine, to run some data cleansing. My problem is however that the original JSON consists of nested objects, namely e.g. the "professional_experience" section with "companies", that includes several sub objects/arrays (see example). Google refine handles this by creating additional rows with that information. This is however in no way consistent with a "relational" (in terms of SQL) view/table structure that i'd need to analyze the data with SPSS or Excel or whatever, as there are other sub-objects (schools, awards, etc.) which are also "stupidly" filled in the rows below the high level "main" record, but do not have a direct (row/column wise) relationship to one another (considering analysis).
As i see it i would need to extract those (sub object) columns and rows to an own table and create some n:m relationship, or at least normalize it into ONE table (then of course with accepting the redundancies of the other unnested attributes of course).
What I want to end up with is one consistent table to run statistical analysis/clustering on certain attributes. I assume map reduce is not really an option here.
Does anyone of you have an idea on how to handle this issue or is there maybe an easier way directly to work on the JSON data?
{ "users": [
{
"id": "123456_abcdef",
"first_name": "Max",
"last_name": "Mustermann",
"display_name": "Max Mustermann",
"page_name": "Max_Mustermann",
"permalink": "https://www.xing.com/profile/Max_Mustermann",
"employment_status": "EMPLOYEE",
"gender": "m",
"birth_date": {
"day": 12,
"month": 8,
"year": 1963
},
"active_email": "max.mustermann@xing.com",
"time_zone": {
"name": "Europe/Copenhagen",
"utc_offset": 2.0
},
"premium_services": [
"SEARCH",
"PRIVATEMESSAGES"
],
"badges": [
"PREMIUM",
"MODERATOR"
],
"wants": "einen neuen Job",
"haves": "viele tolle Skills",
"interests": "Flitzebogen schießen and so on",
"organisation_member": "ACM, GI",
"languages": {
"de": "NATIVE",
"en": "FLUENT",
"fr": null,
"zh": "BASIC"
},
"private_address": {
"city": "Hamburg",
"country": "DE",
"zip_code": "20357",
"street": "Privatstraße 1",
"phone": "49|40|1234560",
"fax": "||",
"province": "Hamburg",
"email": "max@mustermann.de",
"mobile_phone": "49|0155|1234567"
},
"business_address": {
"city": "Hamburg",
"country": "DE",
"zip_code": "20357",
"street": "Geschäftsstraße 1a",
"phone": "49|40|1234569",
"fax": "49|40|1234561",
"province": "Hamburg",
"email": "max.mustermann@xing.com",
"mobile_phone": "49|160|66666661"
},
"web_profiles": {
"qype": [
"http://qype.de/users/foo"
],
"google+": [
"http://plus.google.com/foo"
],
"other": [
"http://blog.example.org"
],
"homepage": [
"http://example.org",
"http://other-example.org"
]
},
"instant_messaging_accounts": {
"skype": "1122334455",
"googletalk": "max.mustermann"
},
"professional_experience": {
"primary_company": {
"id": "1_abcdef",
"name": "XING AG",
"title": "Softwareentwickler",
"company_size": "201-500",
"tag": null,
"url": "http://www.xing.com",
"career_level": "PROFESSIONAL_EXPERIENCED",
"begin_date": "2010-01",
"description": null,
"end_date": null,
"industry": "AEROSPACE",
"form_of_employment": "FULL_TIME_EMPLOYEE",
"until_now": true
},
"companies": [
{
"id": "1_abcdef",
"name": "XING AG",
"title": "Softwareentwickler",
"company_size": "201-500",
"tag": null,
"url": "http://www.xing.com",
"career_level": "PROFESSIONAL_EXPERIENCED",
"begin_date": "2010-01",
"description": null,
"end_date": null,
"industry": "AEROSPACE",
"form_of_employment": "FULL_TIME_EMPLOYEE",
"until_now": true
},
{
"id": "24_abcdef",
"name": "Ninja Ltd.",
"title": "DevOps",
"company_size": null,
"tag": "NINJA",
"url": "http://www.ninja-ltd.co.uk",
"career_level": null,
"begin_date": "2009-04",
"description": null,
"end_date": "2010-07",
"industry": "ALTERNATIVE_MEDICINE",
"form_of_employment": "OWNER",
"until_now": false
},
{
"id": "45_abcdef",
"name": null,
"title": "Wiss. Mitarbeiter",
"company_size": null,
"tag": "OFFIS",
"url": "http://www.uni.de",
"career_level": null,
"begin_date": "2007",
"description": null,
"end_date": "2008",
"industry": "APPAREL_AND_FASHION",
"form_of_employment": "PART_TIME_EMPLOYEE",
"until_now": false
},
{
"id": "176_abcdef",
"name": null,
"title": "TEST NINJA",
"company_size": "201-500",
"tag": "TESTCOMPANY",
"url": null,
"career_level": "ENTRY_LEVEL",
"begin_date": "1998-12",
"description": null,
"end_date": "1999-05",
"industry": "ARTS_AND_CRAFTS",
"form_of_employment": "INTERN",
"until_now": false
}
],
"awards": [
{
"name": "Awesome Dude Of The Year",
"date_awarded": 2007,
"url": null
}
]
},
"educational_background": {
"degree": "MSc CE/CS",
"primary_school": {
"id": "42_abcdef",
"name": "Carl-von-Ossietzky Universtät Schellenburg",
"degree": "MSc CE/CS",
"notes": null,
"subject": null,
"begin_date": "1998-08",
"end_date": "2005-02"
},
"schools": [
{
"id": "42_abcdef",
"name": "Carl-von-Ossietzky Universtät Schellenburg",
"degree": "MSc CE/CS",
"notes": null,
"subject": null,
"begin_date": "1998-08",
"end_date": "2005-02"
}
],
"qualifications": [
"TOEFLS",
"PADI AOWD"
]
}
}
] }