I am trying to flatten the below-nested JSON:
root
|-- id: string (nullable = true)
|-- InsuranceProvider: string (nullable = true)
|-- Type: struct (nullable = true)
| |-- Client: struct (nullable = true)
| | |-- PaidIn: struct (nullable = true)
| | | |-- Insuranceid: string (nullable = true)
| | | |-- Insurancedesc: string (nullable = true)
| | | |-- purchaseditems: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- InsuranceNumber: string (nullable = true)
| | | | | |-- InsuranceLabel: string (nullable = true)
| | | | | |-- Insurancequantity: double (nullable = true)
| | | | | |-- Insuranceprice: integer (nullable = true)
| | | | | |-- discountsreceived: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- amount: integer (nullable = true)
| | | | | | | |-- description: string (nullable = true)
| | | | | |-- childItems: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- InsuranceNumber: string (nullable = true)
| | | | | | | |-- InsuranceLabel: string (nullable = true)
| | | | | | | |-- Insurancequantity: double (nullable = true)
| | | | | | | |-- Insuranceprice: integer (nullable = true)
| | | | | | | |-- discountsreceived: array (nullable = true)
| | | | | | | | |-- element: struct (containsNull = true)
| | | | | | | | | |-- amount: integer (nullable = true)
| | | | | | | | | |-- description: string (nullable = true)
|-- eventTime: string (nullable = true)
I am trying to manually flatten it by exploding each array type and selecting the column required.
Below is my code:
df1 = purchaseDF.select(#col("InsuranceProvider"),
col("eventTime"),
col("id"),
col("Type.Client.PaidIn.Insurancedesc").alias("Insurancedesc"),
col("Type.Client.PaidIn.Insuranceid").alias("Insuranceid"),
explode("Type.Client.PaidIn.purchaseditems").alias("purchaseditemsArray")
)#.show(truncate=True)
df2 = df1.select("*",
col("purchaseditemsArray.InsuranceLabel").alias("InsuranceLabel"),
col("purchaseditemsArray.InsuranceNumber").alias("InsuranceNumber"),
col("purchaseditemsArray.Insuranceprice").alias("Insuranceprice"),
col("purchaseditemsArray.Insurancequantity").alias("Insurancequantity"),
explode("purchaseditemsArray.childItems").alias("childItemsArray")
)#.drop("purchaseditemsArray")
#.show(truncate=False)
df3 = df2.select("*",
col("childItemsArray.InsuranceLabel").alias("ChildInsuranceLabel"),
col("childItemsArray.InsuranceNumber").alias("ChildInsuranceNumber"),
col("childItemsArray.Insuranceprice").alias("ChildInsuranceprice"),
col("childItemsArray.Insurancequantity").alias("ChildInsurancequantity"),
explode("childItemsArray.discountsreceived").alias("discountsreceivedArray")
).drop("childItemsArray")
#.show(truncate=False)
df4 = df3.select("*",
col("discountsreceivedArray.amount").alias("ChildItemDiscountAmount"),
col("discountsreceivedArray.description").alias("ChildItemDiscountDescription")
).drop("discountsreceivedArray")
#.show(truncate=False)
discountReceivedDF = df4.select("*",
explode("purchaseditemsArray.discountsreceived").alias("discountsreceivedArray")
)
finalDF = discountReceivedDF.select("*",
"discountsreceivedArray.amount",
"discountsreceivedArray.description"
).drop("purchaseditemsArray").drop("discountsreceivedArray")
After writing the above I got all the columns separately. I tried the below to show the result:
finalDF.show(truncate= False)
finalDF.printSchema()
OUTPUT:
root
|-- eventTime: string (nullable = true)
|-- id: string (nullable = true)
|-- Insurancedesc: string (nullable = true)
|-- Insuranceid: string (nullable = true)
|-- InsuranceLabel: string (nullable = true)
|-- InsuranceNumber: string (nullable = true)
|-- Insuranceprice: integer (nullable = true)
|-- Insurancequantity: double (nullable = true)
|-- ChildInsuranceLabel: string (nullable = true)
|-- ChildInsuranceNumber: string (nullable = true)
|-- ChildInsuranceprice: integer (nullable = true)
|-- ChildInsurancequantity: double (nullable = true)
|-- ChildItemDiscountAmount: integer (nullable = true)
|-- ChildItemDiscountDescription: string (nullable = true)
|-- amount: integer (nullable = true)
|-- description: string (nullable = true)
My question is, Is there any alternative where I can optimize this nested JSON flattening thing? I looked up many blogs but didn't understand anything.
Below is the sample JSON Data:
[
{
"id": "164651478631223455788978942317",
"InsuranceProvider": "Embroker",
"Type": {
"Client": {
"PaidIn": {
"Insuranceid": "97331549875122744335422",
"Insurancedesc": "Magic happens here",
"purchaseditems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "DNO",
"Insurancequantity": 1,
"Insuranceprice": 345,
"discountsreceived": [
{
"amount": 495,
"description": "Item 1, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 2495,
"description": "Insurance item 1, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "2",
"InsuranceLabel": "LPL",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": -295,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 495,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "3",
"InsuranceLabel": "LPL",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 295,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 400,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "4",
"InsuranceLabel": "LPL",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 295,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 335,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "5",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 1295,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 195,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "6",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 805,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 501,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "7",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 521,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 533,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "8",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 422,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 333,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "9",
"InsuranceLabel": "Employment Practices Liability",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 444,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 666,
"description": "Insurance item 2, Discount 1"
}
]
}
]
},
{
"InsuranceNumber": "10",
"InsuranceLabel": "DNO",
"Insurancequantity": 2,
"Insuranceprice": 945,
"discountsreceived": [
{
"amount": 10,
"description": "Item 2, Discount 1"
}
],
"childItems": [
{
"InsuranceNumber": "1",
"InsuranceLabel": "Cyber",
"Insurancequantity": 1,
"Insuranceprice": 0,
"discountsreceived": [
{
"amount": 63,
"description": "Insurance item 2, Discount 1"
}
]
}
]
}
]
}
}
},
"eventTime": "2020-05-19T01:59:10.379Z"
}
]
Thank you so much in advance :)