It's convenient to use the `inline` SQL function
to expand arrays of structs. In the following code it is used three times.
This is your data:
from pyspark.sql import functions as F
import re
# One sample row: "MainTag" holds a group id, an array of ids, and a struct
# whose field names (some containing hyphens) are keyed by those ids.
sample_row = (
    {
        "GroupId": "10C81",
        "IDArray": ["ABC-XYZ-123"],
        "IDStruct": {
            "DSA-ASA-211": None,
            "BSA-ASA-211": None,
            "ABC-XYZ-123": [
                {
                    "BagId": "42425fsdfs",
                    "TravelerId": "1234567",
                    "LegCorr": [
                        {"DelID": "SQH", "SegID": "PQR-UVW"},
                        {"DelID": "GFS", "SegID": "GHS-UVW"},
                    ],
                }
            ],
        },
    },
)
# DDL schema string; hyphenated field names must be backtick-quoted.
schema_ddl = "MainTag struct<GroupId:string, IDArray:array<string>, IDStruct:struct<`ABC-XYZ-123`:array<struct<BagId:string,LegCorr:array<struct<DelID:string,SegID:string>>,TravelerId:string>>,`DSA-ASA-211`:string,`BSA-ASA-211`:string>>"
df = spark.createDataFrame([sample_row], schema_ddl)
df.printSchema()
# root
# |-- MainTag: struct (nullable = true)
# | |-- GroupId: string (nullable = true)
# | |-- IDArray: array (nullable = true)
# | | |-- element: string (containsNull = true)
# | |-- IDStruct: struct (nullable = true)
# | | |-- ABC-XYZ-123: array (nullable = true)
# | | | |-- element: struct (containsNull = true)
# | | | | |-- BagId: string (nullable = true)
# | | | | |-- LegCorr: array (nullable = true)
# | | | | | |-- element: struct (containsNull = true)
# | | | | | | |-- DelID: string (nullable = true)
# | | | | | | |-- SegID: string (nullable = true)
# | | | | |-- TravelerId: string (nullable = true)
# | | |-- DSA-ASA-211: string (nullable = true)
# | | |-- BSA-ASA-211: string (nullable = true)
Script:
# Step 1: inline the single top-level struct so its fields become columns.
df = df.selectExpr("inline(array(MainTag))")
# Step 2: explode the struct-array under the (hard-coded) id key, keeping
# every other column and dropping the now-consumed IDStruct.
kept = [c for c in df.columns if c != 'IDStruct']
df = df.selectExpr(*kept, "inline(IDStruct.`ABC-XYZ-123`)")
# Step 3: same pattern for the nested LegCorr array.
kept = [c for c in df.columns if c != 'LegCorr']
df = df.selectExpr(*kept, "inline(LegCorr)")
# Step 4: IDArray is an array of plain strings, so explode (not inline) it.
df = df.withColumn("IDArray", F.explode("IDArray"))
df.show()
# +-------+-----------+----------+----------+-----+-------+
# |GroupId| IDArray| BagId|TravelerId|DelID| SegID|
# +-------+-----------+----------+----------+-----+-------+
# | 10C81|ABC-XYZ-123|42425fsdfs| 1234567| SQH|PQR-UVW|
# | 10C81|ABC-XYZ-123|42425fsdfs| 1234567| GFS|GHS-UVW|
# +-------+-----------+----------+----------+-----+-------+
If you don't want to hard-code "ABC-XYZ-123", one possible way is to change the schema of the "MainTag" column. Something like the following could do it.
re.sub(r"([\w\-]+)(?=:array<struct<BagId)", "_ids", maintag_schema)
- should rename that struct field to "_ids".
re.sub(r"(?<=,|<)([^,<]+)(?=:)", r"`\1`", maintag_schema)
- should wrap column/field names in backticks,
which is required for field names containing special characters such as hyphens.
# Derive MainTag's schema as a DDL string, then rewrite it textually:
#  1) rename whichever field holds an array<struct<BagId,...>> to "_ids"
#     (identified structurally, so the literal id never appears in code);
#  2) backtick-quote every field name, since hyphenated names are not
#     valid unquoted identifiers in a DDL string.
# Casting the struct to the rewritten schema matches fields by position,
# which is what makes the rename take effect.
maintag_schema = df.schema["MainTag"].dataType.simpleString()
maintag_schema = re.sub(r"([\w\-]+)(?=:array<struct<BagId)", "_ids", maintag_schema)
maintag_schema = re.sub(r"(?<=,|<)([^,<]+)(?=:)", r"`\1`", maintag_schema)
df = df.withColumn("MainTag", F.col("MainTag").cast(maintag_schema))

# From here the flow mirrors the hard-coded version, but targets "_ids".
df = df.selectExpr("inline(array(MainTag))")
kept = [c for c in df.columns if c != 'IDStruct']
df = df.selectExpr(*kept, "inline(IDStruct._ids)")
kept = [c for c in df.columns if c != 'LegCorr']
df = df.selectExpr(*kept, "inline(LegCorr)")
df = df.withColumn("IDArray", F.explode("IDArray"))
df.show()
# +-------+-----------+----------+----------+-----+-------+
# |GroupId| IDArray| BagId|TravelerId|DelID| SegID|
# +-------+-----------+----------+----------+-----+-------+
# | 10C81|ABC-XYZ-123|42425fsdfs| 1234567| SQH|PQR-UVW|
# | 10C81|ABC-XYZ-123|42425fsdfs| 1234567| GFS|GHS-UVW|
# +-------+-----------+----------+----------+-----+-------+