Problem I can either flatten the XML file in total (with the functions below) or not at all.
But how can I flatten it in part - with the twist that some values should be the column-name (_Name (col) and _VALUE (corresponding value))?
Input
df = spark.read.format("xml").options(rowTag="MAT").load("file.xml")
df_flat = flatten_explode_df(nested_df=df)
def type_cols(df_dtypes, filter_type):
cols = []
for col_name, col_type in df_dtypes:
if col_type.startswith(filter_type):
cols.append(col_name)
return cols
def flatten_df(nested_df, sep="_"):
nested_cols = type_cols(nested_df.dtypes, "struct")
flatten_cols = [fc for fc, _ in nested_df.dtypes if fc not in nested_cols]
for nc in nested_cols:
for cc in nested_df.select(f"{nc}.*").columns:
if sep is None:
flatten_cols.append(F.col(f"{nc}.{cc}").alias(f"{cc}"))
else:
flatten_cols.append(F.col(f"{nc}.{cc}").alias(f"{nc}{sep}{cc}"))
return nested_df.select(flatten_cols)
def explode_df(nested_df):
nested_cols = type_cols(nested_df.dtypes, "array")
exploded_df = nested_df
for nc in nested_cols:
exploded_df = exploded_df.withColumn(nc, F.explode(F.col(nc)))
return exploded_df
def flatten_explode_df(nested_df):
df = nested_df
struct_cols = type_cols(nested_df.dtypes, "struct")
array_cols = type_cols(nested_df.dtypes, "array")
if struct_cols:
df = flatten_df(df)
return flatten_explode_df(df)
if array_cols:
df = explode_df(df)
return flatten_explode_df(df)
return df
XML-Content
<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope
xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<soap:Body
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<MAT>
<Topic1>
<place>short</place>
<placeDescription>long</placeDescription>
<timestamp>2021-02-03T04:05:06+07:00</timestamp>
</Topic1>
<Topic2>
<SubTopic2 Position="1">
<Tag>012345</Tag>
<isForward>true</isForward>
</SubTopic2>
</Topic2>
<Topic3>
<SubTopic3 Position="1">
<SubSubTopic32 Side="left">
<PROBLEM321 Name="A" isValid="true">0.1</PROBLEM321>
<PROBLEM321 Name="B" isValid="true">2.0</PROBLEM321>
<PROBLEM321 Name="C" isValid="true">3.5</PROBLEM321>
<PROBLEM321 Name="D" isValid="true">4.0</PROBLEM321>
<PROBLEM321 Name="E" isValid="true">5.5</PROBLEM321>
<PROBLEM321 Name="F" isValid="true">6.0</PROBLEM321>
<PROBLEM321 Name="G" isValid="true">0.7</PROBLEM321>
<PROBLEM322 Name="measurement1" isValid="true">base64_encoded</PROBLEM322>
<PROBLEM322 Name="measurement2" isValid="true">base64_encoded</PROBLEM322>
</SubSubTopic32>
<SubSubTopic32 Side="right">
<PROBLEM321 Name="A" isValid="true">1.0</PROBLEM321>
<PROBLEM321 Name="B" isValid="true">0.2</PROBLEM321>
<PROBLEM321 Name="C" isValid="true">0.3</PROBLEM321>
<PROBLEM321 Name="D" isValid="true">0.4</PROBLEM321>
<PROBLEM321 Name="E" isValid="true">0.5</PROBLEM321>
<PROBLEM321 Name="F" isValid="true">0.6</PROBLEM321>
<PROBLEM321 Name="G" isValid="true">7.7</PROBLEM321>
<PROBLEM322 Name="measurement1" isValid="true">base64_encoded</PROBLEM322>
<PROBLEM322 Name="measurement2" isValid="true">base64_encoded</PROBLEM322>
</SubSubTopic32>
<SubSubTopic31>
<PROBLEM31 Name="X" isValid="true">123.4</PROBLEM31>
<PROBLEM31 Name="Y" isValid="true">567.8</PROBLEM31>
<PROBLEM31 Name="Z" isValid="true">901.2</PROBLEM31>
</SubSubTopic31>
</SubTopic3>
...
<SubTopic3 Position="28">
<SubSubTopic32 Side="left">
<PROBLEM321 Name="A" isValid="true">0.1</PROBLEM321>
<PROBLEM321 Name="B" isValid="true">2.0</PROBLEM321>
<PROBLEM321 Name="C" isValid="true">3.5</PROBLEM321>
<PROBLEM321 Name="D" isValid="true">4.0</PROBLEM321>
<PROBLEM321 Name="E" isValid="true">5.5</PROBLEM321>
<PROBLEM321 Name="F" isValid="true">6.0</PROBLEM321>
<PROBLEM321 Name="G" isValid="true">0.7</PROBLEM321>
<PROBLEM322 Name="measurement1" isValid="true">base64_encoded</PROBLEM322>
<PROBLEM322 Name="measurement2" isValid="true">base64_encoded</PROBLEM322>
</SubSubTopic32>
<SubSubTopic32 Side="right">
<PROBLEM321 Name="A" isValid="true">1.0</PROBLEM321>
<PROBLEM321 Name="B" isValid="true">0.2</PROBLEM321>
<PROBLEM321 Name="C" isValid="true">0.3</PROBLEM321>
<PROBLEM321 Name="D" isValid="true">0.4</PROBLEM321>
<PROBLEM321 Name="E" isValid="true">0.5</PROBLEM321>
<PROBLEM321 Name="F" isValid="true">0.6</PROBLEM321>
<PROBLEM321 Name="G" isValid="true">7.7</PROBLEM321>
<PROBLEM322 Name="measurement1" isValid="true">base64_encoded</PROBLEM322>
<PROBLEM322 Name="measurement2" isValid="true">base64_encoded</PROBLEM322>
</SubSubTopic32>
<SubSubTopic31>
<PROBLEM31 Name="X" isValid="true">123.4</PROBLEM31>
<PROBLEM31 Name="Y" isValid="true">567.8</PROBLEM31>
<PROBLEM31 Name="Z" isValid="true">901.2</PROBLEM31>
</SubSubTopic31>
</SubTopic3>
</Topic3>
</MAT>
</soap:Body>
</soap:Envelope>
Schema
root
|-- Topic1: struct (nullable = true)
| |-- timestamp: timestamp (nullable = true)
| |-- place: string (nullable = true)
| |-- placeDescription: string (nullable = true)
|-- Topic2: struct (nullable = true)
| |-- SubTopic2: struct (nullable = true)
| | |-- Tag: long (nullable = true)
| | |-- _Position: long (nullable = true)
| | |-- isForward: boolean (nullable = true)
|-- Topic3: struct (nullable = true)
| |-- SubTopic3: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- SubSubTopic31: struct (nullable = true)
| | | | |-- PROBLEM31: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- _Name: string (nullable = true)
| | | | | | |-- _VALUE: double (nullable = true)
| | | | | | |-- _isValid: boolean (nullable = true)
| | | |-- SubSubTopic32: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- PROBLEM321: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- _Name: string (nullable = true)
| | | | | | | |-- _VALUE: double (nullable = true)
| | | | | | | |-- _isValid: boolean (nullable = true)
| | | | | |-- PROBLEM322: array (nullable = true)
| | | | | | |-- element: struct (containsNull = true)
| | | | | | | |-- _Name: string (nullable = true)
| | | | | | | |-- _VALUE: string (nullable = true)
| | | | | | | |-- _isValid: boolean (nullable = true)
| | | | | |-- _Side: string (nullable = true)
| | | |-- _Position: long (nullable = true)
Output I wish to get
root
|-- Topic1_timestamp: timestamp (nullable = true)
|-- Topic1_place: string (nullable = true)
|-- Topic1_placeDescription: string (nullable = true)
|-- Topic2_SubTopic2_Tag: long (nullable = true)
|-- Topic2_SubTopic2__Position: long (nullable = true)
|-- Topic2_SubTopic2_isForward: boolean (nullable = true)
|-- Topic3_SubTopic3__Position: long (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic31_X: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic31_Y: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic31_Z: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32__Side: string (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_A: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_B: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_C: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_D: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_E: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_F: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_G: double (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_measurement1: string (nullable = true)
|-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_measurement2: string (nullable = true)
Topic1_timestamp | Topic1_place | Topic1_placeDescription | Topic2_SubTopic2_Tag | Topic2_SubTopic2__Position | Topic2_SubTopic2_isForward | Topic3_SubTopic3__Position | Topic3_SubTopic3_SubSubTopic31_X | Topic3_SubTopic3_SubSubTopic31_Y | Topic3_SubTopic3_SubSubTopic31_Z | Topic3_SubTopic3_SubSubTopic32__Side | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_A | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_B | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_C | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_D | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_E | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_F | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_G | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_measurement1 | Topic3_SubTopic3_SubSubTopic32_PROBLEM321_measurement2 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2021-02-03 ... | short | long | 012345 | 1 | true | 1 | 123.4 | 567.8 | 901.2 | left | 0.1 | 2.0 | 3.5 | 4.0 | 5.5 | 6.0 | 0.7 | base64_encoded | base64_encoded |
2021-02-03 ... | short | long | 012345 | 1 | true | 1 | 123.4 | 567.8 | 901.2 | right | 1.0 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 7.7 | base64_encoded | base64_encoded |
2021-02-03 ... | short | long | 012345 | 1 | true | 28 | 123.4 | 567.8 | 901.2 | left | 0.1 | 2.0 | 3.5 | 4.0 | 5.5 | 6.0 | 0.7 | base64_encoded | base64_encoded |
2021-02-03 ... | short | long | 012345 | 1 | true | 28 | 123.4 | 567.8 | 901.2 | right | 1.0 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 7.7 | base64_encoded | base64_encoded |