3

Problem I can either flatten the XML file in total (with the functions below) or not at all.

But how can I flatten it in part - with the twist that some values should be the column-name (_Name (col) and _VALUE (corresponding value))?

Input

df = spark.read.format("xml").options(rowTag="MAT").load("file.xml")
df_flat = flatten_explode_df(nested_df=df)
def type_cols(df_dtypes, filter_type):
    cols = []
    for col_name, col_type in df_dtypes:
        if col_type.startswith(filter_type):
            cols.append(col_name)
    return cols

def flatten_df(nested_df, sep="_"):
    nested_cols = type_cols(nested_df.dtypes, "struct")
    flatten_cols = [fc for fc, _ in nested_df.dtypes if fc not in nested_cols]
    for nc in nested_cols:
        for cc in nested_df.select(f"{nc}.*").columns:
            if sep is None:
                flatten_cols.append(F.col(f"{nc}.{cc}").alias(f"{cc}"))
            else:
                flatten_cols.append(F.col(f"{nc}.{cc}").alias(f"{nc}{sep}{cc}"))
    return nested_df.select(flatten_cols)

def explode_df(nested_df):
    nested_cols = type_cols(nested_df.dtypes, "array")
    exploded_df = nested_df
    for nc in nested_cols:
        exploded_df = exploded_df.withColumn(nc, F.explode(F.col(nc)))
    return exploded_df

def flatten_explode_df(nested_df):
    df = nested_df
    struct_cols = type_cols(nested_df.dtypes, "struct")
    array_cols = type_cols(nested_df.dtypes, "array")
    if struct_cols:
        df = flatten_df(df)
        return flatten_explode_df(df)
    if array_cols:
        df = explode_df(df)
        return flatten_explode_df(df)
    return df

XML-Content

<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope
  xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
  <soap:Body
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema">
    <MAT>
      <Topic1>
        <place>short</place>
        <placeDescription>long</placeDescription>
        <timestamp>2021-02-03T04:05:06+07:00</timestamp>
      </Topic1>
      <Topic2>
        <SubTopic2 Position="1">
          <Tag>012345</Tag>
          <isForward>true</isForward>
        </SubTopic2>
      </Topic2>
      <Topic3>
        <SubTopic3 Position="1">
          <SubSubTopic32 Side="left">
            <PROBLEM321 Name="A" isValid="true">0.1</PROBLEM321>
            <PROBLEM321 Name="B" isValid="true">2.0</PROBLEM321>
            <PROBLEM321 Name="C" isValid="true">3.5</PROBLEM321>
            <PROBLEM321 Name="D" isValid="true">4.0</PROBLEM321>
            <PROBLEM321 Name="E" isValid="true">5.5</PROBLEM321>
            <PROBLEM321 Name="F" isValid="true">6.0</PROBLEM321>
            <PROBLEM321 Name="G" isValid="true">0.7</PROBLEM321>
            <PROBLEM322 Name="measurement1" isValid="true">base64_encoded</PROBLEM322>
            <PROBLEM322 Name="measurement2" isValid="true">base64_encoded</PROBLEM322>
          </SubSubTopic32>
          <SubSubTopic32 Side="right">
            <PROBLEM321 Name="A" isValid="true">1.0</PROBLEM321>
            <PROBLEM321 Name="B" isValid="true">0.2</PROBLEM321>
            <PROBLEM321 Name="C" isValid="true">0.3</PROBLEM321>
            <PROBLEM321 Name="D" isValid="true">0.4</PROBLEM321>
            <PROBLEM321 Name="E" isValid="true">0.5</PROBLEM321>
            <PROBLEM321 Name="F" isValid="true">0.6</PROBLEM321>
            <PROBLEM321 Name="G" isValid="true">7.7</PROBLEM321>
            <PROBLEM322 Name="measurement1" isValid="true">base64_encoded</PROBLEM322>
            <PROBLEM322 Name="measurement2" isValid="true">base64_encoded</PROBLEM322>
          </SubSubTopic32>
          <SubSubTopic31>
            <PROBLEM31 Name="X" isValid="true">123.4</PROBLEM31>
            <PROBLEM31 Name="Y" isValid="true">567.8</PROBLEM31>
            <PROBLEM31 Name="Z" isValid="true">901.2</PROBLEM31>
          </SubSubTopic31>
        </SubTopic3>
        ...
        <SubTopic3 Position="28">
          <SubSubTopic32 Side="left">
            <PROBLEM321 Name="A" isValid="true">0.1</PROBLEM321>
            <PROBLEM321 Name="B" isValid="true">2.0</PROBLEM321>
            <PROBLEM321 Name="C" isValid="true">3.5</PROBLEM321>
            <PROBLEM321 Name="D" isValid="true">4.0</PROBLEM321>
            <PROBLEM321 Name="E" isValid="true">5.5</PROBLEM321>
            <PROBLEM321 Name="F" isValid="true">6.0</PROBLEM321>
            <PROBLEM321 Name="G" isValid="true">0.7</PROBLEM321>
            <PROBLEM322 Name="measurement1" isValid="true">base64_encoded</PROBLEM322>
            <PROBLEM322 Name="measurement2" isValid="true">base64_encoded</PROBLEM322>
          </SubSubTopic32>
          <SubSubTopic32 Side="right">
            <PROBLEM321 Name="A" isValid="true">1.0</PROBLEM321>
            <PROBLEM321 Name="B" isValid="true">0.2</PROBLEM321>
            <PROBLEM321 Name="C" isValid="true">0.3</PROBLEM321>
            <PROBLEM321 Name="D" isValid="true">0.4</PROBLEM321>
            <PROBLEM321 Name="E" isValid="true">0.5</PROBLEM321>
            <PROBLEM321 Name="F" isValid="true">0.6</PROBLEM321>
            <PROBLEM321 Name="G" isValid="true">7.7</PROBLEM321>
            <PROBLEM322 Name="measurement1" isValid="true">base64_encoded</PROBLEM322>
            <PROBLEM322 Name="measurement2" isValid="true">base64_encoded</PROBLEM322>
          </SubSubTopic32>
          <SubSubTopic31>
            <PROBLEM31 Name="X" isValid="true">123.4</PROBLEM31>
            <PROBLEM31 Name="Y" isValid="true">567.8</PROBLEM31>
            <PROBLEM31 Name="Z" isValid="true">901.2</PROBLEM31>
          </SubSubTopic31>
        </SubTopic3>
      </Topic3>
    </MAT>
  </soap:Body>
</soap:Envelope>

Schema

    root
     |-- Topic1: struct (nullable = true)
     |    |-- timestamp: timestamp (nullable = true)
     |    |-- place: string (nullable = true)
     |    |-- placeDescription: string (nullable = true)
     |-- Topic2: struct (nullable = true)
     |    |-- SubTopic2: struct (nullable = true)
     |    |    |-- Tag: long (nullable = true)
     |    |    |-- _Position: long (nullable = true)
     |    |    |-- isForward: boolean (nullable = true)
     |-- Topic3: struct (nullable = true)
     |    |-- SubTopic3: array (nullable = true)
     |    |    |-- element: struct (containsNull = true)
     |    |    |    |-- SubSubTopic31: struct (nullable = true)
     |    |    |    |    |-- PROBLEM31: array (nullable = true)
     |    |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |    |-- _Name: string (nullable = true)
     |    |    |    |    |    |    |-- _VALUE: double (nullable = true)
     |    |    |    |    |    |    |-- _isValid: boolean (nullable = true)
     |    |    |    |-- SubSubTopic32: array (nullable = true)
     |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |-- PROBLEM321: array (nullable = true)
     |    |    |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |    |    |-- _Name: string (nullable = true)
     |    |    |    |    |    |    |    |-- _VALUE: double (nullable = true)
     |    |    |    |    |    |    |    |-- _isValid: boolean (nullable = true)
     |    |    |    |    |    |-- PROBLEM322: array (nullable = true)
     |    |    |    |    |    |    |-- element: struct (containsNull = true)
     |    |    |    |    |    |    |    |-- _Name: string (nullable = true)
     |    |    |    |    |    |    |    |-- _VALUE: string (nullable = true)
     |    |    |    |    |    |    |    |-- _isValid: boolean (nullable = true)
     |    |    |    |    |    |-- _Side: string (nullable = true)
     |    |    |    |-- _Position: long (nullable = true)

Output I wish to get

root
 |-- Topic1_timestamp: timestamp (nullable = true)
 |-- Topic1_place: string (nullable = true)
 |-- Topic1_placeDescription: string (nullable = true)
 |-- Topic2_SubTopic2_Tag: long (nullable = true)
 |-- Topic2_SubTopic2__Position: long (nullable = true)
 |-- Topic2_SubTopic2_isForward: boolean (nullable = true)
 |-- Topic3_SubTopic3__Position: long (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic31_X: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic31_Y: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic31_Z: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32__Side: string (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_A: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_B: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_C: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_D: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_E: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_F: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_G: double (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_measurement1: string (nullable = true)
 |-- Topic3_SubTopic3_SubSubTopic32_PROBLEM321_measurement2: string (nullable = true)
Topic1_timestamp Topic1_place Topic1_placeDescription Topic2_SubTopic2_Tag Topic2_SubTopic2__Position Topic2_SubTopic2_isForward Topic3_SubTopic3__Position Topic3_SubTopic3_SubSubTopic31_X Topic3_SubTopic3_SubSubTopic31_Y Topic3_SubTopic3_SubSubTopic31_Z Topic3_SubTopic3_SubSubTopic32__Side Topic3_SubTopic3_SubSubTopic32_PROBLEM321_A Topic3_SubTopic3_SubSubTopic32_PROBLEM321_B Topic3_SubTopic3_SubSubTopic32_PROBLEM321_C Topic3_SubTopic3_SubSubTopic32_PROBLEM321_D Topic3_SubTopic3_SubSubTopic32_PROBLEM321_E Topic3_SubTopic3_SubSubTopic32_PROBLEM321_F Topic3_SubTopic3_SubSubTopic32_PROBLEM321_G Topic3_SubTopic3_SubSubTopic32_PROBLEM321_measurement1 Topic3_SubTopic3_SubSubTopic32_PROBLEM321_measurement2
2021-02-03 ... short long 012345 1 true 1 123.4 567.8 901.2 left 0.1 2.0 3.5 4.0 5.5 6.0 0.7 base64_encoded base64_encoded
2021-02-03 ... short long 012345 1 true 1 123.4 567.8 901.2 right 1.0 0.2 0.3 0.4 0.5 0.6 7.7 base64_encoded base64_encoded
2021-02-03 ... short long 012345 1 true 28 123.4 567.8 901.2 left 0.1 2.0 3.5 4.0 5.5 6.0 0.7 base64_encoded base64_encoded
2021-02-03 ... short long 012345 1 true 28 123.4 567.8 901.2 right 1.0 0.2 0.3 0.4 0.5 0.6 7.7 base64_encoded base64_encoded
Olly
  • 31
  • 3

0 Answers0