You're very close — you were just missing the `.otherwise` call on the end of the `F.when(...)` expression.
from pyspark.sql import types as T, functions as F, SparkSession

spark = SparkSession.builder.getOrCreate()

# Build a small DataFrame to demonstrate the conditional column.
# All fields are non-nullable; the two string key columns are the ones
# the when/otherwise expression below compares.
field_specs = [
    ("type_description", T.StringType()),
    ("rename_description", T.StringType()),
    ("col_3", T.StringType()),
    ("col_4", T.IntegerType()),
]
schema = T.StructType(
    [T.StructField(name, dtype, False) for name, dtype in field_specs]
)
data = [
    {"type_description": "key_1", "rename_description": "key_2", "col_3": "CREATE", "col_4": 0},
    {"type_description": "key_2", "rename_description": "key_2", "col_3": "CREATE", "col_4": 0},
    {"type_description": "key_3", "rename_description": "OVERRIDE", "col_3": "CREATE", "col_4": 0},
]
df = spark.createDataFrame(data, schema)
df.show()
"""
+----------------+------------------+------+-----+
|type_description|rename_description| col_3|col_4|
+----------------+------------------+------+-----+
|           key_1|             key_2|CREATE|    0|
|           key_2|             key_2|CREATE|    0|
|           key_3|          OVERRIDE|CREATE|    0|
+----------------+------------------+------+-----+
"""

# When the two description columns match, keep type_description;
# otherwise fall back to rename_description. F.when(...) alone yields
# NULL for non-matching rows — .otherwise(...) supplies the fallback.
matches = F.col("type_description") == F.col("rename_description")
SUMMARY_DF = df.withColumn(
    "final_description",
    F.when(matches, F.col("type_description")).otherwise(
        F.col("rename_description")
    ),
)
SUMMARY_DF.show()
"""
+----------------+------------------+------+-----+-----------------+
|type_description|rename_description| col_3|col_4|final_description|
+----------------+------------------+------+-----+-----------------+
|           key_1|             key_2|CREATE|    0|            key_2|
|           key_2|             key_2|CREATE|    0|            key_2|
|           key_3|          OVERRIDE|CREATE|    0|         OVERRIDE|
+----------------+------------------+------+-----+-----------------+
"""