dept = [("001","m", 25, "medium", "East"),
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
data2 = [("001","m", 25, "medium", "East"),
("002", None, 22, "medium", "West"),
("003","f", None, None, None),
("004","m", None, "low", "South"),
("005","f", 28, None, "East"),
("006","m", 25, "medium", "East"),
("007","m", 27, "high", None),
("008",None, 21, None, "North")
]
schema = StructType([
    StructField("code", StringType(), True),
    StructField("sex", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("level", StringType(), True),
    StructField("region", StringType(), True)
])
df = spark.createDataFrame(data=data2, schema=schema)
df.printSchema()
df.show(truncate=False)
I want to filter the DataFrame so that, among the columns sex, age, and level, exactly one value is null and the other two are not null.
This code works fine for me, but I think there is a smarter way that takes fewer lines of code:
df.filter(
    (df.sex.isNull() & df.age.isNotNull() & df.level.isNotNull())
    | (df.sex.isNotNull() & df.age.isNull() & df.level.isNotNull())
    | (df.sex.isNotNull() & df.age.isNotNull() & df.level.isNull())
).show()
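For reference, one more compact variant I have been considering (a sketch, assuming pyspark.sql.functions is imported as F) casts each column's isNull() flag to an integer, sums the flags per row, and keeps rows where exactly one of the three columns is null:

from pyspark.sql import functions as F

cols = ["sex", "age", "level"]
# Each isNull() yields a boolean Column; cast to int so the flags can be summed.
# sum() over Columns builds a single expression counting nulls per row.
null_count = sum(F.col(c).isNull().cast("int") for c in cols)
df.filter(null_count == 1).show(truncate=False)

This scales to any number of columns by editing the cols list, instead of adding another isNull()/isNotNull() combination per column.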