    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType

    spark = SparkSession.builder.getOrCreate()

    data2 = [("001", "m", 25, "medium", "East"),
             ("002", None, 22, "medium", "West"),
             ("003", "f", None, None, None),
             ("004", "m", None, "low", "South"),
             ("005", "f", 28, None, "East"),
             ("006", "m", 25, "medium", "East"),
             ("007", "m", 27, "high", None),
             ("008", None, 21, None, "North")]

    schema = StructType([
        StructField("code", StringType(), True),
        StructField("sex", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("level", StringType(), True),
        StructField("region", StringType(), True)
    ])

    df = spark.createDataFrame(data=data2, schema=schema)
    df.printSchema()
    df.show(truncate=False)

I want to filter the dataframe so that, among the columns sex, age, and level, exactly one value is null and the other two are not null.

This code works fine for me, but I think there is a smarter way than this many lines of code:

    df.filter(
        (df.sex.isNull() & df.age.isNotNull() & df.level.isNotNull())
        | (df.sex.isNotNull() & df.age.isNull() & df.level.isNotNull())
        | (df.sex.isNotNull() & df.age.isNotNull() & df.level.isNull())
    ).show()


  • Count `None` values in the row and select the rows that have exactly 1? Something like [this](https://stackoverflow.com/a/52864816/16698727) – David Rubin Jul 27 '22 at 19:20
  • Does this answer your question? [Counting number of nulls in pyspark dataframe by row](https://stackoverflow.com/questions/52864713/counting-number-of-nulls-in-pyspark-dataframe-by-row) – Kafels Jul 27 '22 at 19:34
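
A minimal sketch of the counting approach the comments describe, assuming the `df` defined above: cast each column's null indicator to an integer, sum the indicators per row, and keep rows where the sum is exactly 1.

    from pyspark.sql import functions as F

    cols = ["sex", "age", "level"]
    # isNull() yields a boolean Column; cast to int so the indicators can be summed per row
    null_count = sum(F.col(c).isNull().cast("int") for c in cols)
    # Keep rows where exactly one of the three columns is null
    df.filter(null_count == 1).show()

This scales to any number of columns by extending `cols`, instead of enumerating every null/not-null combination by hand.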

0 Answers