0

All,

I am reading a MongoDB collection using Spark, and some of the data types are coming through as null. I found code that replaces NullType fields with StringType in PySpark, but I need the equivalent in Scala — can anyone help with how this can be done?

def fix_spark_schema(schema):
  """Recursively replace every NullType in a Spark schema with StringType.

  Walks StructType containers and StructField wrappers; any other data
  type is returned unchanged.
  """
  types = pyspark.sql.types
  if schema.__class__ == types.StructType:
    # Fix each field of the struct and rebuild the StructType.
    fixed_fields = [fix_spark_schema(field) for field in schema.fields]
    return types.StructType(fixed_fields)
  if schema.__class__ == types.StructField:
    # Keep name and nullability; only the inner dataType may change.
    return types.StructField(schema.name, fix_spark_schema(schema.dataType), schema.nullable)
  if schema.__class__ == types.NullType:
    return types.StringType()
  return schema
thebluephantom
  • 16,458
  • 8
  • 40
  • 83

1 Answer

0

Here is the Scala version:

  /** Returns a copy of `struct` in which every field's data type has been
    * run through `fixNullType`, preserving each field's name, nullability
    * and metadata.
    */
  def deNullifyStruct(struct: StructType): StructType = {
    val fixedFields = for (field <- struct.fields) yield
      StructField(field.name, fixNullType(field.dataType), field.nullable, field.metadata)
    StructType(fixedFields)
  }


  /** Replaces every NullType nested anywhere inside `dt` with StringType.
    *
    * Recurses through StructType, ArrayType and MapType so that NullType
    * columns buried in nested structures are also fixed; any other
    * DataType is returned unchanged.
    */
  def fixNullType(dt: DataType): DataType =
    dt match {
      // Bind the refined type in the pattern instead of isInstanceOf/asInstanceOf casts,
      // and drop the redundant `return` (the match is already the method's result).
      case st: StructType => deNullifyStruct(st)
      case ArrayType(elementType, containsNull) =>
        ArrayType(fixNullType(elementType), containsNull)
      // Also recurse into map key/value types, which the original version skipped —
      // a NullType inside a MapType would otherwise survive the fix.
      case MapType(keyType, valueType, valueContainsNull) =>
        MapType(fixNullType(keyType), fixNullType(valueType), valueContainsNull)
      case _: NullType => StringType
      case other => other
    }

// Load your DataFrame however you normally do (e.g. via the Mongo connector).
val df = // Initial df
// get new dataframe using modified schema
// Rebuilding from df.rdd keeps the same rows but applies the NullType-free schema.
val newDF = spark.createDataFrame(df.rdd, deNullifyStruct(df.schema))

Reference: "Writing null values to Parquet in Spark when the NullType is inside a StructType"

Mohana B C
  • 5,021
  • 1
  • 9
  • 28