0

There are a few posts on here about handling invalid characters in first-level column names, but none that cover multi-nested attributes.

I encountered this error with my multi-nested schema:

org.apache.spark.sql.AnalysisException: Attribute name "Foo Bar" contains invalid character(s) among " ,;{}()\n\t=". Please use alias to rename it.;
moon
  • 1,702
  • 3
  • 19
  • 35
  • Possible duplicate of [Spark Dataframe validating column names for parquet writes (scala)](https://stackoverflow.com/questions/38191157/spark-dataframe-validating-column-names-for-parquet-writes-scala) – Eugene Lopatkin Jul 09 '19 at 08:53

1 Answer

0

Here is my solution in Scala:

/**
 * Regex character class matching any single character that is not allowed in an
 * attribute name: space, comma, semicolon, curly braces, parentheses, newline,
 * tab and the equals sign.
 */
private val INVALID_ATTRIBUTE_CHARS: String = "[ ,;{}()\n\t=]"

/**
 * Builds a new schema by passing every top-level field of `structType`
 * through `cleanStructFld`.
 *
 * @param structType the schema to process
 * @return a new `StructType` made of the processed fields, in the original order
 */
def replaceBadAttriName(structType: StructType): StructType = {
  val cleanedFields = structType.fields.map(field => cleanStructFld(field))
  StructType(cleanedFields)
}

/**
 * Returns a copy of `fld` whose name, and the names of every field nested
 * inside its data type, have each character matched by
 * `INVALID_ATTRIBUTE_CHARS` replaced with an underscore.
 *
 * The field's own name is always sanitized, including when the field is itself
 * a struct. Nested structs are reached through struct fields, array element
 * types, and map key/value types. Nullability and metadata are preserved.
 *
 * NOTE(review): two distinct names that differ only in invalid characters
 * (e.g. "a b" and "a,b") will both become "a_b"; this function does not
 * detect or resolve such collisions.
 *
 * @param fld the field to sanitize
 * @return a new `StructField` with sanitized names at every nesting level
 */
private def cleanStructFld(fld: StructField): StructField = {
  // Local import keeps the block self-contained regardless of which Spark
  // type names the enclosing file already imports.
  import org.apache.spark.sql.types.{ArrayType, DataType, MapType}

  // Rebuilds a data type, sanitizing the field names of any struct found inside it.
  def cleanType(dataType: DataType): DataType = dataType match {
    case struct: StructType =>
      StructType(struct.fields.map(cleanStructFld))
    case ArrayType(elementType, containsNull) =>
      ArrayType(cleanType(elementType), containsNull)
    case MapType(keyType, valueType, valueContainsNull) =>
      MapType(cleanType(keyType), cleanType(valueType), valueContainsNull)
    case other =>
      other
  }

  val newName = fld.name.replaceAll(INVALID_ATTRIBUTE_CHARS, "_")
  StructField(newName, cleanType(fld.dataType), fld.nullable, fld.metadata)
}
moon
  • 1,702
  • 3
  • 19
  • 35