A literal answer, with help from How to know if an object is an instance of a TypeTag's type?, would be this:
var x = spark.table(...)

import org.apache.spark.sql.types._
import scala.reflect.{ClassTag, classTag}

// Returns the names of all columns whose DataType is a runtime instance of T.
def selectColumnsByType[T <: DataType : ClassTag](schema: StructType): Seq[String] = {
  schema.filter(field => classTag[T].runtimeClass.isInstance(field.dataType)).map(_.name)
}

selectColumnsByType[DecimalType](x.schema)
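To see what it returns, here's a quick sketch against a small made-up schema (the column names and types are purely illustrative, not from the question):

// Hypothetical schema, just to illustrate the helper above.
val schema = StructType(Seq(
  StructField("id", LongType),
  StructField("price", DecimalType(38, 18)),
  StructField("discount", DecimalType(10, 2))
))

selectColumnsByType[DecimalType](schema)  // Seq("price", "discount")
selectColumnsByType[StringType](schema)   // Seq() (no string columns)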
However, this form is definitely easier to use:
var x = spark.table(...)

import org.apache.spark.sql.types._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import scala.reflect.{ClassTag, classTag}
import scala.language.implicitConversions

class DataFrameHelpers(val df: DataFrame) {
  // Keeps only the columns whose DataType is a runtime instance of T.
  def selectColumnsByType[T <: DataType : ClassTag](): DataFrame = {
    val cols = df.schema
      .filter(field => classTag[T].runtimeClass.isInstance(field.dataType))
      .map(field => col(field.name))
    df.select(cols: _*)
  }
}

implicit def toDataFrameHelpers(df: DataFrame): DataFrameHelpers = new DataFrameHelpers(df)

x = x.selectColumnsByType[DecimalType]()
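As a side note, the class plus implicit def pair can be collapsed into a single implicit class on Scala 2.10+ (outside the REPL it has to live inside an object or class). A minimal sketch of the same helper, behaving identically to the version above:

import org.apache.spark.sql.types._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import scala.reflect.{ClassTag, classTag}

// Same behavior as DataFrameHelpers above, packaged as an implicit class.
implicit class RichDataFrame(val df: DataFrame) {
  def selectColumnsByType[T <: DataType : ClassTag](): DataFrame = {
    val cols = df.schema
      .filter(field => classTag[T].runtimeClass.isInstance(field.dataType))
      .map(field => col(field.name))
    df.select(cols: _*)
  }
}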
Note, though, as an earlier answer mentioned, isInstanceOf isn't really appropriate here, although it is helpful if you want to get all DecimalType columns regardless of precision and scale, as the sketch below shows.
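A minimal sketch of the difference (nothing here depends on a real table): the runtime-class check matches any DecimalType, while equality matches only the exact precision and scale.

import org.apache.spark.sql.types._
import scala.reflect.{ClassTag, classTag}

// Runtime-class check: any DecimalType matches, whatever the precision/scale.
classTag[DecimalType].runtimeClass.isInstance(DecimalType(10, 2))   // true
classTag[DecimalType].runtimeClass.isInstance(DecimalType(38, 18))  // true

// Equality check: only the exact precision/scale matches.
DecimalType(10, 2) == DecimalType(38, 18)   // false
DecimalType(38, 18) == DecimalType(38, 18)  // true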
Comparing the DataType with plain equality instead, you could do the following, which also lets you specify multiple types!
var x = spark.table(...)

import org.apache.spark.sql.types._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import scala.language.implicitConversions

class DataFrameHelpers(val df: DataFrame) {
  // Keeps only the columns whose DataType equals one of the given types.
  def selectColumnsByType(dt: DataType*): DataFrame = {
    val cols = df.schema
      .filter(field => dt.contains(field.dataType))
      .map(field => col(field.name))
    df.select(cols: _*)
  }
}

implicit def toDataFrameHelpers(df: DataFrame): DataFrameHelpers = new DataFrameHelpers(df)

x = x.selectColumnsByType(ShortType, DecimalType(38,18))
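With this equality version, precision and scale must match exactly: DecimalType(38,18) will not pick up a DecimalType(10,2) column. A quick sketch against a hypothetical two-column DataFrame (the data and names here are made up):

import org.apache.spark.sql.functions.lit

// Hypothetical DataFrame: one ShortType column and one DecimalType(10,2) column.
val df = spark.range(1).select(
  lit(1).cast(ShortType).as("s"),
  lit(1).cast(DecimalType(10, 2)).as("d")
)

df.selectColumnsByType(ShortType, DecimalType(10, 2)).columns  // Array(s, d)
df.selectColumnsByType(DecimalType(38, 18)).columns            // Array() (no exact match)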