I have a requirement to write output files partitioned by key. I'm trying to use MultipleTextOutputFormat.
I found this https://stackoverflow.com/a/26051042/6561443
but when I try to do the same thing in spark-shell, I get the following error.
scala> import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.NullWritable
scala> import org.apache.spark._
import org.apache.spark._
scala> import org.apache.spark.SparkContext._
import org.apache.spark.SparkContext._
scala> import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat
scala> class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
override def generateActualKey(key: Any, value: Any): Any =
NullWritable.get()
override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
key.asInstanceOf[String]
}
<console>:11: error: not found: type MultipleTextOutputFormat
class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat[Any, Any] {
^
<console>:13: error: not found: value NullWritable
NullWritable.get()
If I instead submit this application with spark-submit, I get:
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 1
Am I missing something here? Does this approach not work in spark-shell?