When saving an RDD to S3 in Avro format, I get the following warning in the console:
Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe.
I haven't been able to find a simple implicit such as saveAsAvroFile, so I dug around and came up with the following:
import org.apache.avro.Schema
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.{AvroJob, AvroKeyOutputFormat}
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD

object AvroUtil {

  def write[T](
      path: String,
      schema: Schema,
      avroRdd: RDD[T],
      job: Job = Job.getInstance()): Unit = {

    // Wrap each record in an AvroKey paired with NullWritable,
    // which is the shape AvroKeyOutputFormat expects.
    val intermediateRdd = avroRdd.mapPartitions(
      f = (iter: Iterator[T]) => iter.map(new AvroKey(_) -> NullWritable.get()),
      preservesPartitioning = true
    )

    // Enable Snappy-compressed output and register the writer schema on the job.
    job.getConfiguration.set("avro.output.codec", "snappy")
    job.getConfiguration.set("mapreduce.output.fileoutputformat.compress", "true")
    AvroJob.setOutputKeySchema(job, schema)

    intermediateRdd.saveAsNewAPIHadoopFile(
      path,
      classOf[AvroKey[T]],
      classOf[NullWritable],
      classOf[AvroKeyOutputFormat[T]],
      job.getConfiguration
    )
  }
}
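
For context, here is a minimal sketch of how it gets called; the schema, the records and the s3a path below are placeholders for illustration, not my real data:

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.spark.{SparkConf, SparkContext}

object AvroUtilExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("avro-write-example"))

    // Placeholder schema: a record with a single string field.
    val schemaJson =
      """{"type":"record","name":"Person","fields":[{"name":"name","type":"string"}]}"""

    // Build a few GenericRecords; the schema is parsed inside mapPartitions
    // so no Schema instance has to be captured in the closure.
    val records = sc.parallelize(Seq("alice", "bob")).mapPartitions { names =>
      val schema = new Schema.Parser().parse(schemaJson)
      names.map { n =>
        val rec: GenericRecord = new GenericData.Record(schema)
        rec.put("name", n)
        rec
      }
    }

    // Placeholder S3 destination.
    AvroUtil.write("s3a://my-bucket/output/people", new Schema.Parser().parse(schemaJson), records)
  }
}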
I'm rather baffled, as I don't see what is incorrect: the Avro files seem to be written out correctly.
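
As an aside, the kind of "simple implicit" I was hoping to find would look roughly like the sketch below, just delegating to the write method above (the names here are made up):

import org.apache.avro.Schema
import org.apache.spark.rdd.RDD

object AvroSyntax {
  // Hypothetical extension adding saveAsAvroFile to any RDD.
  implicit class AvroRddOps[T](rdd: RDD[T]) {
    def saveAsAvroFile(path: String, schema: Schema): Unit =
      AvroUtil.write(path, schema, rdd)
  }
}

// Usage: import AvroSyntax._ and then
//   records.saveAsAvroFile("s3a://my-bucket/output/people", schema)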