0

I am trying to build an application with the Spark Job Server API (for Spark 2.2.0). But I found that there is no support for namedObjects with SparkSession. My code looks like:

import com.typesafe.config.Config
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.scalactic._
import spark.jobserver.{NamedDataFrame, NamedObjectSupport, SparkSessionJob}
import spark.jobserver.api.{JobEnvironment, SingleProblem, ValidationProblem}

import scala.util.Try

// Spark Job Server job intended to persist a named object across job runs.
// NOTE(review): `NamedObjectSupport` is deprecated in the new job API — per
// NamedObjectSupport.scala in spark-jobserver, named-object access now comes
// from the `JobEnvironment` passed into the job, not from the job trait itself.
object word1 extends SparkSessionJob with NamedObjectSupport {
  type JobData = Seq[String]
  type JobOutput = String

// Parallelizes the input words, wraps the result as a NamedDataFrame cached in
// memory, registers it under "df1", and returns the registered names as a String.
def runJob(sparkSession: SparkSession, runtime: JobEnvironment, data: JobData): JobOutput =
{
  val df = sparkSession.sparkContext.parallelize(data)
  val ndf = NamedDataFrame(df, true, StorageLevel.MEMORY_ONLY)
  // BUG: does not compile — `this.namedObjects` is not defined on SparkSessionJob.
  // In the new api.SparkJobBase the named-object registry is exposed through the
  // JobEnvironment parameter: use `runtime.namedObjects` instead.
  this.namedObjects.update("df1", ndf)
  this.namedObjects.getNames().toString
}


 // Requires an "input.string" config entry; splits it on spaces into the JobData,
 // or reports a validation problem when the entry is missing.
 def validate(sparkSession: SparkSession, runtime: JobEnvironment, config: Config):
    JobData Or Every[ValidationProblem] = {
Try(config.getString("input.string").split(" ").toSeq)
  .map(words => Good(words))
  .getOrElse(Bad(One(SingleProblem("No input.string param"))))
   }  

}

But there is an error at the line `this.namedObjects.update()`. I think there is no support for namedObjects here, while the same code compiles with SparkJob:

object word1 extends SparkJob with NamedObjectSupport 

Is there support of namedObjects with sparksession ? If not then what is work around to persist dataframe/dataset ?

arglee
  • 1,374
  • 4
  • 17
  • 30

1 Answer

0

I figured it out — it was a silly mistake on my side. From https://github.com/spark-jobserver/spark-jobserver/blob/master/job-server-api/src/main/scala/spark/jobserver/NamedObjectSupport.scala#L138 , as it says:

// NamedObjectSupport is not needed anymore due to JobEnvironment in api.SparkJobBase. It is also
// imported into the old spark.jobserver.SparkJobBase automatically for compatibility.

@Deprecated
trait NamedObjectSupport

Therefore, to access this functionality, we need to modify the code into:

import com.typesafe.config.Config
import org.apache.spark.sql.SparkSession 
import org.apache.spark.storage.StorageLevel
import org.scalactic._
import spark.jobserver.{NamedDataFrame, NamedObjectSupport, SparkSessionJob}
import spark.jobserver.api.{JobEnvironment, SingleProblem, ValidationProblem}

import scala.util.Try

/** Spark Job Server job that registers the input words as the named object
  * "df1" (cached in memory) and returns the registry's known names.
  * Named-object access goes through the [[JobEnvironment]] (`runtime`), since
  * the `NamedObjectSupport` trait is deprecated in the new job API.
  */
object word1 extends SparkSessionJob with NamedObjectSupport {
  type JobData = Seq[String]
  type JobOutput = String

  /** Parallelizes the validated words, caches them under the name "df1" via
    * the runtime's named-object registry, and reports the registered names.
    */
  def runJob(sparkSession: SparkSession, runtime: JobEnvironment, data: JobData): JobOutput = {
    val wordsRdd = sparkSession.sparkContext.parallelize(data)
    val cached = NamedDataFrame(wordsRdd, true, StorageLevel.MEMORY_ONLY)
    runtime.namedObjects.update("df1", cached)
    runtime.namedObjects.getNames().toString
  }

  /** Accepts the job when config carries an "input.string" entry; the entry is
    * split on single spaces into the word sequence. A missing entry yields a
    * Bad with a single validation problem.
    */
  def validate(sparkSession: SparkSession, runtime: JobEnvironment, config: Config): JobData Or Every[ValidationProblem] =
    Try(config.getString("input.string"))
      .map(raw => Good(raw.split(" ").toSeq))
      .getOrElse(Bad(One(SingleProblem("No input.string param"))))
}
arglee
  • 1,374
  • 4
  • 17
  • 30