I am trying to parse data from an XML file through Spark using the Databricks
spark-xml library.
Here is my code:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions
import java.text.Format
import org.apache.spark.sql.functions.concat_ws
import org.apache.spark.sql
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.plans.logical.With
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.functions.udf
import scala.sys.process._
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.functions._
/**
 * Loads an XML file through the `com.databricks.spark.xml` data source and
 * prints, for every `us-bibliographic-data-application` row, the value of
 * `application-reference.document-id.doc-number` twice (as `document` and
 * `document_number`).
 */
object printschema
{
  /**
   * Program entry point.
   *
   * @param args command-line arguments; not read by this program
   */
  def main(args: Array[String]): Unit =
  {
    val conf = new SparkConf().setAppName("printschema")
    // Fall back to a local master only when none was supplied. A master set
    // unconditionally in code would take precedence over the `--master`
    // option given to spark-submit.
    conf.setIfMissing("spark.master", "local")
    conf.set("spark.debug.maxToStringFields", "10000000")

    val context = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(context)
      import sqlContext.implicits._

      val applications = sqlContext.read.format("com.databricks.spark.xml")
        .option("rowTag", "us-bibliographic-data-application")
        .option("treatEmptyValuesAsNulls", true)
        .load("/Users/praveen/Desktop/ipa0105.xml")

      // Both output columns come from the same nested field, so the cast
      // expression is built once and reused.
      val docNumber =
        $"application-reference.document-id.doc-number".cast(sql.types.StringType)

      val rows = applications
        .withColumn("document", docNumber)
        .withColumn("document_number", docNumber)
        .select("document", "document_number")
        .collect()

      // Print an explicit tuple; the output format "(a,b)" is the same as the
      // one previously produced through argument auto-tupling.
      rows.foreach { row =>
        println((row.get(0), row.get(1)))
      }
    } finally {
      // Always release the context, even if loading or collecting fails.
      context.stop()
    }
  }
}
When I run the code in Scala IDE/IntelliJ IDEA, it works fine, and here is my output:
(14789882,14789882)
(14755945,14755945)
(14755919,14755919)
But when I build a jar and execute it using spark-submit,
it simply returns null values.
Output:
NULL,NULL
NULL,NULL
NULL,NULL
Here is my spark-submit command:
./spark-submit --jars /home/hadoop/spark-xml_2.11-0.4.0.jar --class inndata.praveen --master local[2] /home/hadoop/ip/target/scala-2.11/ip_2.11-1.0.jar