I am writing a test program that streams file data to a Kafka topic using Spark 3.4.0.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object SparkTest {
  def main(args: Array[String]): Unit = {
    // Schema of the CSV files being streamed
    val schema = StructType(
      List(
        StructField("id", StringType, true),
        StructField("applicablechannel", StringType, true),
        StructField("value", StringType, true),
        StructField("associatedoffer", StringType, true),
        StructField("pyissue", StringType, true),
        StructField("controlgrouppercentage", StringType, true),
        StructField("direction", StringType, true)
      )
    )

    val spark = SparkSession.builder().appName("LoadFile")
      .master("local[*]")
      .config("spark.driver.memory", "1g")
      .getOrCreate()
    import spark.implicits._

    // Stream CSV files from a local folder
    val csvDF = spark.readStream.format("csv")
      .option("header", true)
      .schema(schema)
      .load("file:////C:\\Users\\XYZ\\Downloads\\files")
      .select("id", "value")

    // Publish the stream to a Kafka topic
    csvDF.writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "myserver.mydomain.com:9092")
      .option("topic", "KafkaDS")
      .option("checkpointLocation", "C:\\Users\\XYZ\\AppData\\Local\\Temp")
      .start()
      .awaitTermination()

    spark.close()
  }
}
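For reference, my understanding is that the Kafka sink publishes only the value column (plus the optional key, topic, partition and headers columns), so the id column selected above would not end up in the message on its own. A minimal sketch, continuing from csvDF above, that packs both columns into a JSON value payload (using JSON here is my assumption, not a requirement):

// Hypothetical projection: serialize id and value into the single "value" column the Kafka sink writes
import org.apache.spark.sql.functions.{col, struct, to_json}

val kafkaReadyDF = csvDF
  .select(to_json(struct(col("id"), col("value"))).alias("value"))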
I passed the following VM arguments (an sbt equivalent is sketched right after the list):
--add-opens=java.base/java.lang=ALL-UNNAMED
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
--add-opens=java.base/java.io=ALL-UNNAMED
--add-opens=java.base/java.net=ALL-UNNAMED
--add-opens=java.base/java.nio=ALL-UNNAMED
--add-opens=java.base/java.util=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
--add-opens=java.base/sun.nio.cs=ALL-UNNAMED
--add-opens=java.base/sun.security.action=ALL-UNNAMED
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED
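These flags can also be forwarded through build.sbt when running via sbt; a minimal sketch of what I mean (the flag list is just abbreviated from the one above):

// Hypothetical build.sbt fragment: fork the run JVM and pass the --add-opens flags to it
fork := true
javaOptions ++= Seq(
  "--add-opens=java.base/java.lang=ALL-UNNAMED",
  "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
  // ... plus the remaining --add-opens flags listed above
)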
The HADOOP_HOME environment variable points to a \bin folder containing winutils.exe. Below is my build.sbt file:
ThisBuild / version := "0.1.0-SNAPSHOT"
ThisBuild / scalaVersion := "2.13.11"

val sparkVersion = "3.4.0"
val sparkAndDependencies = Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVersion
)
libraryDependencies ++= sparkAndDependencies

lazy val root = (project in file("."))
  .settings(
    name := "SparkTestTake3"
  )
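Regarding the HADOOP_HOME setup above: as far as I know, NativeIO$Windows.access0 is implemented in hadoop.dll, not in winutils.exe, so a quick standalone check like the sketch below (the object name is mine, not part of the project) shows whether the native library can be loaded at all:

// Hypothetical standalone check: can the Hadoop native library (hadoop.dll) be loaded?
// NativeCodeLoader attempts System.loadLibrary("hadoop") in its static initializer.
object NativeLibCheck {
  def main(args: Array[String]): Unit = {
    println("HADOOP_HOME       = " + System.getenv("HADOOP_HOME"))
    println("hadoop.home.dir   = " + System.getProperty("hadoop.home.dir"))
    println("java.library.path = " + System.getProperty("java.library.path"))
    // false means hadoop.dll could not be loaded, e.g. it is missing from the bin folder / java.library.path
    println("native code loaded = " + org.apache.hadoop.util.NativeCodeLoader.isNativeCodeLoaded)
  }
}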
I get the following exception on the console:
Exception in thread "main" java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
at org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:225)
at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$1(HadoopFSUtils.scala:95)
at scala.collection.immutable.List.map(List.scala:246)
at scala.collection.immutable.List.map(List.scala:79)
at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFilesInternal(HadoopFSUtils.scala:85)
at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFiles(HadoopFSUtils.scala:69)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.bulkListLeafFiles(InMemoryFileIndex.scala:162)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:133)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:96)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:68)
at org.apache.spark.sql.execution.datasources.DataSource.createInMemoryFileIndex(DataSource.scala:539)
at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$sourceSchema$2(DataSource.scala:265)
at org.apache.spark.sql.execution.datasources.DataSource.tempFileIndex$lzycompute$1(DataSource.scala:162)
at org.apache.spark.sql.execution.datasources.DataSource.tempFileIndex$1(DataSource.scala:162)
at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:167)
at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:259)
at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:118)
at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:118)
at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:35)
at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:197)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:211)
at SparkTest$.main(SparkTest.scala:43)
at SparkTest.main(SparkTest.scala)
Searching for help on UnsatisfiedLinkError didn't turn up much. Am I missing something here? I tried using Spark 2.4.8, but that throws a different error.