
I am writing a test program to stream file data to a Kafka topic using Spark 3.4.0.




    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    object SparkTest {
      def main(args: Array[String]): Unit = {

        val schema = StructType(
          List(
            StructField("id", StringType, true),
            StructField("applicablechannel", StringType, true),
            StructField("value", StringType, true),
            StructField("associatedoffer", StringType, true),
            StructField("pyissue", StringType, true),
            StructField("controlgrouppercentage", StringType, true),
            StructField("direction", StringType, true)
          )
        )

        val spark = SparkSession.builder().appName("LoadFile")
          .master("local[*]")
          .config("spark.driver.memory", "1g")
          .getOrCreate()
        import spark.implicits._
        val csvDF = spark.readStream.format("csv")
          .option("header", true)
          .schema(schema)
          .load("file:////C:\\Users\\XYZ\\Downloads\\files")
          .select("id", "value")

        csvDF.writeStream
          .format("kafka")
          .option("kafka.bootstrap.servers", "myserver.mydomain.com:9092")
          .option("topic", "KafkaDS")
          .option("checkpointLocation", "C:\\Users\\XYZ\\AppData\\Local\\Temp")
          .start()
          .awaitTermination()

        spark.close()
      }
    }

I passed the following VM arguments:

--add-opens=java.base/java.lang=ALL-UNNAMED
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED
--add-opens=java.base/java.io=ALL-UNNAMED
--add-opens=java.base/java.net=ALL-UNNAMED
--add-opens=java.base/java.nio=ALL-UNNAMED
--add-opens=java.base/java.util=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED
--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED
--add-opens=java.base/sun.nio.cs=ALL-UNNAMED
--add-opens=java.base/sun.security.action=ALL-UNNAMED
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED
--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED
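
If the app is launched with sbt run rather than from the IDE, I believe these flags also have to be forwarded to the forked JVM; a sketch of the build.sbt lines that would do that (assuming sbt is the launcher):

    // build.sbt sketch: forward the --add-opens flags to the forked run JVM
    run / fork := true
    run / javaOptions ++= Seq(
      "--add-opens=java.base/java.lang=ALL-UNNAMED",
      "--add-opens=java.base/java.lang.invoke=ALL-UNNAMED"
      // ... and the remaining --add-opens flags from the list above
    )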

My HADOOP_HOME environment variable points to a \bin folder containing winutils.exe (a quick sanity check of that setup is sketched further down, after the build.sbt). Below is my build.sbt file:


    ThisBuild / version := "0.1.0-SNAPSHOT"

    ThisBuild / scalaVersion := "2.13.11"
    val sparkVersion = "3.4.0"

    val sparkAndDependencies = Seq(
      "org.apache.spark" %% "spark-core" % sparkVersion,
      "org.apache.spark" %% "spark-sql" % sparkVersion,
      "org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVersion
    )

    libraryDependencies ++= sparkAndDependencies

    lazy val root = (project in file("."))
      .settings(
        name := "SparkTestTake3"
      )
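
Regarding the HADOOP_HOME setup mentioned above: a small throwaway check like the one below (just a sketch, assuming the conventional HADOOP_HOME\bin\winutils.exe layout) prints what the JVM actually sees. As far as I understand, NativeIO on Windows is backed by hadoop.dll rather than winutils.exe, so the sketch looks for both.

    // Sketch: print what the JVM sees of the Hadoop-on-Windows setup.
    // hadoop.home.dir is the JVM property Hadoop falls back to when HADOOP_HOME is unset.
    object EnvCheck {
      def main(args: Array[String]): Unit = {
        val hadoopHome = sys.env.get("HADOOP_HOME").orElse(sys.props.get("hadoop.home.dir"))
        println(s"HADOOP_HOME / hadoop.home.dir: $hadoopHome")
        hadoopHome.foreach { home =>
          // assumes the conventional layout: %HADOOP_HOME%\bin\winutils.exe and hadoop.dll
          println("winutils.exe present: " + new java.io.File(home, "bin/winutils.exe").exists())
          println("hadoop.dll present:   " + new java.io.File(home, "bin/hadoop.dll").exists())
        }
        // hadoop.dll also has to be loadable, i.e. its folder must be on java.library.path / PATH
        println("java.library.path: " + System.getProperty("java.library.path"))
      }
    }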



I get the following exception on the console:



Exception in thread "main" java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
    at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
    at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
    at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
    at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
    at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
    at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
    at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
    at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
    at org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:225)
    at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$1(HadoopFSUtils.scala:95)
    at scala.collection.immutable.List.map(List.scala:246)
    at scala.collection.immutable.List.map(List.scala:79)
    at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFilesInternal(HadoopFSUtils.scala:85)
    at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFiles(HadoopFSUtils.scala:69)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.bulkListLeafFiles(InMemoryFileIndex.scala:162)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:133)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:96)
    at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:68)
    at org.apache.spark.sql.execution.datasources.DataSource.createInMemoryFileIndex(DataSource.scala:539)
    at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$sourceSchema$2(DataSource.scala:265)
    at org.apache.spark.sql.execution.datasources.DataSource.tempFileIndex$lzycompute$1(DataSource.scala:162)
    at org.apache.spark.sql.execution.datasources.DataSource.tempFileIndex$1(DataSource.scala:162)
    at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:167)
    at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:259)
    at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:118)
    at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:118)
    at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:35)
    at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:197)
    at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:211)
    at SparkTest$.main(SparkTest.scala:43)
    at SparkTest.main(SparkTest.scala)


Searching for help on UnsatisfiedLinkError didn't turn up much. Am I missing something here? I also tried Spark 2.4.8, but that throws a different error.
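
For reference, the error surfaces while Spark is listing the input folder, so I would expect even a plain batch read of the same path to hit it, independent of the Kafka sink. A minimal sketch of that check, reusing the session and schema from the program above:

    // Sketch: batch read of the same folder, to separate the file-listing
    // failure from the streaming/Kafka part of the job.
    val batchDF = spark.read
      .format("csv")
      .option("header", true)
      .schema(schema)
      .load("file:////C:\\Users\\XYZ\\Downloads\\files") // same path as the streaming job
    batchDF.show(5)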
