I would like to read a Kudu table in Spark 3 (kudu-spark3_2.12), but I cannot read the table even though I have tried very hard. Could you please help me? I followed https://medium.com/@sciencecommitter/how-to-read-from-and-write-to-kudu-tables-in-pyspark-via-impala-c4334b98cf05 , https://kudu.apache.org/releases/1.10.1/docs/developing.html and https://gist.github.com/zkrhm/1e0b2fa9ac2daa44c25cda5963b2e386 , but my setup still does not work. Thanks in advance. BR.
### 1. My environment is set up as follows:
# Environment setup: force UTF-8 I/O and point PySpark at the Anaconda
# interpreter and the Spark 3 parcel shipped with the RAPIDS build.
import sys, os

os.environ["PYTHONIOENCODING"] = "utf-8"
os.environ['PYSPARK_PYTHON'] = "/opt/cloudera/parcels/anaconda/bin/python"
os.environ['SPARK_HOME'] = "/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/"
os.environ['PYSPARK_DRIVER_PYTHON'] = "/opt/cloudera/parcels/anaconda/bin/python"
#os.environ["PYSPARK_SUBMIT_ARGS"] = " --packages org.apache.kudu:kudu-spark3_2.12:15.0.7.1.7 pyspark-shell"
# NOTE: I was not sure whether the PYSPARK_SUBMIT_ARGS variable above should
# be set or not; I tried it both ways and neither let me read the Kudu table.

# Make the Spark 3 parcel, its bundled py4j, and the CDH libs importable.
for _parcel_path in (
    "/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/",
    "/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/lib/py4j-0.10.9-src.zip",
    "/opt/cloudera/parcels/CDH/lib/",
):
    sys.path.append(_parcel_path)
### 2. Imports
# Imports. findspark.init must run before any pyspark import so the parcel's
# Spark 3 distribution is the one that gets loaded.
# Fixes vs. the original: removed the exact duplicate imports
# (`from pyspark.sql import SparkSession` and `import numpy as np` each
# appeared twice) and grouped the remaining imports.
import findspark
findspark.init("/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/")

import time

import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SparkSession, functions
from pyspark.sql import functions as f
from pyspark.sql.functions import *
from pyspark.sql.functions import broadcast
from pyspark.sql.types import *
### 3. Create the SparkSession
# Build the SparkSession.
#
# Why the original failed: `NoClassDefFoundError: org/apache/spark/sql/sources/
# v2/DataSourceV2` means a *Spark 2* Kudu connector was loaded — that class was
# removed/moved in Spark 3, so any kudu-spark2 jar cannot initialize. Fixes:
#   * 'spark.packages' is not a real Spark configuration key; the correct key
#     is 'spark.jars.packages', and a Maven coordinate must include a version
#     (group:artifact:version), e.g. org.apache.kudu:kudu-spark3_2.12:1.15.0.
#   * Verify the jar given in spark.jars really is the Spark 3 connector —
#     the CDH parcel path frequently points at the kudu-spark2 build, which
#     would reproduce exactly this error. Prefer one mechanism (jar OR
#     packages), not both, to avoid classpath conflicts.
spark = SparkSession.builder.appName("tets").enableHiveSupport() \
    .config("spark.jars", "/opt/cloudera/parcels/CDH/lib/kudu/kudu-spark3_2.12.jar") \
    .config('spark.jars.packages', 'org.apache.kudu:kudu-spark3_2.12:1.15.0') \
    .config('hive.exec.dynamic.partition.mode', 'nonstrict') \
    .config('hive.groupby.skewindata', 'false') \
    .getOrCreate()
### 4. Read the Kudu table
# Read the Kudu table through the kudu-spark DataSource, passing the masters
# and the Impala-managed table name as reader options.
kudu_options = {
    'kudu.master': "master-1,master-2,master-3",
    'kudu.table': "impala::db.kudu_table",
}
kuduDF = (
    spark.read
    .format('org.apache.kudu.spark.kudu')
    .options(**kudu_options)
    .load()
)
I get the error below after running step 4:
`---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
/tmp/ipykernel_37711/3874063836.py in <module>
2 .format('org.apache.kudu.spark.kudu') \
3 .option('kudu.master',"master-1,master-2,master-3") \
----> 4 .option('kudu.table', "impala::db.kudu_table") \
5 .load()
/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
208 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
209 else:
--> 210 return self._df(self._jreader.load())
211
212 def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/pyspark/sql/utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o146.load.
: java.lang.NoClassDefFoundError: org/apache/spark/sql/sources/v2/DataSourceV2
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:756)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:468)
at java.net.URLClassLoader.access$100(URLClassLoader.java:74)
at java.net.URLClassLoader$1.run(URLClassLoader.java:369)
at java.net.URLClassLoader$1.run(URLClassLoader.java:363)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:362)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
at java.lang.ClassLoader.loadClass(ClassLoader.java:405)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:370)
at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:44)
at scala.collection.Iterator.foreach(Iterator.scala:941)
at scala.collection.Iterator.foreach$(Iterator.scala:941)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:255)
at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:249)
at scala.collection.AbstractTraversable.filterImpl(Traversable.scala:108)
at scala.collection.TraversableLike.filter(TraversableLike.scala:347)
at scala.collection.TraversableLike.filter$(TraversableLike.scala:347)
at scala.collection.AbstractTraversable.filter(Traversable.scala:108)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:659)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:743)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:266)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:226)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.sources.v2.DataSourceV2
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
... 46 more`