0

I would like to read a Kudu table in Spark 3 (kudu-spark3_2.12), but I cannot read the table even though I have tried very hard. Could you please help me? I tried to follow https://medium.com/@sciencecommitter/how-to-read-from-and-write-to-kudu-tables-in-pyspark-via-impala-c4334b98cf05, https://kudu.apache.org/releases/1.10.1/docs/developing.html, and https://gist.github.com/zkrhm/1e0b2fa9ac2daa44c25cda5963b2e386, but mine is not working. Thanks in advance. BR.

#### 1. My environment is set up like below:
`import sys, os
import platform

# Force UTF-8 for Python's stdio so the notebook/driver logs don't mojibake.
os.environ["PYTHONIOENCODING"] = "utf-8"
# Point both the executors' and the driver's Python at the Anaconda parcel
# so all processes run the same interpreter.
os.environ['PYSPARK_PYTHON'] = "/opt/cloudera/parcels/anaconda/bin/python"
# Use the Spark 3 (RAPIDS) parcel, not the default CDH Spark 2 install.
os.environ['SPARK_HOME'] = "/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/"
os.environ['PYSPARK_DRIVER_PYTHON'] = "/opt/cloudera/parcels/anaconda/bin/python"
# NOTE(review): "15.0.7.1.7" does not look like a valid kudu-spark3_2.12 Maven
# version (Cloudera builds are e.g. 1.15.0.7.1.7.x-y) -- verify the coordinate
# before re-enabling this line.
#os.environ["PYSPARK_SUBMIT_ARGS"] = " --packages org.apache.kudu:kudu-spark3_2.12:15.0.7.1.7 pyspark-shell" `

""" I was not sure whether I should  add above "PYSPARK_SUBMIT_ARGS" env or not, I tried both and then they were unsuccessful to read kudu table """

`sys.path.append("/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/")
sys.path.append("/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/lib/py4j-0.10.9-src.zip")
sys.path.append("/opt/cloudera/parcels/CDH/lib/")`
`#####2`

`import findspark

findspark.init("/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/")

from pyspark import SparkContext
from pyspark.sql import SparkSession,functions
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import broadcast
from matplotlib import pyplot as plt
import numpy as np
import matplotlib
import numpy as np
import pandas as pd
import time`

#3

`spark = SparkSession.builder.appName("tets").enableHiveSupport() \
.config("spark.jars", "/opt/cloudera/parcels/CDH/lib/kudu/kudu-spark3_2.12.jar")\
.config('spark.packages', 'org.apache.kudu:kudu-spark3_2.12')\
.config('hive.exec.dynamic.partition.mode','nonstrict')\
.config('hive.groupby.skewindata','false') \
.getOrCreate()`

`#4 `

`kuduDF = spark.read \
.format('org.apache.kudu.spark.kudu') \     
.option('kudu.master',"master-1,master-2,master-3") \
.option('kudu.table', "impala::db.kudu_table") \
.load()`

I got the error below after running command #4:

`---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
/tmp/ipykernel_37711/3874063836.py in <module>
      2     .format('org.apache.kudu.spark.kudu') \
      3     .option('kudu.master',"master-1,master-2,master-3") \
----> 4     .option('kudu.table', "impala::db.kudu_table") \
      5     .load()

/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
    208             return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
    209         else:
--> 210             return self._df(self._jreader.load())
    211 
    212     def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,

/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1303         answer = self.gateway_client.send_command(command)
   1304         return_value = get_return_value(
-> 1305             answer, self.gateway_client, self.target_id, self.name)
   1306 
   1307         for temp_arg in temp_args:

/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/pyspark/sql/utils.py in deco(*a, **kw)
    109     def deco(*a, **kw):
    110         try:
--> 111             return f(*a, **kw)
    112         except py4j.protocol.Py4JJavaError as e:
    113             converted = convert_exception(e.java_exception)

/opt/cloudera/parcels/SPARK3_RAPIDS/lib/spark3/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling o146.load.
: java.lang.NoClassDefFoundError: org/apache/spark/sql/sources/v2/DataSourceV2
    at java.lang.ClassLoader.defineClass1(Native Method)
    at java.lang.ClassLoader.defineClass(ClassLoader.java:756)
    at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
    at java.net.URLClassLoader.defineClass(URLClassLoader.java:468)
    at java.net.URLClassLoader.access$100(URLClassLoader.java:74)
    at java.net.URLClassLoader$1.run(URLClassLoader.java:369)
    at java.net.URLClassLoader$1.run(URLClassLoader.java:363)
    at java.security.AccessController.doPrivileged(Native Method)
    at java.net.URLClassLoader.findClass(URLClassLoader.java:362)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:405)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
    at java.lang.Class.forName0(Native Method)
    at java.lang.Class.forName(Class.java:348)
    at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:370)
    at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
    at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
    at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:44)
    at scala.collection.Iterator.foreach(Iterator.scala:941)
    at scala.collection.Iterator.foreach$(Iterator.scala:941)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1429)
    at scala.collection.IterableLike.foreach(IterableLike.scala:74)
    at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
    at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:255)
    at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:249)
    at scala.collection.AbstractTraversable.filterImpl(Traversable.scala:108)
    at scala.collection.TraversableLike.filter(TraversableLike.scala:347)
    at scala.collection.TraversableLike.filter$(TraversableLike.scala:347)
    at scala.collection.AbstractTraversable.filter(Traversable.scala:108)
    at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:659)
    at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:743)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:266)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:226)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
    at py4j.Gateway.invoke(Gateway.java:282)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:238)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.sources.v2.DataSourceV2
    at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
    ... 46 more`

0 Answers