I wrote these four simple lines of code:
# Minimal PySpark example: build a one-row DataFrame from a local tuple.
#
# NOTE(review): the pasted Py4JError ("Constructor
# org.apache.spark.api.python.PythonRDD(...) does not exist") means the
# pip-installed `pyspark` package and the JVM-side Spark distribution
# (whatever SPARK_HOME / the classpath points at) are different versions —
# the Python side is invoking a JVM constructor whose signature changed
# between Spark releases. Align the two versions (e.g. uninstall the
# separate Spark install and rely only on pip's pyspark, or pip install the
# pyspark version matching SPARK_HOME) — TODO confirm local SPARK_HOME.
import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise create a new one.
spa = SparkSession.builder.getOrCreate()

# Bug fix: the original passed a 3-field row (1, 2, 3) with a 1-name schema
# ["count"]; the column-name list must have one name per tuple field.
spa.createDataFrame([(1, 2, 3)], ["a", "b", "count"])
but the `createDataFrame` call raises this long error:
Py4JError Traceback (most recent call last) in 3 spa = SparkSession.builder.getOrCreate() 4 ----> 5 spa.createDataFrame([(1,2,3)], ["count"])
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\pyspark\sql\session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema) 690 else: 691 rdd, schema = self._createFromLocal(map(prepare, data), schema) --> 692 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) 693 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) 694 df = DataFrame(jdf, self._wrapped)
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\pyspark\rdd.py in _to_java_object_rdd(self) 2294 """ 2295 rdd = self._pickled() -> 2296 return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True) 2297 2298 def countApprox(self, timeout, confidence=0.95):
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\pyspark\rdd.py in _jrdd(self) 2472
self._jrdd_deserializer, profiler) 2473 python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func, -> 2474 self.preservesPartitioning) 2475 self._jrdd_val = python_rdd.asJavaRDD() 2476c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\py4j\java_gateway.py in call(self, *args) 1523 answer = self._gateway_client.send_command(command) 1524
return_value = get_return_value( -> 1525 answer, self._gateway_client, None, self._fqn) 1526 1527 for temp_arg in temp_args:c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw) 61 def deco(*a, **kw): 62 try: ---> 63 return f(*a, **kw) 64 except py4j.protocol.Py4JJavaError as e: 65 s = e.java_exception.toString()
c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name) 330 raise Py4JError( 331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n". --> 332 format(target_id, ".", name, value)) 333 else: 334 raise Py4JError(
> Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonRDD. Trace: py4j.Py4JException: Constructor org.apache.spark.api.python.PythonRDD([class org.apache.spark.rdd.ParallelCollectionRDD, class org.apache.spark.api.python.PythonFunction, class java.lang.Boolean]) does not exist at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179) at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196) at py4j.Gateway.invoke(Gateway.java:237) at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80) at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:748)
Why is this happening? This code is essentially identical to examples in several tutorials, where it runs without error.