-1

I wrote these four simple lines of code:

import pyspark
from pyspark.sql import SparkSession
# Obtain (or reuse) the process-wide SparkSession.
spa = SparkSession.builder.getOrCreate()

# NOTE(review): [(1,2,3)] is ONE row carrying three fields, while the schema
# names only a single column ("count") -- a row/schema arity mismatch.
# However, the Py4JError below complains about a missing PythonRDD
# constructor, which usually indicates that the installed pyspark package and
# the local Spark/Java runtime have mismatched versions -- TODO confirm
# versions before blaming the schema.
spa.createDataFrame([(1,2,3)], ["count"])

but that createDataFrame function is generating this huge error:

Py4JError Traceback (most recent call last) in 3 spa = SparkSession.builder.getOrCreate() 4 ----> 5 spa.createDataFrame([(1,2,3)], ["count"])

c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\pyspark\sql\session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema) 690 else: 691 rdd, schema = self._createFromLocal(map(prepare, data), schema) --> 692 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd()) 693 jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json()) 694 df = DataFrame(jdf, self._wrapped)

c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\pyspark\rdd.py in _to_java_object_rdd(self) 2294 """ 2295 rdd = self._pickled() -> 2296 return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True) 2297 2298 def countApprox(self, timeout, confidence=0.95):

c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\pyspark\rdd.py in _jrdd(self) 2472
self._jrdd_deserializer, profiler) 2473 python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func, -> 2474 self.preservesPartitioning) 2475 self._jrdd_val = python_rdd.asJavaRDD() 2476

c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\py4j\java_gateway.py in call(self, *args) 1523 answer = self._gateway_client.send_command(command) 1524
return_value = get_return_value( -> 1525 answer, self._gateway_client, None, self._fqn) 1526 1527 for temp_arg in temp_args:

c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw) 61 def deco(*a, **kw): 62 try: ---> 63 return f(*a, **kw) 64 except py4j.protocol.Py4JJavaError as e: 65 s = e.java_exception.toString()

c:\users\hp\appdata\local\programs\python\python37\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name) 330 raise Py4JError( 331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n". --> 332 format(target_id, ".", name, value)) 333 else: 334 raise Py4JError(

> Py4JError: An error occurred while calling None.org.apache.spark.api.python.PythonRDD. Trace: py4j.Py4JException: Constructor org.apache.spark.api.python.PythonRDD([class org.apache.spark.rdd.ParallelCollectionRDD, class org.apache.spark.api.python.PythonFunction, class java.lang.Boolean]) does not exist at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179) at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196) at py4j.Gateway.invoke(Gateway.java:237) at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80) at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:748)

Why is this happening? The code is practically identical to what other tutorials show, and it works fine there...

DobleR
  • 83
  • 1
  • 6
  • try this https://stackoverflow.com/questions/47674311/how-to-create-a-sample-spark-dataframe-in-python and see if you are still getting error. check versions of spark once – Sarath Chandra Vema Jan 09 '20 at 05:41
  • It will be successful in spark shell or submit the job " spark-submit yourcode.py" – Kannan Mar 14 '21 at 12:14

1 Answer

-1

Try this — it works. Place a trailing comma after each value when building the list, so that every element becomes a one-field tuple: each row then supplies exactly one value for the single "count" column.

import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession -- the modern single entry point; no
# separately-named SparkContext variable is required.
spa = SparkSession.builder.getOrCreate()

# Each row must be a tuple whose arity matches the schema: the trailing comma
# makes (1,) a one-element tuple, so every row supplies exactly one value for
# the lone "count" column.
# Fix: the original answer called sc.parallelize(...), but `sc` is never
# defined in this snippet (only `spa` is), which raises NameError.
# createDataFrame accepts a plain Python list of rows directly, so no RDD
# (and no SparkContext handle) is needed.
df = spa.createDataFrame([(1,), (2,), (3,)], ["count"])

Output:

+-----+
|count|
+-----+
|    1|
|    2|
|    3|
+-----+

Hope this helps!

abhaykagalkar
  • 52
  • 1
  • 5