
A complete Python noob here. I managed to create a Spark DataFrame 'sparkDF', but I get an error when .show() is called on it. Also, I had to convert the JSON to a pandas DataFrame first and then to a Spark DataFrame; if anyone knows a more efficient way to load a JSON file into a Spark DataFrame, please let me know.

!pip install pyspark
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import json

spark = SparkSession.builder.appName("SparkTrial").config("spark.some.config.option", "some-value").getOrCreate()

# Load the JSON file and flatten the "results" array into a pandas DataFrame
with open('data.json') as f:
    data = json.load(f)

pddf = pd.json_normalize(data, "results")

sparkDF = spark.createDataFrame(pddf)

print(sparkDF) # output: DataFrame[v: double, vw: double, o: double, c: double, h: double, l: double, t: bigint, n: bigint]

sparkDF.show() # error occurs at this line
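
(On the side question: I suspect the pandas detour isn't needed and Spark can read the JSON directly, roughly as sketched below. This assumes data.json is a single multi-line JSON document with a top-level "results" array, which is what json_normalize(data, "results") implies; I haven't been able to verify it because of the error above.)

from pyspark.sql.functions import explode

# Read the whole file as one multi-line JSON document
raw = spark.read.json("data.json", multiLine=True)

# One row per element of the "results" array, then flatten the struct fields
sparkDF = raw.select(explode("results").alias("r")).select("r.*")
sparkDF.show()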

Error message:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
Input In [7], in <cell line: 1>()
----> 1 sparkDF.show()

File ~\anaconda3\lib\site-packages\pyspark\sql\dataframe.py:606, in DataFrame.show(self, n, truncate, vertical)
   603     raise TypeError("Parameter 'vertical' must be a bool")
   605 if isinstance(truncate, bool) and truncate:
--> 606     print(self._jdf.showString(n, 20, vertical))
   607 else:
   608     try:

File ~\anaconda3\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
  1315 command = proto.CALL_COMMAND_NAME +\
  1316     self.command_header +\
  1317     args_command +\
  1318     proto.END_COMMAND_PART
  1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
  1322     answer, self.gateway_client, self.target_id, self.name)
  1324 for temp_arg in temp_args:
  1325     temp_arg._detach()

File ~\anaconda3\lib\site-packages\pyspark\sql\utils.py:190, in capture_sql_exception.<locals>.deco(*a, **kw)
   188 def deco(*a: Any, **kw: Any) -> Any:
   189     try:
--> 190         return f(*a, **kw)
   191     except Py4JJavaError as e:
   192         converted = convert_exception(e.java_exception)

File ~\anaconda3\lib\site-packages\py4j\protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
   324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
   325 if answer[1] == REFERENCE_TYPE:
--> 326     raise Py4JJavaError(
   327         "An error occurred while calling {0}{1}{2}.\n".
   328         format(target_id, ".", name), value)
   329 else:
   330     raise Py4JError(
   331         "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
   332         format(target_id, ".", name, value))

Py4JJavaError: An error occurred while calling o46.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0) (192.168.0.19 executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
   at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
   at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
   at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
   at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
   at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
   at org.apache.spark.scheduler.Task.run(Task.scala:136)
   at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
   at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
   at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
   at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
   at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.net.SocketTimeoutException: Accept timed out
   at java.base/java.net.PlainSocketImpl.waitForNewConnection(Native Method)
   at java.base/java.net.PlainSocketImpl.socketAccept(PlainSocketImpl.java:163)
   at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
   at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
   at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
   at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
   ... 29 more

Driver stacktrace:
   at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
   at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
   at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
   at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
   at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
   at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
   at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
   at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
   at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
   at scala.Option.foreach(Option.scala:407)
   at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
   at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
   at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
   at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
   at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
   at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
   at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
   at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:506)
   at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
   at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
   at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
   at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
   at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
   at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
   at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
   at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
   at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
   at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
   at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
   at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
   at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
   at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
   at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
   at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
   at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
   at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
   at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
   at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
   at java.base/java.lang.reflect.Method.invoke(Method.java:566)
   at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
   at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
   at py4j.Gateway.invoke(Gateway.java:282)
   at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
   at py4j.commands.CallCommand.execute(CallCommand.java:79)
   at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
   at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
   at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
   at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
   at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
   at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
   at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
   at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
   at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
   at org.apache.spark.scheduler.Task.run(Task.scala:136)
   at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
   at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
   at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
   at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
   ... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
   at java.base/java.net.PlainSocketImpl.waitForNewConnection(Native Method)
   at java.base/java.net.PlainSocketImpl.socketAccept(PlainSocketImpl.java:163)
   at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
   at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
   at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
   at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
   ... 29 more
  • the error reads - *org.apache.spark.SparkException: Python worker failed to connect back.* - it can be due to multiple reasons, but you might want to check if your configs / paths are correct and reachable. see [this SO Q](https://stackoverflow.com/q/53252181/8279585) – samkart Oct 19 '22 at 06:29
  • @samkart Thank you! Adding the findspark.init() line worked for me – Jae Oct 20 '22 at 04:00
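
For reference, a minimal sketch of the fix hinted at in the comments: call findspark.init() before building the session, and point Spark at the same Python interpreter the notebook uses. It assumes findspark is installed (pip install findspark); the environment-variable lines are a common additional remedy for "Python worker failed to connect back" on Windows, not something confirmed by the thread.

import os
import sys
import findspark

# Make the Spark workers use the same interpreter as the notebook/driver
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# Locate the pyspark installation and set SPARK_HOME before creating the session
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkTrial").getOrCreate()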

0 Answers