I am working on a PySpark project where I'm trying to fit a MultilayerPerceptronClassifier model to my text data using the fit method.I am using the Word2ve model provided bu Mllib to extract features . However, I keep running into an ArrayIndexOutOfBoundsException error when I try to run the fit method. Specifically, the error message says :
Py4JJavaError Traceback (most recent call last)
<ipython-input-1-d46ecefd3281> in <module>
41 # Use Word2Vec to generate word embeddings
42 word2Vec_pipeline = Pipeline(stages=[tokenizer, word2Vec, labelIndexer, mlp])
---> 43 word2Vec_model = mlp.fit(trainingData_word2Vec)
44 word2Vec_predictions = word2Vec_model.transform(testData_word2Vec)
45
~\Documents\spark-3.3.1-bin-hadoop3\python\pyspark\ml\base.py in fit(self, dataset, params)
203 return self.copy(params)._fit(dataset)
204 else:
--> 205 return self._fit(dataset)
206 else:
207 raise TypeError(
~\Documents\spark-3.3.1-bin-hadoop3\python\pyspark\ml\wrapper.py in _fit(self, dataset)
381
382 def _fit(self, dataset: DataFrame) -> JM:
--> 383 java_model = self._fit_java(dataset)
384 model = self._create_model(java_model)
385 return self._copyValues(model)
~\Documents\spark-3.3.1-bin-hadoop3\python\pyspark\ml\wrapper.py in _fit_java(self, dataset)
378
379 self._transfer_params_to_java()
--> 380 return self._java_obj.fit(dataset._jdf)
381
382 def _fit(self, dataset: DataFrame) -> JM:
~\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1319
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1323
~\Documents\spark-3.3.1-bin-hadoop3\python\pyspark\sql\utils.py in deco(*a, **kw)
188 def deco(*a: Any, **kw: Any) -> Any:
189 try:
--> 190 return f(*a, **kw)
191 except Py4JJavaError as e:
192 converted = convert_exception(e.java_exception)
~\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o181.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 39) (DESKTOP-HILGIEG executor driver): java.lang.ArrayIndexOutOfBoundsException
at java.lang.System.arraycopy(Native Method)
at org.apache.spark.ml.ann.DataStacker.$anonfun$stack$4(Layer.scala:665)
at org.apache.spark.ml.ann.DataStacker.$anonfun$stack$4$adapted(Layer.scala:664)
at scala.collection.immutable.List.foreach(List.scala:431)
at org.apache.spark.ml.ann.DataStacker.$anonfun$stack$3(Layer.scala:664)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:224)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1518)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1445)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1509)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1332)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:376)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:327)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
at org.apache.spark.rdd.RDD.count(RDD.scala:1274)
at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
at org.apache.spark.mllib.optimization.LBFGS.optimizeWithLossReturned(LBFGS.scala:154)
at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:855)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.$anonfun$train$1(MultilayerPerceptronClassifier.scala:228)
at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:184)
at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:93)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException
at java.lang.System.arraycopy(Native Method)
at org.apache.spark.ml.ann.DataStacker.$anonfun$stack$4(Layer.scala:665)
at org.apache.spark.ml.ann.DataStacker.$anonfun$stack$4$adapted(Layer.scala:664)
at scala.collection.immutable.List.foreach(List.scala:431)
at org.apache.spark.ml.ann.DataStacker.$anonfun$stack$3(Layer.scala:664)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:224)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:302)
at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1518)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1445)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1509)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1332)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:376)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:327)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
I'm not sure what's causing this error, but it seems like it might be related to how I'm handling my data or how I'm using PySpark. I've checked my data and my code, but I can't seem to find any null values . Can anyone provide some insight into what might be going wrong here and how I can fix it?
I traid to fit the MultilayerPerceptronClassifier of Mllib using features extracted by Word2vec and i keep getting java.lang.ArrayIndexOutOfBoundsException. Although I have checked the null values and I have none.