I need to multiply a matrix by a vector in PySpark. As I understand it, this should be possible with PySpark's BlockMatrix.multiply() method. However, I'm unable to even create a BlockMatrix in the first place. Here is a simplified version of my code:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import BlockMatrix
spark = SparkSession.builder.getOrCreate()
rdd = spark.sparkContext.parallelize([[0,1], [1,2]])
blockMatrx = BlockMatrix(rdd, 4000, 4000)
It raises the following error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-4-360ec98840e3> in <module>
1 rdd = spark.sparkContext.parallelize([[0,1], [1,2]])
----> 2 blockMatrx = BlockMatrix(rdd, 4000, 4000)
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\mllib\linalg\distributed.py in __init__(self, blocks, rowsPerBlock, colsPerBlock, numRows, numCols)
1215 # ((blockRowIndex, blockColIndex), sub-matrix) tuples on
1216 # the Scala side.
-> 1217 java_matrix = callMLlibFunc("createBlockMatrix", blocks.toDF(),
1218 int(rowsPerBlock), int(colsPerBlock),
1219 int(numRows), int(numCols))
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\sql\session.py in toDF(self, schema, sampleRatio)
64 [Row(name='Alice', age=1)]
65 """
---> 66 return sparkSession.createDataFrame(self, schema, sampleRatio)
67
68 RDD.toDF = toDF
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\sql\session.py in createDataFrame(self, data, schema, samplingRatio, verifySchema)
673 return super(SparkSession, self).createDataFrame(
674 data, schema, samplingRatio, verifySchema)
--> 675 return self._create_dataframe(data, schema, samplingRatio, verifySchema)
676
677 def _create_dataframe(self, data, schema, samplingRatio, verifySchema):
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\sql\session.py in _create_dataframe(self, data, schema, samplingRatio, verifySchema)
696
697 if isinstance(data, RDD):
--> 698 rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
699 else:
700 rdd, schema = self._createFromLocal(map(prepare, data), schema)
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\sql\session.py in _createFromRDD(self, rdd, schema, samplingRatio)
484 """
485 if schema is None or isinstance(schema, (list, tuple)):
--> 486 struct = self._inferSchema(rdd, samplingRatio, names=schema)
487 converter = _create_converter(struct)
488 rdd = rdd.map(converter)
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\sql\session.py in _inferSchema(self, rdd, samplingRatio, names)
458 :class:`pyspark.sql.types.StructType`
459 """
--> 460 first = rdd.first()
461 if not first:
462 raise ValueError("The first row in RDD is empty, "
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\rdd.py in first(self)
1584 ValueError: RDD is empty
1585 """
-> 1586 rs = self.take(1)
1587 if rs:
1588 return rs[0]
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\rdd.py in take(self, num)
1564
1565 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1566 res = self.context.runJob(self, takeUpToNumLeft, p)
1567
1568 items += res
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
1231 # SparkContext#runJob.
1232 mappedRDD = rdd.mapPartitions(partitionFunc)
-> 1233 sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
1234 return list(_load_from_socket(sock_info, mappedRDD._jrdd_deserializer))
1235
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\pyspark\sql\utils.py in deco(*a, **kw)
109 def deco(*a, **kw):
110 try:
--> 111 return f(*a, **kw)
112 except py4j.protocol.Py4JJavaError as e:
113 converted = convert_exception(e.java_exception)
C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python37_64\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 1) (192.168.2.79 executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:182)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:174)
... 14 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2223)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2242)
at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:182)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:107)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:119)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:145)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:174)
... 14 more
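From the comment visible in the traceback, the blocks RDD is apparently supposed to contain ((blockRowIndex, blockColIndex), sub-matrix) tuples rather than plain lists. My best guess at a valid construction, using the same 2x2 values wrapped in a dense block (the block sizes here are made up), is the sketch below, although given the "Python worker failed to connect back" message I'm not sure the data layout is the only problem:
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix

# Guess based on the docstring: each element is ((blockRowIndex, blockColIndex), sub-matrix),
# with the sub-matrix values given in column-major order (same 2x2 data as the rdd above)
blocks = spark.sparkContext.parallelize([
    ((0, 0), Matrices.dense(2, 2, [0, 1, 1, 2])),
])
blockMatrx = BlockMatrix(blocks, 2, 2)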
Background
The reason for multiplying the matrix by the vector is to implement the PageRank algorithm in parallel. The sequential version, which I've implemented successfully, does something like this:
...
for i in range(num_iterations):
    v = M_hat @ v
return v
Here, v is an eigenvector and M_hat is the sparse matrix of pages and the links between them. I haven't been able to build the intuition for how this program can be run in parallel using RDDs, even though people do give examples. Specifically, I don't understand how the loop above is executed in parallel.
I know that BlockMatrix is not meant for sparse matrices, but multiply() has not been implemented for the sparse distributed matrix types in PySpark. The ways of multiplying matrices in PySpark that I've researched (1)(2)(3) are either ones I can't understand or ones that error out.
For full disclosure, this is an assignment. However, I've been researching for about three days with little success; the teacher only introduced Spark briefly, and I'm very weak at mathematical matrix operations.
Question
How can I make this line work:
blockMatrx = BlockMatrix(rdd, 4000, 4000)
Update
This answer looks promising:
import numpy as np
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import *
spark = SparkSession.builder.getOrCreate()
rows_1 = spark.sparkContext.parallelize([[1, 2], [4, 5], [7, 8]])
rows_2 = spark.sparkContext.parallelize([[1, 2], [4, 5]])
def as_block_matrix(rdd, rowsPerBlock=1024, colsPerBlock=1024):
    return IndexedRowMatrix(
        rdd.zipWithIndex().map(lambda xi: IndexedRow(xi[1], xi[0]))
    ).toBlockMatrix(rowsPerBlock, colsPerBlock)

as_block_matrix(rows_1).multiply(as_block_matrix(rows_2))
However, I still get a SparkException: Job aborted due to stage failure error on the line rdd.zipWithIndex().map(lambda xi: IndexedRow(xi[1], xi[0])).
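For completeness, once the multiplication works I'd expect to pull a small result back to the driver with something like:
result = as_block_matrix(rows_1).multiply(as_block_matrix(rows_2))
print(result.toLocalMatrix())  # only reasonable when the result fits in driver memory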