Trying to train a model on pyspark using elephas but keep getting the following error when fitting the model/estimator. Am using PySpark ML, using transformers to transform the data from raw form to vectorised form. Trying to use keras and elephas to train NN in a distributed way. Getting the following error when trying to fit using elephas Estimator:
An error was encountered:
Could not serialize object: TypeError: can't pickle weakref objects
Traceback (most recent call last):
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/base.py", line 161, in fit
return self._fit(dataset)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/pipeline.py", line 114, in _fit
model = stage.fit(dataset)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/ml/base.py", line 161, in fit
return self._fit(dataset)
File "/envs/PySpark/lib64/python3.7/site-packages/elephas/ml_model.py", line 101, in _fit
validation_split=self.get_validation_split())
File "/envs/PySpark/lib64/python3.7/site-packages/elephas/spark_model.py", line 185, in fit
self._fit(rdd, **kwargs)
File "/envs/PySpark/lib64/python3.7/site-packages/elephas/spark_model.py", line 220, in _fit
training_outcomes = rdd.mapPartitions(worker.train).collect()
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 949, in collect
sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2950, in _jrdd
self._jrdd_deserializer, profiler)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2828, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2814, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 447, in dumps
raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: TypeError: can't pickle weakref objects
Model:
# Determine input dimensionality from the first feature vector.
input_dim = len(self.training_data.select('features').first()[0])

# Configure the Keras model architecture.
model = Sequential()
model.add(Dense(256, input_shape=(input_dim, )))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(2))
model.add(Activation('sigmoid'))

# Compile so that to_json() captures a complete model definition.
model.compile(optimizer='adam',
loss='binary_crossentropy')

# Serialize the optimizer config; elephas rebuilds the optimizer on the workers.
adam = optimizers.Adam(lr=0.01)
opt_conf = optimizers.serialize(adam)

# Initialize the Spark ML estimator and set all relevant properties.
estimator = ElephasEstimator()
estimator.setFeaturesCol("features")
estimator.setLabelCol("label")
estimator.set_keras_model_config(model.to_json()) #provide serialized Keras model
estimator.set_nb_classes(2)
estimator.set_num_workers(1) #number of workers? How many?
estimator.set_epochs(20)
estimator.set_batch_size(64)
estimator.set_verbosity(1)
estimator.set_validation_split(0.2)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode("synchronous")
estimator.set_loss("binary_crossentropy")
# FIX: pass the metric by its string identifier, not as a metric instance.
# A tf.keras metric object such as BinaryAccuracy() holds weakrefs internally,
# and Spark must pickle the estimator's params to ship the training closure to
# the workers -> "TypeError: can't pickle weakref objects". The string form is
# plain-picklable and is resolved to the metric on each worker.
estimator.set_metrics(['accuracy'])

# Build the model pipeline.
self.model_pipeline = Pipeline(stages=[estimator])

# Train the model on the training data.
self.trained_model = self.model_pipeline.fit(self.training_data)
The output of model.to_json() looks like this:
'{"class_name": "Sequential", "config": {"name": "sequential_7", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 11104], "dtype": "float32", "sparse": false, "ragged": false, "name": "dense_11_input"}}, {"class_name": "Dense", "config": {"name": "dense_11", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 11104], "units": 256, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Activation", "config": {"name": "activation_6", "trainable": true, "dtype": "float32", "activation": "relu"}}, {"class_name": "Dropout", "config": {"name": "dropout_4", "trainable": true, "dtype": "float32", "rate": 0.2, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_12", "trainable": true, "dtype": "float32", "units": 128, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Activation", "config": {"name": "activation_7", "trainable": true, "dtype": "float32", "activation": "relu"}}, {"class_name": "Dropout", "config": {"name": "dropout_5", "trainable": true, "dtype": "float32", "rate": 0.2, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_13", "trainable": true, "dtype": "float32", "units": 2, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, 
"bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Activation", "config": {"name": "activation_8", "trainable": true, "dtype": "float32", "activation": "sigmoid"}}]}, "keras_version": "2.11.0", "backend": "tensorflow"}'
Not sure why it's not working...