I want to perform a PCA inside a function where a PySpark dataframe (Dim: 41 x 1707, long, double) goes in as an input parameter. The Vector Assembler seems to work, but after that I only get errors:
def pca(pyspark_df):
assembler = VectorAssembler(inputCols = pyspark_df.columns, outputCol = 'features')
vect_ass = assembler.transform(pyspark_df).select('features')
scaler = StandardScaler(
inputCol = 'features',
outputCol = 'scaledFeatures',
withMean = True,
withStd = True
).fit(vect_ass)
df_scaled = scaler.transform(vect_ass)
n_components = 2
pca = PCA(
k = n_components,
inputCol = 'scaledFeatures',
outputCol = 'pcaFeatures'
).fit(df_scaled)
df_pca = pca.transform(df_scaled)
return df_pca
Here are the errors:
Traceback (most recent call last):
File "pca", line 1, in <module>
File "pca", line 14, in pca
File "/tmp/conda-8fabdeb4-45e7-4c12-a4c8-92c28aad9d1f/real/envs/conda-env/lib/python3.6/site-packages/pyspark/ml/base.py", line 129, in fit
return self._fit(dataset)
File "/tmp/conda-8fabdeb4-45e7-4c12-a4c8-92c28aad9d1f/real/envs/conda-env/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 321, in _fit
java_model = self._fit_java(dataset)
File "/tmp/conda-8fabdeb4-45e7-4c12-a4c8-92c28aad9d1f/real/envs/conda-env/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 318, in _fit_java
return self._java_obj.fit(dataset._jdf)
File "/tmp/conda-8fabdeb4-45e7-4c12-a4c8-92c28aad9d1f/real/envs/conda-env/lib/python3.6/site-packages/py4j/java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/tmp/conda-8fabdeb4-45e7-4c12-a4c8-92c28aad9d1f/real/envs/conda-env/lib/python3.6/site-packages/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/tmp/conda-8fabdeb4-45e7-4c12-a4c8-92c28aad9d1f/real/envs/conda-env/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o4252.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 21.0 failed 4 times, most recent failure: Lost task 0.3 in stage 21.0 (TID 78, 10.0.75.115, executor 3): org.apache.spark.SparkException: Failed to execute user defined function(VectorAssembler$$Lambda$3889/0x0000000801a21040: (struct<sum_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,mean_c_mileage_delta:double,count_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,longest_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,mean_speed:double,max_speed:double,sum_c_session_duration_min_double_VectorAssembler_6e77f5ccfd0b:double,mean_c_session_duration_min:double,shortCount_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,shortSum_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,midCount_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,midSum_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,longCount_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,longSum_c_mileage_delta_double_VectorAssembler_6e77f5ccfd0b:double,Count_c_timeLOCAL_start_double_VectorAssembler_6e77f5ccfd0b:double,nightCount_double_VectorAssembler_6e77f5ccfd0b:double,morningCount_double_VectorAssembler_6e77f5ccfd0b:double,forenoonCount_double_VectorAssembler_6e77f5ccfd0b:double,noonCount_double_VectorAssembler_6e77f5ccfd0b:double,afternoonCount_double_VectorAssembler_6e77f5ccfd0b:double,eveningCount_double_VectorAssembler_6e77f5ccfd0b:double,sum_c_standingtime_after_drive_min:double,mean_c_standingtime_after_drive_min:double,mean_c_num_passengers:double,count_c_country_double_VectorAssembler_6e77f5ccfd0b:double,abs_mean_speed:double,eveningCountRel:double,afternoonCountRel:double,noonCountRel:double,forenoonCountRel:double,morningCountRel:double,nightCountRel:double,shortSum_c_mileage_delta_rel:double,midSum_c_mileage_delta_rel:double,longSum_c_mileage_delta_rel:double,shortCount_c_mileage_delta_rel:double,midCount_c_mileage_delta_rel:double,longCount_c_mileage_delta_rel:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.processInputs(ObjectAggregationIterator.scala:151)
at org.apache.spark.sql.execution.aggregate.ObjectAggregationIterator.<init>(ObjectAggregationIterator.scala:78)
at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2(ObjectHashAggregateExec.scala:129)
at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$2$adapted(ObjectHashAggregateExec.scala:107)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:859)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:859)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
...
What is wrong?