I searched a lot for my problem but to no avail.
This is the runtime version I have on my Databricks cluster: 10.4 LTS ML (includes Apache Spark 3.2.1, Scala 2.12)
I have this code below:
with mlflow.start_run():
rf = RandomForestRegressor(labelCol='Duration',
featuresCol='scaled_features')
name = "Random Forest"
model = rf.fit(train)
predictions = model.transform(test)
evaluator_r2 = RegressionEvaluator(
labelCol="Duration", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
mlflow.log_param("Algorithm", name)
mlflow.log_metric("R2 score", r2)
# Convert to ONNX model
num_features = model.numFeatures # this is 13 features and for GBTree as well
#initial_type = buildInitialTypesSimple(scaled_data.select("scaled_features"))
initial_type = [('features', FloatTensorType([1, num_features]))]
onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)
# Log model
mlflow.onnx.log_model(onnx.SerializeToString(), registered_model_name="onnx_random_forest")
The model trains well, but when it arrives at the line "onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)", I get this error below:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<command-1060338014211318> in <module>
19 initial_type = [('features', FloatTensorType([1, num_features]))]
20
---> 21 onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)
22
23 # Log model
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/main.py in convert_sparkml(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
164
165 from .sparkml.convert import convert
--> 166 return convert(model, name, initial_types, doc_string, target_opset, targeted_onnx,
167 custom_conversion_functions, custom_shape_calculators, spark_session)
168
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/convert.py in convert(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
66
67 # Infer variable shapes
---> 68 topology.compile()
69
70 # Convert our Topology object into ONNX. The outcome is an ONNX model.
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in compile(self)
676 self._resolve_duplicates()
677 self._fix_shapes()
--> 678 self._infer_all_types()
679 self._check_structure()
680
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in _infer_all_types(self)
551 pass # in Keras converter, the shape calculator can be optional.
552 else:
--> 553 operator.infer_types()
554
555 def _resolve_duplicates(self):
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in infer_types(self)
105 def infer_types(self):
106 # Invoke a core inference function
--> 107 get_shape_calculator(self.type)(self)
108
109
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/operator_converters/decision_tree_regressor.py in calculate_decision_tree_regressor_output_shapes(operator)
31 def calculate_decision_tree_regressor_output_shapes(operator):
32 check_input_and_output_numbers(operator, input_count_range=1, output_count_range=1)
---> 33 N = operator.inputs[0].type.shape[0]
34 operator.outputs[0].type = FloatTensorType(shape=[N, 1])
35
AttributeError: 'NoneType' object has no attribute 'shape'
Here are the imports:
import pyspark.sql.functions as f
from pyspark.sql.types import *
import mlflow
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, date
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from onnxmltools import convert_sparkml
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple, FloatTensorType
import onnxmltools.convert.common.data_types
I used optionally
from skl2onnx.common.data_types import FloatTensorType
import onnxmltools.convert.common.data_types
from onnxmltools.convert.common.data_types import FloatTensorType
from py4j.java_gateway import java_import
java_import(spark._sc._jvm, "org.apache.spark.sql.api.python.*")
For Gradient-boosted tree I get a different error, but kind of pointing in the same direction.
with mlflow.start_run():
name = 'GBTree'
gbt = GBTRegressor(labelCol='Duration',
featuresCol='scaled_features')
model = gbt.fit(train)
predictions = model.transform(test)
evaluator_r2 = RegressionEvaluator(
labelCol="Duration", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
mlflow.log_param("Algorithm", name)
mlflow.log_metric("R2 score", r2)
# Convert to ONNX model
num_features = model.numFeatures
#initial_types = buildInitialTypesSimple(scaled_data.select("scaled_features"))
initial_type = [('features', FloatTensorType([None, num_features]))]
onnx = convert_sparkml(model=model, name="GBTree", initial_types=initial_type)
# Log model
mlflow.onnx.log_model(onnx.SerializeToString(), registered_model_name="onnx_GBTree")
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<command-3643615854794992> in <module>
20 initial_type = [('features', FloatTensorType([None, num_features]))]
21
---> 22 onnx = convert_sparkml(model=model, name="GBTree", initial_types=initial_type)
23
24 # Log model
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/main.py in convert_sparkml(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
164
165 from .sparkml.convert import convert
--> 166 return convert(model, name, initial_types, doc_string, target_opset, targeted_onnx,
167 custom_conversion_functions, custom_shape_calculators, spark_session)
168
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/convert.py in convert(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
66
67 # Infer variable shapes
---> 68 topology.compile()
69
70 # Convert our Topology object into ONNX. The outcome is an ONNX model.
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in compile(self)
676 self._resolve_duplicates()
677 self._fix_shapes()
--> 678 self._infer_all_types()
679 self._check_structure()
680
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in _infer_all_types(self)
551 pass # in Keras converter, the shape calculator can be optional.
552 else:
--> 553 operator.infer_types()
554
555 def _resolve_duplicates(self):
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in infer_types(self)
105 def infer_types(self):
106 # Invoke a core inference function
--> 107 get_shape_calculator(self.type)(self)
108
109
/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/operator_converters/gbt_classifier.py in calculate_gbt_classifier_output_shapes(operator)
66 def calculate_gbt_classifier_output_shapes(operator):
67 check_input_and_output_numbers(operator, input_count_range=1, output_count_range=[1, 2])
---> 68 check_input_and_output_types(operator, good_input_types=[FloatTensorType, Int64TensorType])
69 if len(operator.inputs[0].type.shape) != 2:
70 raise RuntimeError('Input must be a [N, C]-tensor')
/databricks/python/lib/python3.8/site-packages/onnxconverter_common/utils.py in check_input_and_output_types(operator, good_input_types, good_output_types)
320 for variable in operator.inputs:
321 if type(variable.type) not in good_input_types:
--> 322 raise RuntimeError('Operator %s (type: %s) got an input %s with a wrong type %s. Only %s are allowed'
323 % (operator.full_name, operator.type, variable.full_name, type(variable.type),
324 good_input_types))
RuntimeError: Operator pyspark_ml_regression_GBTRegressionModel (type: pyspark.ml.regression.GBTRegressionModel) got an input scaled_features with a wrong type <class 'NoneType'>. Only [<class 'onnxconverter_common.data_types.FloatTensorType'>, <class 'onnxconverter_common.data_types.Int64TensorType'>] are allowed
If I missed something, please tell me and I will try to provide it, thank you.