0

I searched a lot for my problem but to no avail.

This is the runtime version I have on my Databricks cluster: 10.4 LTS ML (includes Apache Spark 3.2.1, Scala 2.12)

I have this code below:

with mlflow.start_run():
  rf = RandomForestRegressor(labelCol='Duration', 
                            featuresCol='scaled_features')
  name = "Random Forest"
  model = rf.fit(train)
  
  predictions = model.transform(test)
  
  evaluator_r2 = RegressionEvaluator(
  labelCol="Duration", predictionCol="prediction", metricName="r2")
  r2 = evaluator_r2.evaluate(predictions)
  
  mlflow.log_param("Algorithm", name)
  mlflow.log_metric("R2 score", r2)
  
  # Convert to ONNX model
  num_features = model.numFeatures # this is 13 features and for GBTree as well
  #initial_type = buildInitialTypesSimple(scaled_data.select("scaled_features"))
  initial_type = [('features', FloatTensorType([1, num_features]))]
  
  onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)

  # Log model
  mlflow.onnx.log_model(onnx.SerializeToString(), registered_model_name="onnx_random_forest")

The model trains well, but when it arrives at the line "onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)", I get this error below:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<command-1060338014211318> in <module>
     19   initial_type = [('features', FloatTensorType([1, num_features]))]
     20 
---> 21   onnx = convert_sparkml(model=model, name="Random Forest", initial_types=initial_type)
     22 
     23   # Log model

/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/main.py in convert_sparkml(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
    164 
    165     from .sparkml.convert import convert
--> 166     return convert(model, name, initial_types, doc_string, target_opset, targeted_onnx,
    167                    custom_conversion_functions, custom_shape_calculators, spark_session)
    168 

/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/convert.py in convert(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
     66 
     67     # Infer variable shapes
---> 68     topology.compile()
     69 
     70     # Convert our Topology object into ONNX. The outcome is an ONNX model.

/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in compile(self)
    676         self._resolve_duplicates()
    677         self._fix_shapes()
--> 678         self._infer_all_types()
    679         self._check_structure()
    680 

/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in _infer_all_types(self)
    551                 pass  # in Keras converter, the shape calculator can be optional.
    552             else:
--> 553                 operator.infer_types()
    554 
    555     def _resolve_duplicates(self):

/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in infer_types(self)
    105     def infer_types(self):
    106         # Invoke a core inference function
--> 107         get_shape_calculator(self.type)(self)
    108 
    109 

/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/operator_converters/decision_tree_regressor.py in calculate_decision_tree_regressor_output_shapes(operator)
     31 def calculate_decision_tree_regressor_output_shapes(operator):
     32     check_input_and_output_numbers(operator, input_count_range=1, output_count_range=1)
---> 33     N = operator.inputs[0].type.shape[0]
     34     operator.outputs[0].type = FloatTensorType(shape=[N, 1])
     35 

AttributeError: 'NoneType' object has no attribute 'shape'

Here are the imports:

import pyspark.sql.functions as f
from pyspark.sql.types import *
import mlflow
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns     
from datetime import datetime, date
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from onnxmltools import convert_sparkml
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple, FloatTensorType
import onnxmltools.convert.common.data_types 

I used optionally

from skl2onnx.common.data_types import FloatTensorType

import onnxmltools.convert.common.data_types 
from onnxmltools.convert.common.data_types import FloatTensorType

from py4j.java_gateway import java_import
java_import(spark._sc._jvm, "org.apache.spark.sql.api.python.*")

For Gradient-boosted tree I get a different error, but kind of pointing in the same direction.

with mlflow.start_run():
  name = 'GBTree'
  
  gbt = GBTRegressor(labelCol='Duration', 
                            featuresCol='scaled_features')
  model = gbt.fit(train)
  
  predictions = model.transform(test)
  
  evaluator_r2 = RegressionEvaluator(
  labelCol="Duration", predictionCol="prediction", metricName="r2")
  r2 = evaluator_r2.evaluate(predictions)
  
  mlflow.log_param("Algorithm", name)
  mlflow.log_metric("R2 score", r2)
  
  # Convert to ONNX model
  num_features = model.numFeatures
  #initial_types = buildInitialTypesSimple(scaled_data.select("scaled_features"))
  initial_type = [('features', FloatTensorType([None, num_features]))]
  
  onnx = convert_sparkml(model=model, name="GBTree", initial_types=initial_type)

  # Log model
  mlflow.onnx.log_model(onnx.SerializeToString(), registered_model_name="onnx_GBTree")

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<command-3643615854794992> in <module>
     20   initial_type = [('features', FloatTensorType([None, num_features]))]
     21 
---> 22   onnx = convert_sparkml(model=model, name="GBTree", initial_types=initial_type)
     23 
     24   # Log model

/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/main.py in convert_sparkml(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
    164 
    165     from .sparkml.convert import convert
--> 166     return convert(model, name, initial_types, doc_string, target_opset, targeted_onnx,
    167                    custom_conversion_functions, custom_shape_calculators, spark_session)
    168 

/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/convert.py in convert(model, name, initial_types, doc_string, target_opset, targeted_onnx, custom_conversion_functions, custom_shape_calculators, spark_session)
     66 
     67     # Infer variable shapes
---> 68     topology.compile()
     69 
     70     # Convert our Topology object into ONNX. The outcome is an ONNX model.

/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in compile(self)
    676         self._resolve_duplicates()
    677         self._fix_shapes()
--> 678         self._infer_all_types()
    679         self._check_structure()
    680 

/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in _infer_all_types(self)
    551                 pass  # in Keras converter, the shape calculator can be optional.
    552             else:
--> 553                 operator.infer_types()
    554 
    555     def _resolve_duplicates(self):

/databricks/python/lib/python3.8/site-packages/onnxconverter_common/topology.py in infer_types(self)
    105     def infer_types(self):
    106         # Invoke a core inference function
--> 107         get_shape_calculator(self.type)(self)
    108 
    109 

/databricks/python/lib/python3.8/site-packages/onnxmltools/convert/sparkml/operator_converters/gbt_classifier.py in calculate_gbt_classifier_output_shapes(operator)
     66 def calculate_gbt_classifier_output_shapes(operator):
     67     check_input_and_output_numbers(operator, input_count_range=1, output_count_range=[1, 2])
---> 68     check_input_and_output_types(operator, good_input_types=[FloatTensorType, Int64TensorType])
     69     if len(operator.inputs[0].type.shape) != 2:
     70         raise RuntimeError('Input must be a [N, C]-tensor')

/databricks/python/lib/python3.8/site-packages/onnxconverter_common/utils.py in check_input_and_output_types(operator, good_input_types, good_output_types)
    320         for variable in operator.inputs:
    321             if type(variable.type) not in good_input_types:
--> 322                 raise RuntimeError('Operator %s (type: %s) got an input %s with a wrong type %s. Only %s are allowed'
    323                                    % (operator.full_name, operator.type, variable.full_name, type(variable.type),
    324                                       good_input_types))

RuntimeError: Operator pyspark_ml_regression_GBTRegressionModel (type: pyspark.ml.regression.GBTRegressionModel) got an input scaled_features with a wrong type <class 'NoneType'>. Only [<class 'onnxconverter_common.data_types.FloatTensorType'>, <class 'onnxconverter_common.data_types.Int64TensorType'>] are allowed

If I missed something, please tell me and I will try to provide it, thank you.

0 Answers0