While testing a deployed endpoint in Azure ML, I am getting an error that appears to be related to how the input data is read.
Steps followed:
1. Train and test a gradient boosting model and save it to a model .pkl file.
2. Register the model on Azure ML and deploy it with a deployment configuration (a sketch of this step is shown below).
3. Write score.py with the init() and run() functions.
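For step 2, the registration and deployment follow the standard SDK v1 pattern; this is a minimal sketch rather than my exact code (the environment name, conda file, service name, and ACI sizing are placeholders):

from azureml.core import Workspace, Environment
from azureml.core.model import Model, InferenceConfig
from azureml.core.webservice import AciWebservice

ws = Workspace.from_config()

# Register the pickled model so score.py can resolve it by name.
model = Model.register(workspace=ws,
                       model_path="outputs/fwrk2.pkl",   # assumed path of the serialized model
                       model_name="fwrk2")               # must match Model.get_model_path('fwrk2') in score.py

# Environment for scoring (placeholder name and conda file).
env = Environment.from_conda_specification(name="gbm-env", file_path="environment.yml")

inference_config = InferenceConfig(entry_script="score.py", environment=env)
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

service = Model.deploy(workspace=ws,
                       name="gbm-endpoint",              # placeholder service name
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config)
service.wait_for_deployment(show_output=True)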
train.py code:
%%writefile $script_folder/train.py
import argparse
import os
import numpy as np
import pandas as pd
from sklearn.svm import SVC
import joblib
import pickle
from azureml.core import Workspace, Dataset, Experiment
from azureml.core import Run
import re
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
import math
#ws = Workspace.from_config()
#az_dataset = Dataset.get_by_name(ws, 'pricing')
# let user feed in 2 parameters, the location of the data files (from datastore), and the regularization rate of the logistic regression model
#parser = argparse.ArgumentParser()
#parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
#parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate')
#args = parser.parse_args()
train_data = pd.read_csv("C:\\Users\\abhay\\Downloads\\Projects_DataScience\\Ensemble_Machine_Learning\\dataset\\train_update.csv")
column_datatypes = train_data.dtypes
categorical_columns = list(column_datatypes[column_datatypes=="object"].index.values)
continuous_columns = list(column_datatypes[column_datatypes=="float64"].index.values)
continuous_columns.remove('loss')
total_rows = train_data.shape[0]
columns_with_blanks_cat = np.random.randint(1,116,2)
columns_with_blanks_cont = np.random.randint(117,130,3)
columns_with_blank = np.append(columns_with_blanks_cat,columns_with_blanks_cont)
#for every column insert 5 blanks at random locations
for col in columns_with_blank:
    rows_with_blanks = np.random.randint(1, total_rows, 5)
    train_data.iloc[rows_with_blanks, col] = np.nan
class Data_preprocessing:
    def __init__(self, train_data):
        self.train_data = train_data

    def missing_value_continuous(self, column_names_with_specific_type, imputation_type="mean"):  # null value imputation for continuous columns
        if imputation_type == "mean":  # mean imputation
            mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
            mean_imputer.fit(self.train_data[column_names_with_specific_type])
            self.train_data[column_names_with_specific_type] = mean_imputer.transform(self.train_data[column_names_with_specific_type])
        if imputation_type == "median":  # median imputation
            median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
            median_imputer.fit(self.train_data[column_names_with_specific_type])
            self.train_data[column_names_with_specific_type] = median_imputer.transform(self.train_data[column_names_with_specific_type])
        return self.train_data

    def missing_value_categorical(self, column_names_with_specific_type, imputation_type="most_frequent"):  # impute missing categorical values
        most_frequent = SimpleImputer(strategy="most_frequent")
        most_frequent.fit(self.train_data[column_names_with_specific_type])
        self.train_data[column_names_with_specific_type] = most_frequent.transform(self.train_data[column_names_with_specific_type])
        return self.train_data

    def outlier_treatment(self, Q1, Q3, IQR, columns_with_outlier, action):  # outlier treatment
        if action == "median":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                median_outlier = np.median(self.train_data[column_name])
                self.train_data.loc[self.train_data[((self.train_data[column_name] < (Q1[column_name] - (1.5 * IQR[column_name]))) | (self.train_data[column_name] > (Q3[column_name] + (1.5 * IQR[column_name]))))].index, column_name] = median_outlier
        if action == "mean":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                mean_outlier = np.mean(self.train_data[column_name])
                self.train_data.loc[self.train_data[((self.train_data[column_name] < (Q1[column_name] - (1.5 * IQR[column_name]))) | (self.train_data[column_name] > (Q3[column_name] + (1.5 * IQR[column_name]))))].index, column_name] = mean_outlier
        if action == "remove":
            for i in range(len(columns_with_outlier)):
                column_name = columns_with_outlier[i]
                self.train_data = self.train_data[~((self.train_data[column_name] < (Q1[column_name] - (1.5 * IQR[column_name]))) | (self.train_data[column_name] > (Q3[column_name] + (1.5 * IQR[column_name]))))]
        return self.train_data
column_names = np.array(train_data.columns)
Data_preprocessing_obj = Data_preprocessing(train_data)
train_data = Data_preprocessing_obj.missing_value_continuous(continuous_columns,"median")
train_data = Data_preprocessing_obj.missing_value_categorical(categorical_columns)
columns_with_outlier = ['cont7','cont9','cont10']
Q1 = train_data[continuous_columns].quantile(0.25)
Q3 = train_data[continuous_columns].quantile(0.75)
IQR = (Q3-Q1)
train_data = Data_preprocessing_obj.outlier_treatment(Q1,Q3,IQR,columns_with_outlier,"median")
def feature_selection_numerical_variables(train_data, qthreshold, corr_threshold, exclude_numerical_cols_list):
    num_colums = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerical_columns = list(train_data.select_dtypes(include=num_colums).columns)
    numerical_columns = [column for column in numerical_columns if column not in exclude_numerical_cols_list]
    # remove variables with constant variance
    constant_filter = VarianceThreshold(threshold=0)
    constant_filter.fit(train_data[numerical_columns])
    constant_columns = [column for column in train_data[numerical_columns].columns
                        if column not in train_data[numerical_columns].columns[constant_filter.get_support()]]
    if len(constant_columns) > 0:
        train_data.drop(labels=constant_columns, axis=1, inplace=True)
    # remove the dropped columns from the numerical column list
    numerical_columns = [column for column in numerical_columns if column not in constant_columns]
    # remove quasi-constant variables
    qconstant_filter = VarianceThreshold(threshold=qthreshold)
    qconstant_filter.fit(train_data[numerical_columns])
    qconstant_columns = [column for column in train_data[numerical_columns].columns
                         if column not in train_data[numerical_columns].columns[qconstant_filter.get_support()]]
    if len(qconstant_columns) > 0:
        train_data.drop(labels=qconstant_columns, axis=1, inplace=True)
    # remove the dropped columns from the numerical column list
    numerical_columns = [column for column in numerical_columns if column not in qconstant_columns]
    # remove correlated variables
    correlated_features = set()
    correlation_matrix = train_data[numerical_columns].corr()
    ax = sns.heatmap(
        correlation_matrix,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True)
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=45,
        horizontalalignment='right')
    # print(correlation_matrix)
    for i in range(len(correlation_matrix.columns)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > corr_threshold:
                colname = correlation_matrix.columns[i]
                colcompared = correlation_matrix.columns[j]
                # check that the column compared against has not already been flagged
                if colcompared not in correlated_features:
                    correlated_features.add(colname)
    train_data.drop(labels=correlated_features, axis=1, inplace=True)
    return train_data, constant_columns, qconstant_columns, correlated_features
train_data, constant_columns, qconstant_columns, correlated_features = feature_selection_numerical_variables(train_data, 0.01, 0.75, ['loss', 'id'])
for cf1 in categorical_columns:
    le = LabelEncoder()
    le.fit(train_data[cf1].unique())
    filename = cf1 + ".sav"
    pickle.dump(le, open(filename, 'wb'))
    train_data[cf1] = le.transform(train_data[cf1])
# snippet to count the unique values within each categorical column
df = pd.DataFrame(columns=["Column_Name", "Count"])
for cat in categorical_columns:
    unique_value_count = len(train_data[cat].unique())
    df = df.append({'Column_Name': cat, "Count": int(unique_value_count)}, ignore_index=True)
columns_unique_value = np.array(df.Count.value_counts().index)
# snippet to identify dependent/correlated categorical variables and drop them
columns_to_drop_cat = set()
correlated_columns = dict()
for unique_value_count in columns_unique_value:
    if unique_value_count > 1:
        categorical_columns = df.loc[df.Count == unique_value_count, 'Column_Name']
        categorical_columns = categorical_columns.reset_index(drop=True)
        columns_length = len(categorical_columns)
        for col in range(columns_length - 1):
            column_to_compare = categorical_columns[col]
            columns_compare_against = categorical_columns[(col + 1):columns_length]
            chi_scores = chi2(train_data[columns_compare_against], train_data[column_to_compare])
            if column_to_compare not in columns_to_drop_cat:
                columns_to_be_dropped = [i for i in range(len(columns_compare_against)) if chi_scores[1][i] <= 0.05]
                columns_to_drop_array = np.array(columns_compare_against)[columns_to_be_dropped]
                correlated_columns[column_to_compare] = columns_to_drop_array
                columns_to_drop_cat.update(columns_to_drop_array)
train_data = train_data.drop(columns_to_drop_cat,axis=1)
correlated_features = list(correlated_features)
columns_to_drop_cat = list(columns_to_drop_cat)
columns_to_drop_cat.extend(correlated_features)
columns_to_drop = columns_to_drop_cat.copy()
#output the columns_to_drop file to a csv
columns_to_drop_df=pd.DataFrame(columns_to_drop,columns=['colnames'])
#columns_to_drop_df.to_csv("/model/columns_to_drop.csv",index=False)
train_data['loss'] = np.log(train_data['loss'])
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
# convert the int64 columns to categorical
Column_datatypes= train_data.dtypes
Integer_columns = list(Column_datatypes.where(lambda x: x =="int64").dropna().index.values)
train_data[Integer_columns] = train_data[Integer_columns].astype('category',copy=False)
X,y = train_data.drop(['id','loss'],axis=1),train_data['loss']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # perform train test split
ref_cols=X_train.columns
from sklearn.ensemble import GradientBoostingRegressor #GBM algorithm
gbm_base = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0)
trained_model=gbm_base.fit(X_train,y_train)
# Predict the outcome using Test data - Score Model
Y_test_predict_tuned = gbm_base.predict(X_test)
# Get the probability score - Scored Probabilities
#Y_prob = gbm_base.predict_proba(X_test)[:, 1]
# Evaluate the model - RMSE on the test set
score = np.sqrt(mean_squared_error(y_test, Y_test_predict_tuned))
#print('Export the model to model.pkl')
#f = open('fwrk2.pkl', 'wb')
#pickle.dump(trained_model, f)
#f.close()
#print('Import the model from model.pkl')
#f2 = open('fwrk2.pkl', 'rb')
#clf2 = pickle.load(f2)
#X_new = [[154, 54, 35]]
#print('New Sample:', X_new)
#print('Predicted class:', clf2.predict(X_new))
#os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
#joblib.dump(value=trained_model, filename='outputs/fwrk2.pkl')
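The export lines above are commented out; what I run before registering is essentially the following. Treat it as a sketch aligned with those commented-out lines (the outputs folder and the file name fwrk2.pkl are assumptions chosen to match what score.py expects):

# Export step (names follow the commented-out lines above; not verified).
import os
import joblib

os.makedirs('outputs', exist_ok=True)
# Files written to ./outputs are uploaded into the experiment record automatically when run on Azure ML.
joblib.dump(value=trained_model, filename='outputs/fwrk2.pkl')

# Persist the columns dropped during feature selection so the same schema can be applied at scoring time.
columns_to_drop_df.to_csv('outputs/columns_to_drop.csv', index=False)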
score.py code:
%%writefile score.py
import json
import numpy as np
import os
import pickle
import pandas as pd
import joblib
from sklearn.ensemble import GradientBoostingRegressor
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType
from azureml.core.model import Model
def init():
    global model
    # model = joblib.load('recommender.pkl')
    model_path = Model.get_model_path('fwrk2')
    model = joblib.load(model_path)
input_sample = pd.DataFrame(data=[{"cat1":0, "cat4": 0, "cat14": 0, "cat15": 0, "cat18": 0, "cat19": 0, "cat20": 0, "cat21": 0
, "cat22": 0, "cat35": 0, "cat42":0, "cat47": 0, "cat48": 0, "cat55": 0
, "cat56": 0, "cat58": 0, "cat59": 0, "cat60": 0, "cat61": 0, "cat62": 0
, "cat63": 0, "cat64": 0, "cat68": 0, "cat70": 0, "cat76": 0, "cat77":0
, "cat78": 0, "cat82": 0, "cat85": 0, "cat86": 0, "cat89": 0, "cat91": 0
, "cat92": 0, "cat93": 0, "cat94":0, "cat96": 0, "cat97": 0, "cat99": 0
, "cat100": 0, "cat101": 0, "cat103": 0, "cat105": 0, "cat107": 0, "cat109":0
, "cat110": 0, "cat111": 0, "cat112": 0, "cat113": 0, "cat116": 0, "cont1": 0
, "cont2": 0, "cont3": 0, "cont4": 0, "cont5": 0
, "cont6": 0, "cont7": 0, "cont8": 0, "cont14": 0}])
output_sample = np.array([0]) # This is a integer type sample. Use the data type that reflects the expected result
@input_schema('data', PandasParameterType(input_sample))
@output_schema(NumpyParameterType(output_sample))
def run(data):
    try:
        result = model.predict(data)
        # you can return any data type as long as it is JSON-serializable
        return result.tolist()
    except Exception as e:
        error = str(e)
        return error
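For reference, because of the @input_schema('data', ...) decorator, a raw REST call has to wrap the records in a "data" key. A minimal sketch of such a call (the scoring URI, key, and the truncated record are placeholders):

import json
import requests

scoring_uri = "<scoring-uri-from-the-endpoint-page>"    # placeholder
headers = {"Content-Type": "application/json"}          # add "Authorization": "Bearer <key>" if key auth is enabled

# Truncated record for illustration; the real payload carries all cat*/cont* columns expected by the model.
payload = {"data": [{"cat1": 0, "cat4": 0, "cont1": 0, "cont14": 0}]}
response = requests.post(scoring_uri, data=json.dumps(payload), headers=headers)
print(response.status_code, response.json())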
The endpoint is published successfully, and I can use the Test feature on the Azure portal to enter values. I submit the following input:
[{"cat1":0, "cat4": 0, "cat14": 0, "cat15": 0, "cat18": 0, "cat19": 0, "cat20": 0, "cat21": 0
, "cat22": 0, "cat35": 0, "cat42":0, "cat47": 0, "cat48": 0, "cat55": 0
, "cat56": 0, "cat58": 0, "cat59": 0, "cat60": 0, "cat61": 0, "cat62": 0
, "cat63": 0, "cat64": 0, "cat68": 0, "cat70": 0, "cat76": 0, "cat77":0
, "cat78": 0, "cat82": 0, "cat85": 0, "cat86": 0, "cat89": 0, "cat91": 0
, "cat92": 0, "cat93": 0, "cat94":0, "cat96": 0, "cat97": 0, "cat99": 0
, "cat100": 0, "cat101": 0, "cat103": 0, "cat105": 0, "cat107": 0, "cat109":0
, "cat110": 0, "cat111": 0, "cat112": 0, "cat113": 0, "cat116": 0, "cont1": 0
, "cont2": 0, "cont3": 0, "cont4": 0, "cont5": 0
, "cont6": 0, "cont7": 0, "cont8": 0, "cont14": 0}])
Error: "'GradientBoostingRegressor' object has no attribute 'n_features"
Please can someone guide what could be the problem in executing the above input sample? Is it related to the version of the package, and if yes, then how to update it and solve it?