To get the specific rules that a trained decision tree classifier applies to a given sample, we need to use the decision_path method: decision_path(X[, check_input]).
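For reference, I understand the documented pattern on plain numeric features to be roughly the following (a minimal sketch on the iris data, not my text data; clf, X_test and the other names are just illustrative):

# Minimal sketch of decision_path on plain numeric features, for reference only.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)

node_indicator = clf.decision_path(X_test)  # sparse (n_samples, n_nodes) indicator matrix
leaf_id = clf.apply(X_test)                 # id of the leaf reached by each sample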
Now, working on a short-text classification model, I have pipelined a FeatureUnion over the vectorized text features and applied a grid search to find an optimized model, as shown in the code below.
- This makes it difficult to pass the X features to the decision_path method, and I keep getting errors.
- Also, once this works, I would like to illustrate the path with the text features instead of the numeric vectorized features...
data, target = df['doc_text'], target_column
data_train, data_test, target_train, target_test, indices_train, indices_test = train_test_split(
    data, target, df.index, random_state=0)
# combine word and character n-gram features
vectorizer = FeatureUnion([
    ('word_vectorizer', TfidfVectorizer(
        sublinear_tf=True,
        min_df=2,
        #strip_accents='unicode',
        #encoding='latin-1'
        analyzer='word',
        #token_pattern=r'\w{1,}',
        ngram_range=(2, 5),
        norm='l2')),
        #dtype=np.float32,  # raises an error
        #max_features=6000)),
    ('char_vectorizer', TfidfVectorizer(
        sublinear_tf=True,
        min_df=5,
        stop_words='english',
        strip_accents='unicode',
        analyzer='char',
        ngram_range=(2, 5),
        norm='l2',
        #dtype=np.float32,
        max_features=8000))
])
pipelinedt = Pipeline([
    ("tfidf", vectorizer),
    ("clfdt", DecisionTreeClassifier(criterion="entropy", max_depth=7)),
])

tree_para = {
    "clfdt__max_depth": (7, 25, 100),
    "clfdt__min_samples_leaf": (1, 5, 10),
    "tfidf__word_vectorizer__max_df": (0.5, 0.75),
    "tfidf__word_vectorizer__min_df": (2,),
    "tfidf__char_vectorizer__max_df": (0.5, 0.75),
    "tfidf__char_vectorizer__min_df": (3,),
    "tfidf__char_vectorizer__use_idf": (True, False),
    "tfidf__word_vectorizer__use_idf": (True, False),
    "tfidf__word_vectorizer__ngram_range": ((1, 2), (2, 4)),
    "tfidf__char_vectorizer__ngram_range": ((4, 5),),
}
if __name__ == "__main__":
    dt = GridSearchCV(pipelinedt, tree_para, cv=5, n_jobs=-1, verbose=1)
    dt.fit(data_train, target_train)
    best_clf = dt.best_estimator_
    best_score = dt.best_score_
    best_parameters = dt.best_estimator_.get_params()
    print("Best clf is", best_clf)
    print("Best score is", best_score)
    print("Best Parameters: \n{}\n".format(dt.best_params_))

from sklearn import tree

feature = dt.best_estimator_.named_steps['clfdt'].tree_.feature
threshold = dt.best_estimator_.named_steps['clfdt'].tree_.threshold
node_indicator = dt.best_estimator_.named_steps['clfdt'].decision_path(pipelinedt.named_steps['tfidf'].get_feature_names_out())
sample_id = 6108
node_index = node_indicator.indices[
    node_indicator.indptr[sample_id] : node_indicator.indptr[sample_id + 1]
]

print("Rules used to predict sample {id}:\n".format(id=sample_id))
for node_id in node_index:
    # continue to the next node if it is a leaf node
    if leaf_id[sample_id] == node_id:
        continue

    # check if the value of the split feature for this sample is below the threshold
    if data_test[sample_id, feature[node_id]] <= threshold[node_id]:
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print(
        "decision node {node} : (X_test[{sample}, {feature}] = {value}) "
        "{inequality} {threshold})".format(
            node=node_id,
            sample=sample_id,
            feature=feature[node_id],
            value=data_test[sample_id, feature[node_id]],
            inequality=threshold_sign,
            threshold=threshold[node_id],
        )
    )
I have tried many alternatives for the node_indicator variable without success, as I have trouble passing in the features once they have been vectorized and pipelined. Once this works, I would also be very interested in printing the decision path for the chosen sample using the text features rather than the vectorized ones.
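From the plain example above, my current understanding is that decision_path expects the already-vectorized numeric matrix rather than the raw text or the feature names, so I suspect the direction is something like the sketch below (untested; best_pipe, X_test_vec and feature_names are just my names, and I am not certain this is the right way to pull the fitted FeatureUnion out of the grid-searched pipeline):

# Sketch of what I think might be needed (not working code from my notebook):
best_pipe = dt.best_estimator_                                       # refit Pipeline from GridSearchCV
X_test_vec = best_pipe.named_steps['tfidf'].transform(data_test)     # sparse tf-idf matrix for the test texts
clf_best = best_pipe.named_steps['clfdt']

node_indicator = clf_best.decision_path(X_test_vec)                  # pass the numeric matrix, not feature names
leaf_id = clf_best.apply(X_test_vec)                                 # leaf reached by each test sample

# map tree feature indices back to the word/char n-grams
feature_names = best_pipe.named_steps['tfidf'].get_feature_names_out()

Then, in the loop above, I suppose one would index X_test_vec positionally (sample_id being a row position within data_test rather than a dataframe index) and print feature_names[feature[node_id]] to show the actual n-gram.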
Can anybody help me tackle this?
The last error I got on the attempt above, where I desperately tried the .get_feature_names_out() method, is:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_13648/4239244359.py in <module>
9 threshold = dt.best_estimator_.named_steps['clfdt'].tree_.threshold
10
---> 11 node_indicator = dt.best_estimator_.named_steps['clfdt'].decision_path(pipelinedt.named_steps['tfidf'].get_feature_names_out()[2])
12 #node_indicator = dt.best_estimator_.named_steps['clfdt'].decision_path(pipelinedt.named_steps['tfidf'].fit_transform(data_test))
13 #pipe['tfid'].idf_
~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in decision_path(self, X, check_input)
542 indicates that the samples goes through the nodes.
543 """
--> 544 X = self._validate_X_predict(X, check_input)
545 return self.tree_.decision_path(X)
546
~\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in _validate_X_predict(self, X, check_input)
431 """Validate the training data on predict (probabilities)."""
432 if check_input:
--> 433 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
434 if issparse(X) and (
435 X.indices.dtype != np.intc or X.indptr.dtype != np.intc
~\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
559 raise ValueError("Validation should be done on X, y or both.")
560 elif not no_val_X and no_val_y:
--> 561 X = check_array(X, **check_params)
562 out = X
563 elif no_val_X and not no_val_y:
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
736 array = array.astype(dtype, casting="unsafe", copy=False)
737 else:
--> 738 array = np.asarray(array, order=order, dtype=dtype)
739 except ComplexWarning as complex_warning:
740 raise ValueError(
ValueError: could not convert string to float: 'word_vectorizer__acpt is'