When executing the sample code below, I get the following error: "RuntimeError: A pipeline has not yet been optimized. Please call fit() first."
The problem occurs with the article "TPOT Automated Machine Learning in Python". I am trying to reproduce its example "Dataset 2: Mushroom Classification" (https://towardsdatascience.com/tpot-automated-machine-learning-in-python-4c063b3e5de9)
source code: https://www.kaggle.com/discdiver/tpot-mushroom-classification-task/
I tried moving the tpot.fit(X_train, y_train) call to a different position, but that does not solve the problem.
Library
import time
import gc
import pandas as pd
import numpy as np
import seaborn as sns
import timeit
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(font_scale=1.5, palette="colorblind")
import category_encoders
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
# Load the mushroom dataset from the local CSV file.
df_cogumelo = pd.read_csv('agaricus-lepiota.csv')

# Raise pandas' display limits (line width and column count) to 200.
pd.options.display.width = 200
pd.options.display.max_columns = 200

# Features: every column except the 'class' label, with LabelEncoder applied
# to each column in turn.
feature_columns = [col for col in df_cogumelo.columns.values if col != 'class']
X = df_cogumelo.reindex(columns=feature_columns).apply(LabelEncoder().fit_transform)

# Target: the 'class' column. Print the class counts before encoding.
y = df_cogumelo.reindex(columns=['class'])
print(y['class'].value_counts())
y = LabelEncoder().fit_transform(np.ravel(y))  # flatten to 1-D, then encode

# 75% / 25% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, train_size=0.75, random_state=10
)

# Summaries of the training features.
print(X_train.describe())
print("\n\n\n")
print(X_train.info())
# Configure the TPOT search; `generations` and `population_size` set how many
# candidate pipelines are evaluated.
# NOTE(review): use_dask=True relies on the optional dask / dask-ml packages.
# If pipeline evaluation fails under dask, fit() may only surface
# "A pipeline has not yet been optimized" -- confirm the dask setup, or retry
# without use_dask, when that error appears.
tpot = TPOTClassifier(
    # search budget
    generations=2,
    population_size=10,
    # objective and reproducibility
    scoring="accuracy",
    random_state=10,
    # execution
    n_jobs=-1,
    use_dask=True,
    # logging and checkpoints
    verbosity=3,
    periodic_checkpoint_folder="tpot_mushroom_results",
)
# Per-run results: elapsed fit time (timeit.default_timer units, i.e. seconds),
# test-set score, and the best pipeline found by each run.
times = []
scores = []
winning_pipes = []

# Run several fits. The loop body must be indented under the `for`; without
# the indentation the script does not even parse.
for run_index in range(10):
    start_time = timeit.default_timer()
    tpot.fit(X_train, y_train)
    elapsed = timeit.default_timer() - start_time

    times.append(elapsed)
    winning_pipes.append(tpot.fitted_pipeline_)
    scores.append(tpot.score(X_test, y_test))

    # Written on every run to the same path, so the file ends up holding the
    # pipeline of the last completed run.
    tpot.export('tpot_mushroom.py')
# Report the results. Fit times are divided by 60 (seconds -> minutes) first.
times = [seconds / 60 for seconds in times]
for label, values in (
    ('Times:', times),
    ('Scores:', scores),
    ('Winning pipelines:', winning_pipes),
):
    print(label, values)
# The expected result is shown here:
# https://www.kaggle.com/discdiver/tpot-mushroom-classification-task/