I'm trying to create a Python script for feature selection using PyGAD
My code is shown below, nonetheless, it is returning that all the features are the best subset. How can I be sure it is correct?
import pygad
import numpy
from sklearn.model_selection import train_test_split, cross_val_score
from src.learner_params import target_column, model_features
from sklearn.datasets import load_breast_cancer
from lightgbm import LGBMClassifier as lgbm
from sklearn.metrics import roc_auc_score
bc = load_breast_cancer()
bst = lgbm(random_state = 42,n_estimators=1, max_depth=2)
function_inputs = bc.feature_names
X, y = bc.data,bc.target
X = pd.DataFrame(X, columns=bc.feature_names)
X_train, X_test, y_train, y_test = train_test_split(X,
y,
random_state=42)
def fitness_func(ga_instance, solution, solution_idx):
mask = np.where(solution ==1, True, False)
selected_features = np.array(bc.feature_names)[mask]
X_tmp = X_train.loc[:,selected_features]
score = cross_val_score(bst, X_tmp, y_train, scoring="roc_auc", cv = 2).mean()
fitness = score
return fitness
m = len(bc.feature_names)
fitness_function = fitness_func
gene_space = np.full(m,1)
num_generations = 100
num_parents_mating = 4
sol_per_pop = 8
num_genes = m
init_range_low = -2
init_range_high = 5
parent_selection_type = "sss"
keep_parents = 2
crossover_type = "single_point"
mutation_type = "random"
mutation_percent_genes = 100
ga_instance = pygad.GA(gene_space=gene_space,
num_generations=num_generations,
num_parents_mating=num_parents_mating,
fitness_func=fitness_function,
sol_per_pop=sol_per_pop,
num_genes=num_genes,
keep_parents=keep_parents,
crossover_type=crossover_type,
mutation_type=mutation_type,
mutation_percent_genes=mutation_percent_genes)
ga_instance.run()
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print("Parameters of the best solution : {solution}".format(solution=solution))
print("Fitness value of the best solution = {solution_fitness}".format(solution_fitness=solution_fitness))