I am working on a project using the Telco Customer Churn Kaggle dataset and implementing a prediction API using FastAPI. The goal is to make churn predictions based on user input. I have trained multiple models (K-Nearest Neighbors, Logistic Regression, Support Vector Machine, Random Forest, Gradient Boosting) using the dataset and saved them along with the corresponding label encoders.
However, when I make a prediction through the API, I get an "Unknown value" error for the 'PaymentMethod' column. The error message reports the submitted value as unknown and lists the valid values as "Bank transfer (automatic), Credit card (automatic), Electronic check, Mailed check." Oddly, the value I submit, 'Electronic check', appears in that very list.
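For reference, the request body that triggers the error looks like this (all values except PaymentMethod are just representative):

payload = {
    "SeniorCitizen": 0,
    "tenure": 12,
    "MonthlyCharges": 70.35,
    "TotalCharges": 845.5,
    "PaymentMethod": "Electronic check",
    "Contract": "Month-to-month",
    "model_name": "Random Forest",
}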
I have checked my code and confirmed that label encoding is applied during prediction, and I have verified that the label encoder for 'PaymentMethod' loads correctly from the saved model files. Even so, the error persists.
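This is roughly the check I used to verify the saved encoder (a minimal sketch; the file name matches what train_models.py writes below):

import joblib

encoder = joblib.load("trained_models/label_encoders_PaymentMethod.joblib")
# repr() makes stray whitespace or casing differences visible
print([repr(c) for c in encoder.classes_])
# If this also failed, the saved encoder itself would be the culprit
print(encoder.transform(["Electronic check"]))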
I have included the relevant code snippets below for reference:
train_models.py: This script loads the Telco Customer Churn dataset, preprocesses it, performs feature selection, encodes categorical variables, handles class imbalance, trains multiple models using cross-validation and hyperparameter tuning, and saves the trained models and label encoders.
predict_api.py: This script implements the FastAPI application, loads the saved models and label encoders, and provides an endpoint for making churn predictions based on user input.
I would greatly appreciate any insights or suggestions on what might be causing this error and how to resolve it. Thank you in advance for your help!
Here is my train_models.py:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
import joblib
import os
# Step 1: Load the data
data = pd.read_csv('telco_churn_dataset.csv')
# Step 2: Data Preprocessing
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# Step 3: Handle Missing Values
data = data.dropna()
# Step 4: Feature Selection
selected_features = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'PaymentMethod', 'Contract', 'Churn']
data_selected = data[selected_features].copy()  # copy to avoid SettingWithCopyWarning when encoding below
# Step 5: Encode Categorical Variables
categorical_cols = ['PaymentMethod', 'Contract']
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    data_selected[col] = label_encoder.fit_transform(data_selected[col].astype(str))
    label_encoders[col] = label_encoder
# Step 6: Calculate Baseline Accuracy (share of the majority "No" class)
baseline_rate = data_selected['Churn'].value_counts(normalize=True).get('No', 0.0)
print("Baseline (majority-class) accuracy:", baseline_rate)
# Step 7: Handle Imbalance in the Dataset
X = data_selected.drop('Churn', axis=1) # Independent variables
y = data_selected['Churn'] # Target variable
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)
# Step 8: Splitting the Resampled Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
# Step 9: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Step 10: Check for Correlation
correlation_matrix = pd.DataFrame(X_train_scaled, columns=X_train.columns).corr()
print("Correlation Matrix:")
print(correlation_matrix)
# Step 11: Model Training and Evaluation with Cross-Validation and Hyperparameter Tuning
models = {
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}
os.makedirs("trained_models", exist_ok=True)  # ensure the output directory exists before saving
trained_models = {}
for name, model in models.items():
    print("\n" + name)
    # Define the hyperparameter grid for the model
    param_grid = {}
    if name == "K-Nearest Neighbors":
        param_grid = {"n_neighbors": [3, 5, 7]}
    elif name == "Logistic Regression":
        param_grid = {"C": [0.1, 1, 10]}
    elif name == "Support Vector Machine":
        param_grid = {"C": [0.1, 1, 10], "gamma": [0.1, 1, 10]}
    elif name == "Random Forest":
        param_grid = {"n_estimators": [50, 100, 150]}
    elif name == "Gradient Boosting":
        param_grid = {"learning_rate": [0.1, 0.5, 1], "n_estimators": [50, 100, 150]}
    # Perform cross-validation with hyperparameter tuning
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train_scaled, y_train)
    # Print the best hyperparameters and cross-validation results
    print("Best Hyperparameters:", grid_search.best_params_)
    print("Cross-Validation Accuracy: {:.2f}".format(grid_search.best_score_))
    # best_estimator_ is already refit on the full training set (refit=True is the GridSearchCV default)
    model = grid_search.best_estimator_
    # Evaluate the model on the test set
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: {:.2f}".format(accuracy))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    trained_models[name] = model
    joblib.dump(model, os.path.join("trained_models", f"{name.replace(' ', '_')}.joblib"))
# Save the label encoders with names corresponding to categorical variables
for col, label_encoder in label_encoders.items():
    joblib.dump(label_encoder, os.path.join("trained_models", f"label_encoders_{col.replace(' ', '_')}.joblib"))
print("\nModels trained and saved successfully!")
Here is my predict_api.py:
from fastapi import FastAPI
from pydantic import BaseModel
import numpy as np
import os
import joblib
class PredictionRequest(BaseModel):
    SeniorCitizen: int
    tenure: int
    MonthlyCharges: float
    TotalCharges: float
    PaymentMethod: str
    Contract: str
    model_name: str
app = FastAPI()
@app.on_event("startup")
async def load_models():
model_directory = "trained_models"
if not os.path.exists(model_directory):
os.makedirs(model_directory)
trained_models = {}
label_encoders = {}
model_files = [file for file in os.listdir(model_directory) if file.endswith(".joblib")]
for model_file in model_files:
if model_file.startswith("label_encoders"):
model_name = model_file.split("_")[-1].split(".")[0]
label_encoders[model_name] = joblib.load(os.path.join(model_directory, model_file))
else:
model_name = model_file.split(".")[0].replace('_', ' ')
trained_models[model_name] = joblib.load(os.path.join(model_directory, model_file))
if not trained_models or not label_encoders:
print("Label encoders and trained models not found. Please run 'train_models.py' first to generate them.")
else:
app.state.trained_models = trained_models
app.state.label_encoders = label_encoders
@app.post("/predict")
async def predict_churn(request: PredictionRequest):
try:
trained_models = app.state.trained_models
label_encoders = app.state.label_encoders
except AttributeError:
return {"message": "Label encoders not found. Please run 'train_models.py' first to generate them."}
input_data = np.array([
[
int(request.SeniorCitizen),
int(request.tenure),
float(request.MonthlyCharges),
float(request.TotalCharges),
label_encoders['PaymentMethod'].transform([request.PaymentMethod])[0],
label_encoders['Contract'].transform([request.Contract])[0]
]
])
if request.model_name not in trained_models:
return {"message": "Invalid model name. Available models: " + ", ".join(trained_models.keys())}
model = trained_models[request.model_name]
prediction = model.predict(input_data)
return {"prediction": prediction[0]}