Problem Statement
A telecommunications company wants to predict whether a customer will churn (leave the service) based on account details, service usage, and demographic information. The goal is to build machine learning models that classify customers as likely to churn (1) or not (0).
Dataset
This dataset contains 7,043 customer records with features such as:
- Tenure — how long a customer has been with the company.
- Monthly charges — amount billed each month.
- Total charges — total amount paid.
- Contract Type — month-to-month, one-year, or two-year.
- Payment Method — electronic check, credit card, etc.
- Internet and Phone Services — indicates whether the customer subscribes to various services.
- Churn — the target variable. It is stored as Yes/No in the raw data and encoded in Step 1 as 1 = Churn, 0 = No Churn.
Step 1: Load and Prepare Data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load dataset
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)

# Convert 'TotalCharges' to numeric and handle missing values safely.
# errors="coerce" turns any entry that cannot be parsed as a number into NaN;
# those NaNs are then replaced with 0 so the column is fully numeric.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

# Encode target variable: Churn (Yes -> 1, No -> 0)
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

# Select relevant features
numerical_features = ["tenure", "MonthlyCharges", "TotalCharges"]
categorical_features = [
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "PhoneService", "MultipleLines", "InternetService",
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies",
    "Contract", "PaperlessBilling", "PaymentMethod",
]

# Split dataset into training and testing sets (80% / 20%).
# stratify=y keeps the churn / no-churn proportion the same in both splits;
# random_state=42 makes the split reproducible.
X = df[numerical_features + categorical_features]
y = df["Churn"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing: Standardize numerical features, OneHotEncode categorical features.
# handle_unknown="ignore" makes the encoder tolerate categories at predict time
# that were not present when it was fitted, instead of raising an error.
# NOTE: Pipeline does not copy its steps, so every pipeline built from this
# object re-fits this same instance when that pipeline is fitted.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])
Step 2: Train Models
Logistic Regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Chain preprocessing and the classifier so the same transformations are
# applied when fitting on the training set and when predicting on the test set.
logreg = Pipeline([
    ("preprocessor", preprocessor),
    # max_iter=500 raises the solver's iteration limit (scikit-learn's default is 100).
    ("classifier", LogisticRegression(max_iter=500)),
])
logreg.fit(X_train, y_train)

# Predicted class labels (0 / 1) for the held-out test set.
y_pred_logreg = logreg.predict(X_test)
Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Same preprocessing as the other models, followed by a decision tree.
dtc = Pipeline([
    ("preprocessor", preprocessor),
    # max_depth=5 caps how deep the tree may grow; random_state=42 makes the
    # fitted tree reproducible.
    ("classifier", DecisionTreeClassifier(max_depth=5, random_state=42)),
])
dtc.fit(X_train, y_train)

# Predicted class labels (0 / 1) for the held-out test set.
y_pred_dtc = dtc.predict(X_test)
Random Forest
from sklearn.ensemble import RandomForestClassifier

# Same preprocessing as the other models, followed by a random forest.
rf = Pipeline([
    ("preprocessor", preprocessor),
    # n_estimators=100 is the number of trees in the forest; random_state=42
    # makes the fitted forest reproducible.
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42)),
])
rf.fit(X_train, y_train)

# Predicted class labels (0 / 1) for the held-out test set.
y_pred_rf = rf.predict(X_test)
Step 3: Evaluate Performance
# accuracy_score is not used in this section; the import is retained unchanged.
from sklearn.metrics import accuracy_score, classification_report

# Print one classification report per model, each computed against the same
# held-out test labels so the numbers are directly comparable.
print("=== Logistic Regression Performance ===")
print(classification_report(y_test, y_pred_logreg))
print("\n=== Decision Tree Performance ===")
print(classification_report(y_test, y_pred_dtc))
print("\n=== Random Forest Performance ===")
print(classification_report(y_test, y_pred_rf))
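Each classification report provides the following key performance metrics, reported per class:
- Precision: Of the customers predicted to belong to a class, the fraction that actually belong to it.
- Recall: Of the customers that actually belong to a class, the fraction the model correctly identified.
- F1-score: The harmonic mean of precision and recall, balancing the two.
- Accuracy: The fraction of all predictions that were correct.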
Step 4: Compare Model Performance
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


# Function to plot confusion matrices
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Display the confusion matrix of one model as an annotated heatmap.

    Args:
        y_true: Actual class labels.
        y_pred: Class labels predicted by the model, aligned with ``y_true``.
        model_name: Name of the model, shown in the figure title.

    Returns:
        None. A new matplotlib figure is created and shown.
    """
    # Rows are actual classes, columns are predicted classes. confusion_matrix
    # orders classes by sorted label value, so with 0/1 labels index 0 is
    # "No Churn" and index 1 is "Churn", matching the tick labels below.
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(5, 4))
    # fmt="d" prints the cell counts as integers.
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=["No Churn", "Churn"],
        yticklabels=["No Churn", "Churn"],
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix - {model_name}")
    plt.show()
# Draw one confusion matrix per model, in the order the models were trained,
# each evaluated against the same held-out test labels.
for predictions, title in (
    (y_pred_logreg, "Logistic Regression"),
    (y_pred_dtc, "Decision Tree"),
    (y_pred_rf, "Random Forest"),
):
    plot_confusion_matrix(y_test, predictions, title)