I am working on a churn-prediction project, but my data is very imbalanced. I have tried many things, and this is the best model I have been able to get. My main problem is that I want recall and precision to be balanced.
I tried using synthetic data to train the model and then testing it on the original data, but it still gives me really low precision.
The link to my data: https://www.kaggle.com/datasets/blastchar/telco-customer-churn
My code:
# ============================
# 1. Imports
# ============================
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

# NOTE: this silences every warning for the rest of the script, including
# pandas/scikit-learn diagnostics. Comment it out while debugging.
warnings.filterwarnings("ignore")

# ============================
# 2. File paths
# ============================
# The paths are left empty here; fill them in before running.
BASE_DIR = Path(r"")
PATH_ORIG = BASE_DIR / ""
PATH_SYNTH = BASE_DIR / ""


# ============================
# 3. Load data
# ============================
def _clean_total_charges(df):
    """Return a copy of *df* with a numeric, non-missing ``TotalCharges``.

    Values that cannot be parsed as numbers are coerced to NaN and the
    affected rows are dropped; the index is then renumbered from 0.
    """
    cleaned = df.copy()
    cleaned["TotalCharges"] = pd.to_numeric(cleaned["TotalCharges"], errors="coerce")
    return cleaned.dropna(subset=["TotalCharges"]).reset_index(drop=True)


# Both frames go through the same cleaning: they are concatenated into one
# training set later, so their column dtypes have to agree.
df_orig = _clean_total_charges(pd.read_csv(PATH_ORIG))
df_synth = _clean_total_charges(pd.read_csv(PATH_SYNTH))

# ============================
# 4. Features & label
# ============================
selected_features = [
    "Contract",
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
    "OnlineSecurity",
    "TechSupport",
    "InternetService",
]  # Removed 'PaymentMethod'
target_col = "Churn"


def prepare(df):
    """Return a copy of *df* limited to the model columns with a 0/1 label.

    Args:
        df: Frame containing every column in ``selected_features`` plus
            ``target_col``.

    Returns:
        A new frame holding only those columns. A non-numeric label column
        is mapped with ``"No" -> 0`` and ``"Yes" -> 1``; a numeric label
        column is passed through unchanged.

    Raises:
        KeyError: If a required column is missing (raised by pandas).
        ValueError: If a non-numeric label holds anything other than
            ``"No"`` / ``"Yes"``.
    """
    df = df[selected_features + [target_col]].copy()
    # Test "not numeric" rather than "object dtype" so that labels stored in
    # a dedicated string dtype are mapped as well.
    if not pd.api.types.is_numeric_dtype(df[target_col]):
        mapped = df[target_col].map({"No": 0, "Yes": 1})
        unmapped = mapped.isna()
        if unmapped.any():
            # Fail here with a clear message instead of letting NaN labels
            # reach the split / fit steps.
            bad = sorted(df.loc[unmapped, target_col].astype(str).unique())
            raise ValueError(f"Unexpected values in {target_col!r}: {bad}")
        df[target_col] = mapped.astype(int)
    return df


df_orig = prepare(df_orig)
df_synth = prepare(df_synth)

# ============================
# 5.
# Split & Combine Data
# ============================
# Local import: StratifiedKFold is not among the imports at the top of the script.
from sklearn.model_selection import StratifiedKFold

X_orig, y_orig = df_orig[selected_features], df_orig[target_col]

# Hold out 25% of the real data; synthetic rows never enter the test set.
X_train_o, X_test, y_train_o, y_test = train_test_split(
    X_orig, y_orig, test_size=0.25, stratify=y_orig, random_state=42
)

# Real training rows come first and the synthetic rows are appended after
# them, so positions 0 .. len(X_train_o) - 1 of X_train are the real rows.
X_train = pd.concat([X_train_o, df_synth[selected_features]], axis=0).reset_index(drop=True)
y_train = pd.concat([y_train_o, df_synth[target_col]], axis=0).reset_index(drop=True)

# Force all categoricals to string
cat_features = ["Contract", "OnlineSecurity", "TechSupport", "InternetService"]
cat_as_str = {col: str for col in cat_features}
# astype returns new frames, so nothing is written into the frame that
# train_test_split returned (no chained-assignment warning).
X_train = X_train.astype(cat_as_str)
X_test = X_test.astype(cat_as_str)

# ============================
# 6. Preprocessing
# ============================
num_features = ["tenure", "MonthlyCharges", "TotalCharges"]

# A single StandardScaler replaces MinMaxScaler -> StandardScaler: min-max
# scaling is an affine rescale, and standardising afterwards yields the same
# values as standardising the raw column directly.
numeric_tf = StandardScaler()
categorical_tf = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocess = ColumnTransformer([
    ("num", numeric_tf, num_features),
    ("cat", categorical_tf, cat_features),
])

# ============================
# 7. LightGBM model
# ============================
pipe = Pipeline([
    ("prep", preprocess),
    ("clf", LGBMClassifier(
        objective="binary",
        random_state=42,
        n_estimators=300,
        # NOTE: a value > 1 up-weights the positive (churn) class. That
        # favours recall and tends to ADD false positives, i.e. it lowers
        # precision. Move it towards 1.0 if precision is the weaker metric.
        scale_pos_weight=1.5,
    )),
])

param_grid = {
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [3, 5],
    "clf__num_leaves": [15, 31],
}

# Score every candidate on REAL rows only: each fold trains on the other real
# folds plus all synthetic rows and validates on the held-out real fold.
# An integer cv would build unshuffled folds over the stacked real+synthetic
# frame and measure F1 partly on synthetic rows.
n_real = len(X_train_o)
synth_rows = np.arange(n_real, len(X_train))
real_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
cv_splits = [
    (np.concatenate([fit_rows, synth_rows]), val_rows)
    for fit_rows, val_rows in real_folds.split(X_train_o, y_train_o)
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="f1", cv=cv_splits, n_jobs=-1)
grid.fit(X_train, y_train)

# ============================
# 8.
# Predictions & Threshold Tuning
# ============================
# Local imports: neither name is among the imports at the top of the script.
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Choose the decision threshold WITHOUT looking at the test set. Picking the
# threshold that maximises F1 on y_test and then reporting metrics on y_test
# makes the reported precision/recall optimistically biased.
# Instead, build out-of-fold probabilities for the real training rows (the
# first len(X_train_o) rows of X_train) and tune the threshold on those.
n_real = len(X_train_o)
y_real = y_train.iloc[:n_real]
synth_rows = np.arange(n_real, len(X_train))
oof_proba = np.empty(n_real)

threshold_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fit_rows, held_out in threshold_folds.split(np.zeros(n_real), y_real):
    # Train on the remaining real rows plus every synthetic row, then
    # predict the held-out real rows.
    fold_rows = np.concatenate([fit_rows, synth_rows])
    fold_model = clone(grid.best_estimator_)
    fold_model.fit(X_train.iloc[fold_rows], y_train.iloc[fold_rows])
    oof_proba[held_out] = fold_model.predict_proba(X_train.iloc[held_out])[:, 1]

cv_prec, cv_rec, cv_thresh = precision_recall_curve(y_real, oof_proba)
f1s = 2 * (cv_prec * cv_rec) / (cv_prec + cv_rec + 1e-6)
# The last precision/recall pair has no matching threshold, so leave it out
# of the argmax to keep the index valid for cv_thresh.
best_idx = np.argmax(f1s[:-1])
best_thresh = cv_thresh[best_idx]

# Try threshold adjustment if you want higher precision manually
# best_thresh = 0.42  # Optional override

# The test set is used only for the final, unbiased evaluation.
y_proba = grid.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= best_thresh).astype(int)
prec, rec, thresh = precision_recall_curve(y_test, y_proba)

# ============================
# 9. Reporting
# ============================
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

print(f"Best CV params → {grid.best_params_}")
print(f"Optimal threshold for max F1 = {best_thresh:.4f}")
print("\nClassification report on ORIGINAL test:")
print(classification_report(y_test, y_pred, digits=4))
print("\nConfusion matrix [[TN, FP], [FN, TP]]:\n", cm)

# ============================
# 10. Plot PR Curve
# ============================
# Precision/recall actually obtained on the test set at the tuned threshold.
_, fp, fn, tp = cm.ravel()
op_precision = tp / (tp + fp) if (tp + fp) else 0.0
op_recall = tp / (tp + fn) if (tp + fn) else 0.0

plt.figure(figsize=(6, 4))
plt.plot(rec, prec, label="PR curve")
plt.scatter(
    op_recall,
    op_precision,
    marker="o",
    s=100,
    label=f"CV-tuned threshold t={best_thresh:.2f}",
)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision‑Recall curve (original test)")
plt.legend()
plt.tight_layout()
plt.show()
# (The original post embedded a screenshot of the output at this point.)