{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb1f108",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from xgboost import XGBClassifier\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, recall_score, precision_score\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.experimental import enable_iterative_imputer  # noqa\n",
    "from sklearn.impute import IterativeImputer\n",
    "\n",
    "# 提取的颜色\n",
    "left_color_hex = '#72b6a1'  # 绿色\n",
    "right_color_hex = '#e99675'  # 橙色\n",
    "\n",
    "# Set global font to Times New Roman and increase font sizes\n",
    "plt.rcParams['font.family'] = 'serif'\n",
    "plt.rcParams['font.serif'] = ['Times New Roman']\n",
    "plt.rcParams['font.size'] = 12\n",
    "plt.rcParams['axes.titlesize'] = 14\n",
    "plt.rcParams['axes.labelsize'] = 12\n",
    "plt.rcParams['xtick.labelsize'] = 10\n",
    "plt.rcParams['ytick.labelsize'] = 10\n",
    "plt.rcParams['legend.fontsize'] = 12\n",
    "\n",
    "# Load the dataset\n",
    "file_path = 'S4.Raw Data.xlsx'\n",
    "data = pd.read_excel(file_path)\n",
    "target = 'T2DM'\n",
    "\n",
    "# Handle NaN values using IterativeImputer\n",
    "imputer = IterativeImputer(random_state=42)\n",
    "data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)\n",
    "\n",
    "# Exclude columns 'OGTT', 'FPG', 'T2DM_Report', 'Medical insurance' from features\n",
    "exclude_columns = ['OGTT', 'FPG', 'T2DM_Report', 'Medical insurance']\n",
    "X = data_imputed.drop(columns=[target] + exclude_columns)\n",
    "y = data_imputed[target]\n",
    "\n",
    "# Count the number of samples in each class before under-sampling\n",
    "print(\"Original class distribution:\")\n",
    "print(y.value_counts())\n",
    "\n",
    "# Count the number of samples in each class\n",
    "class_counts = y.value_counts()\n",
    "minority_class_count = class_counts.min()\n",
    "\n",
    "# Define the sampling strategy to keep the minority class intact\n",
    "sampling_strategy = {class_counts.idxmin(): minority_class_count, class_counts.idxmax(): minority_class_count}\n",
    "\n",
    "rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)\n",
    "X_resampled, y_resampled = rus.fit_resample(X, y)\n",
    "\n",
    "# Count the number of samples in each class after under-sampling\n",
    "print(\"\\nClass distribution after under-sampling:\")\n",
    "print(y_resampled.value_counts())\n",
    "\n",
    "# Split data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)\n",
    "\n",
    "# Continuous and categorical features\n",
    "continuous_features = ['BMI', 'Sedentary time', 'Age', 'Daily sleep duration']\n",
    "categorical_features = [col for col in X_train.columns if col not in continuous_features]\n",
    "\n",
    "# Preprocessing pipelines\n",
    "preprocessor_standard = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', StandardScaler(), continuous_features),\n",
    "        ('cat', OneHotEncoder(), categorical_features)\n",
    "    ])\n",
    "\n",
    "preprocessor_no_transform = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', 'passthrough', continuous_features),\n",
    "        ('cat', 'passthrough', categorical_features)\n",
    "    ])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "204ab6a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "# Define the models with their initial hyperparameter grids\n",
    "initial_param_grids = {\n",
    "    'Logistic Regression': {\n",
    "        'logreg__penalty': ['l2', 'l1'],\n",
    "        'logreg__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],\n",
    "        'logreg__solver': ['liblinear', 'saga'],\n",
    "        'logreg__class_weight': [None, 'balanced']\n",
    "    },\n",
    "    'SVM': {\n",
    "        'svc__C': [0.1, 1, 10, 100],\n",
    "        'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],\n",
    "        'svc__gamma': ['scale', 'auto'],\n",
    "        'svc__class_weight': [None, 'balanced']\n",
    "    },\n",
    "    'Random Forest': {\n",
    "        'rf__n_estimators': [100, 200, 300, 400, 500],\n",
    "        'rf__max_depth': [None, 10, 20, 30],\n",
    "        'rf__min_samples_split': [2, 5, 10],\n",
    "        'rf__min_samples_leaf': [1, 2, 4],\n",
    "        'rf__bootstrap': [True, False]\n",
    "    },\n",
    "    'XGBoost': {\n",
    "        'xgb__n_estimators': [100, 200, 300, 400, 500],\n",
    "        'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],\n",
    "        'xgb__max_depth': [3, 4, 5, 6, 7],\n",
    "        'xgb__min_child_weight': [1, 3, 5],\n",
    "        'xgb__gamma': [0, 0.1, 0.2, 0.3],\n",
    "        'xgb__subsample': [0.6, 0.8, 1.0],\n",
    "        'xgb__colsample_bytree': [0.6, 0.8, 1.0]\n",
    "    },\n",
    "    'LightGBM': {\n",
    "        'lgbm__num_leaves': [20, 30, 40, 50],\n",
    "        'lgbm__max_depth': [None, 10, 20, 30],\n",
    "        'lgbm__learning_rate': [0.01, 0.1, 0.2],\n",
    "        'lgbm__n_estimators': [100, 200, 300, 400, 500],\n",
    "        'lgbm__min_child_samples': [20, 30, 40],\n",
    "        'lgbm__subsample': [0.6, 0.8, 1.0]\n",
    "    }\n",
    "}\n",
    "\n",
    "# Define models with default hyperparameters\n",
    "pipelines = {\n",
    "    'Logistic Regression': Pipeline([\n",
    "        ('preprocessor', preprocessor_standard),\n",
    "        ('logreg', LogisticRegression(max_iter=10000))\n",
    "    ]),\n",
    "    'SVM': Pipeline([\n",
    "        ('preprocessor', preprocessor_standard),\n",
    "        ('svc', SVC(probability=True))\n",
    "    ]),\n",
    "    'Random Forest': Pipeline([\n",
    "        ('preprocessor', preprocessor_no_transform),\n",
    "        ('rf', RandomForestClassifier())\n",
    "    ]),\n",
    "    'XGBoost': Pipeline([\n",
    "        ('preprocessor', preprocessor_no_transform),\n",
    "        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))\n",
    "    ]),\n",
    "    'LightGBM': Pipeline([\n",
    "        ('preprocessor', preprocessor_no_transform),\n",
    "        ('lgbm', LGBMClassifier())\n",
    "    ])\n",
    "}\n",
    "\n",
    "# Perform RandomizedSearchCV for each model\n",
    "best_params_initial = {}\n",
    "for model_name, pipeline in pipelines.items():\n",
    "    print(f\"Tuning hyperparameters for {model_name}...\")\n",
    "    random_search = RandomizedSearchCV(pipeline, param_distributions=initial_param_grids[model_name], \n",
    "                                       n_iter=50, cv=5, scoring='roc_auc', n_jobs=-1, random_state=42)\n",
    "    random_search.fit(X_train, y_train)\n",
    "    best_params_initial[model_name] = random_search.best_params_\n",
    "    print(f\"Best parameters for {model_name}: {random_search.best_params_}\")\n",
    "    print(f\"Best AUC for {model_name}: {random_search.best_score_:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb0d404",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# Helper function to handle NoneType values in fine-tuning\n",
    "def get_fine_tuned_range(param, decrement=1, increment=1):\n",
    "    if param is None:\n",
    "        return [None]\n",
    "    return [param - decrement, param, param + increment]\n",
    "\n",
    "# Fine-tuned hyperparameter grids based on initial tuning results\n",
    "fine_tuned_param_grids = {\n",
    "    'Logistic Regression': {\n",
    "        'logreg__C': get_fine_tuned_range(best_params_initial['Logistic Regression']['logreg__C'], 0.5, 0.5)\n",
    "    },\n",
    "    'SVM': {\n",
    "        'svc__C': get_fine_tuned_range(best_params_initial['SVM']['svc__C'], 0.5, 0.5),\n",
    "        'svc__gamma': [best_params_initial['SVM']['svc__gamma']]\n",
    "    },\n",
    "    'Random Forest': {\n",
    "        'rf__n_estimators': get_fine_tuned_range(best_params_initial['Random Forest']['rf__n_estimators'], 50, 50),\n",
    "        'rf__max_depth': get_fine_tuned_range(best_params_initial['Random Forest']['rf__max_depth'], 5, 5),\n",
    "        'rf__min_samples_split': get_fine_tuned_range(best_params_initial['Random Forest']['rf__min_samples_split'], 1, 1),\n",
    "        'rf__min_samples_leaf': get_fine_tuned_range(best_params_initial['Random Forest']['rf__min_samples_leaf'], 1, 1),\n",
    "        'rf__bootstrap': [best_params_initial['Random Forest']['rf__bootstrap']]\n",
    "    },\n",
    "    'XGBoost': {\n",
    "        'xgb__n_estimators': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__n_estimators'], 50, 50),\n",
    "        'xgb__learning_rate': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__learning_rate'], 0.05, 0.05),\n",
    "        'xgb__max_depth': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__max_depth'], 1, 1),\n",
    "        'xgb__min_child_weight': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__min_child_weight'], 1, 1),\n",
    "        'xgb__gamma': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__gamma'], 0.05, 0.05),\n",
    "        'xgb__subsample': [best_params_initial['XGBoost']['xgb__subsample']],\n",
    "        'xgb__colsample_bytree': [best_params_initial['XGBoost']['xgb__colsample_bytree']]\n",
    "    },\n",
    "    'LightGBM': {\n",
    "        'lgbm__num_leaves': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__num_leaves'], 5, 5),\n",
    "        'lgbm__max_depth': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__max_depth'], 1, 1),\n",
    "        'lgbm__learning_rate': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__learning_rate'], 0.05, 0.05),\n",
    "        'lgbm__n_estimators': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__n_estimators'], 50, 50),\n",
    "        'lgbm__min_child_samples': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__min_child_samples'], 5, 5),\n",
    "        'lgbm__subsample': [best_params_initial['LightGBM']['lgbm__subsample']]\n",
    "    }\n",
    "}\n",
    "\n",
    "# Perform GridSearchCV for each model\n",
    "best_params_fine_tuned = {}\n",
    "for model_name, pipeline in pipelines.items():\n",
    "    print(f\"Fine-tuning hyperparameters for {model_name}...\")\n",
    "    grid_search = GridSearchCV(pipeline, param_grid=fine_tuned_param_grids[model_name], \n",
    "                               cv=5, scoring='roc_auc', n_jobs=-1)\n",
    "    grid_search.fit(X_train, y_train)\n",
    "    best_params_fine_tuned[model_name] = grid_search.best_params_\n",
    "    print(f\"Fine-tuned best parameters for {model_name}: {grid_search.best_params_}\")\n",
    "    print(f\"Fine-tuned best AUC for {model_name}: {grid_search.best_score_:.4f}\")\n",
    "\n",
    "# Remove prefix from best parameters for each model\n",
    "def remove_prefix(best_params, prefix):\n",
    "    return {key.split(f'{prefix}__')[1]: value for key, value in best_params.items()}\n",
    "\n",
    "best_params_processed = {\n",
    "    'Logistic Regression': remove_prefix(best_params_fine_tuned['Logistic Regression'], 'logreg'),\n",
    "    'SVM': remove_prefix(best_params_fine_tuned['SVM'], 'svc'),\n",
    "    'Random Forest': remove_prefix(best_params_fine_tuned['Random Forest'], 'rf'),\n",
    "    'XGBoost': remove_prefix(best_params_fine_tuned['XGBoost'], 'xgb'),\n",
    "    'LightGBM': remove_prefix(best_params_fine_tuned['LightGBM'], 'lgbm')\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6cd60b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import display, HTML\n",
    "\n",
    "# Define a function to calculate and plot metrics\n",
    "def calculate_metrics(models, X_test, y_test):\n",
    "    metrics = {'Model': [], 'AUC': [], 'Accuracy': [], 'Recall': [], 'Precision': []}\n",
    "    plt.figure(figsize=(14, 10), dpi=300)\n",
    "    \n",
    "    for name, model in models.items():\n",
    "        model.fit(X_train, y_train)\n",
    "        y_pred_proba = model.predict_proba(X_test)[:, 1]\n",
    "        auc = roc_auc_score(y_test, y_pred_proba)\n",
    "        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)\n",
    "        \n",
    "        plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.4f})')\n",
    "        \n",
    "        # Calculate and store metrics\n",
    "        y_pred = model.predict(X_test)\n",
    "        accuracy = accuracy_score(y_test, y_pred)\n",
    "        recall = recall_score(y_test, y_pred)\n",
    "        precision = precision_score(y_test, y_pred)\n",
    "        \n",
    "        metrics['Model'].append(name)\n",
    "        metrics['AUC'].append(auc)\n",
    "        metrics['Accuracy'].append(accuracy)\n",
    "        metrics['Recall'].append(recall)\n",
    "        metrics['Precision'].append(precision)\n",
    "    \n",
    "    plt.plot([0, 1], [0, 1], 'k--')\n",
    "    plt.xlabel('False Positive Rate')\n",
    "    plt.ylabel('True Positive Rate')\n",
    "    plt.title('The AUC-ROC Curves of 5 ML Models')\n",
    "    plt.legend(loc='lower right')\n",
    "    plt.grid(True)\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "    \n",
    "    metrics_df = pd.DataFrame(metrics)\n",
    "    return metrics_df\n",
    "\n",
    "# Models with fine-tuned parameters\n",
    "fine_tuned_models = {\n",
    "    'Logistic Regression': Pipeline([\n",
    "        ('preprocessor', preprocessor_standard),\n",
    "        ('logreg', LogisticRegression(**best_params_processed['Logistic Regression'], max_iter=10000))\n",
    "    ]),\n",
    "    'SVM': Pipeline([\n",
    "        ('preprocessor', preprocessor_standard),\n",
    "        ('svc', SVC(**best_params_processed['SVM'], probability=True))\n",
    "    ]),\n",
    "    'Random Forest': Pipeline([\n",
    "        ('preprocessor', preprocessor_no_transform),\n",
    "        ('rf', RandomForestClassifier(**best_params_processed['Random Forest']))\n",
    "    ]),\n",
    "    'XGBoost': Pipeline([\n",
    "        ('preprocessor', preprocessor_no_transform),\n",
    "        ('xgb', XGBClassifier(**best_params_processed['XGBoost'], use_label_encoder=False, eval_metric='logloss'))\n",
    "    ]),\n",
    "    'LightGBM': Pipeline([\n",
    "        ('preprocessor', preprocessor_no_transform),\n",
    "        ('lgbm', LGBMClassifier(**best_params_processed['LightGBM']))\n",
    "    ])\n",
    "}\n",
    "\n",
    "# Calculate metrics and plot AUC-ROC curves\n",
    "metrics_df = calculate_metrics(fine_tuned_models, X_test, y_test)\n",
    "\n",
    "# Display the metrics in a table\n",
    "metrics_df_display = metrics_df.set_index('Model')\n",
    "display(HTML(metrics_df_display.to_html()))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62f68769",
   "metadata": {},
   "outputs": [],
   "source": [
    "pip install shap lightgbm\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2e07572",
   "metadata": {},
   "outputs": [],
   "source": [
    "import shap\n",
    "import lightgbm as lgb\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "# 使用最佳参数重新训练LightGBM模型\n",
    "best_params_lgbm = best_params_processed['LightGBM']\n",
    "\n",
    "# 创建并训练LightGBM模型\n",
    "lgbm_model = LGBMClassifier(**best_params_lgbm)\n",
    "lgbm_model.fit(X_train, y_train)\n",
    "\n",
    "# 创建SHAP解释器\n",
    "explainer = shap.TreeExplainer(lgbm_model)\n",
    "shap_values = explainer.shap_values(X_test)\n",
    "\n",
    "# 创建图形对象\n",
    "fig, ax = plt.subplots(figsize=(8, 8), dpi=300)\n",
    "shap.summary_plot(shap_values, X_test, plot_type=\"bar\", feature_names=X_test.columns, show=False)\n",
    "plt.title('Feature Importance (Bar Plot)', fontsize=16)\n",
    "plt.savefig('shap_bar_plot.png', bbox_inches='tight')\n",
    "plt.close(fig)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8, 8), dpi=300)\n",
    "shap.summary_plot(shap_values, X_test, feature_names=X_test.columns, show=False)\n",
    "plt.title('Feature Importance (Bee Swarm Plot)', fontsize=16)\n",
    "plt.savefig('shap_bee_swarm_plot.png', bbox_inches='tight')\n",
    "plt.close(fig)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1945a2d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image, ImageDraw, ImageFont\n",
    "\n",
    "# 读取保存的图像\n",
    "bar_plot = Image.open('shap_bar_plot.png')\n",
    "bee_swarm_plot = Image.open('shap_bee_swarm_plot.png')\n",
    "\n",
    "# 创建并排拼接的图像\n",
    "total_width = bar_plot.width + bee_swarm_plot.width\n",
    "max_height = max(bar_plot.height, bee_swarm_plot.height)\n",
    "\n",
    "new_img = Image.new('RGB', (total_width, max_height), (255, 255, 255))\n",
    "new_img.paste(bar_plot, (0, 0))\n",
    "new_img.paste(bee_swarm_plot, (bar_plot.width, 0))\n",
    "\n",
    "# 添加A和B标记\n",
    "draw = ImageDraw.Draw(new_img)\n",
    "font = ImageFont.truetype(\"arial\", 100)\n",
    "\n",
    "draw.text((20, 20), \"A\", fill=\"black\", font=font)\n",
    "draw.text((bar_plot.width + 20, 20), \"B\", fill=\"black\", font=font)\n",
    "\n",
    "# 保存并显示最终的拼接图像\n",
    "new_img.save('shap_combined_plot.png')\n",
    "new_img.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56050b6a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}