{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "ddb1f108", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "from xgboost import XGBClassifier\n", "from lightgbm import LGBMClassifier\n", "from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, recall_score, precision_score\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.experimental import enable_iterative_imputer # noqa\n", "from sklearn.impute import IterativeImputer\n", "\n", "# 提取的颜色\n", "left_color_hex = '#72b6a1' # 绿色\n", "right_color_hex = '#e99675' # 橙色\n", "\n", "# Set global font to Times New Roman and increase font sizes\n", "plt.rcParams['font.family'] = 'serif'\n", "plt.rcParams['font.serif'] = ['Times New Roman']\n", "plt.rcParams['font.size'] = 12\n", "plt.rcParams['axes.titlesize'] = 14\n", "plt.rcParams['axes.labelsize'] = 12\n", "plt.rcParams['xtick.labelsize'] = 10\n", "plt.rcParams['ytick.labelsize'] = 10\n", "plt.rcParams['legend.fontsize'] = 12\n", "\n", "# Load the dataset\n", "file_path = 'S4.Raw Data.xlsx'\n", "data = pd.read_excel(file_path)\n", "target = 'T2DM'\n", "\n", "# Handle NaN values using IterativeImputer\n", "imputer = IterativeImputer(random_state=42)\n", "data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)\n", "\n", "# Exclude columns 'OGTT', 'FPG', 'T2DM_Report', 'Medical insurance' from features\n", "exclude_columns = ['OGTT', 'FPG', 'T2DM_Report', 'Medical insurance']\n", "X = data_imputed.drop(columns=[target] + exclude_columns)\n", "y = data_imputed[target]\n", "\n", "# Count the number of samples in each class before under-sampling\n", "print(\"Original class distribution:\")\n", "print(y.value_counts())\n", "\n", "# Count the number of samples in each class\n", "class_counts = y.value_counts()\n", "minority_class_count = class_counts.min()\n", "\n", "# Define the sampling strategy to keep the minority class intact\n", "sampling_strategy = {class_counts.idxmin(): minority_class_count, class_counts.idxmax(): minority_class_count}\n", "\n", "rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)\n", "X_resampled, y_resampled = rus.fit_resample(X, y)\n", "\n", "# Count the number of samples in each class after under-sampling\n", "print(\"\\nClass distribution after under-sampling:\")\n", "print(y_resampled.value_counts())\n", "\n", "# Split data into training and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)\n", "\n", "# Continuous and categorical features\n", "continuous_features = ['BMI', 'Sedentary time', 'Age', 'Daily sleep duration']\n", "categorical_features = [col for col in X_train.columns if col not in continuous_features]\n", "\n", "# Preprocessing pipelines\n", "preprocessor_standard = ColumnTransformer(\n", " transformers=[\n", " ('num', StandardScaler(), continuous_features),\n", " ('cat', OneHotEncoder(), categorical_features)\n", " ])\n", "\n", "preprocessor_no_transform = ColumnTransformer(\n", " transformers=[\n", " ('num', 'passthrough', continuous_features),\n", " ('cat', 'passthrough', categorical_features)\n", " ])\n" ] }, { "cell_type": "code", "execution_count": null, "id": "204ab6a1", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "# Define the models with their initial hyperparameter grids\n", "initial_param_grids = {\n", " 'Logistic Regression': {\n", " 'logreg__penalty': ['l2', 'l1'],\n", " 'logreg__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],\n", " 'logreg__solver': ['liblinear', 'saga'],\n", " 'logreg__class_weight': [None, 'balanced']\n", " },\n", " 'SVM': {\n", " 'svc__C': [0.1, 1, 10, 100],\n", " 'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],\n", " 'svc__gamma': ['scale', 'auto'],\n", " 'svc__class_weight': [None, 'balanced']\n", " },\n", " 'Random Forest': {\n", " 'rf__n_estimators': [100, 200, 300, 400, 500],\n", " 'rf__max_depth': [None, 10, 20, 30],\n", " 'rf__min_samples_split': [2, 5, 10],\n", " 'rf__min_samples_leaf': [1, 2, 4],\n", " 'rf__bootstrap': [True, False]\n", " },\n", " 'XGBoost': {\n", " 'xgb__n_estimators': [100, 200, 300, 400, 500],\n", " 'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],\n", " 'xgb__max_depth': [3, 4, 5, 6, 7],\n", " 'xgb__min_child_weight': [1, 3, 5],\n", " 'xgb__gamma': [0, 0.1, 0.2, 0.3],\n", " 'xgb__subsample': [0.6, 0.8, 1.0],\n", " 'xgb__colsample_bytree': [0.6, 0.8, 1.0]\n", " },\n", " 'LightGBM': {\n", " 'lgbm__num_leaves': [20, 30, 40, 50],\n", " 'lgbm__max_depth': [None, 10, 20, 30],\n", " 'lgbm__learning_rate': [0.01, 0.1, 0.2],\n", " 'lgbm__n_estimators': [100, 200, 300, 400, 500],\n", " 'lgbm__min_child_samples': [20, 30, 40],\n", " 'lgbm__subsample': [0.6, 0.8, 1.0]\n", " }\n", "}\n", "\n", "# Define models with default hyperparameters\n", "pipelines = {\n", " 'Logistic Regression': Pipeline([\n", " ('preprocessor', preprocessor_standard),\n", " ('logreg', LogisticRegression(max_iter=10000))\n", " ]),\n", " 'SVM': Pipeline([\n", " ('preprocessor', preprocessor_standard),\n", " ('svc', SVC(probability=True))\n", " ]),\n", " 'Random Forest': Pipeline([\n", " ('preprocessor', preprocessor_no_transform),\n", " ('rf', RandomForestClassifier())\n", " ]),\n", " 'XGBoost': Pipeline([\n", " ('preprocessor', preprocessor_no_transform),\n", " ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))\n", " ]),\n", " 'LightGBM': Pipeline([\n", " ('preprocessor', preprocessor_no_transform),\n", " ('lgbm', LGBMClassifier())\n", " ])\n", "}\n", "\n", "# Perform RandomizedSearchCV for each model\n", "best_params_initial = {}\n", "for model_name, pipeline in pipelines.items():\n", " print(f\"Tuning hyperparameters for {model_name}...\")\n", " random_search = RandomizedSearchCV(pipeline, param_distributions=initial_param_grids[model_name], \n", " n_iter=50, cv=5, scoring='roc_auc', n_jobs=-1, random_state=42)\n", " random_search.fit(X_train, y_train)\n", " best_params_initial[model_name] = random_search.best_params_\n", " print(f\"Best parameters for {model_name}: {random_search.best_params_}\")\n", " print(f\"Best AUC for {model_name}: {random_search.best_score_:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ddb0d404", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", "# Helper function to handle NoneType values in fine-tuning\n", "def get_fine_tuned_range(param, decrement=1, increment=1):\n", " if param is None:\n", " return [None]\n", " return [param - decrement, param, param + increment]\n", "\n", "# Fine-tuned hyperparameter grids based on initial tuning results\n", "fine_tuned_param_grids = {\n", " 'Logistic Regression': {\n", " 'logreg__C': get_fine_tuned_range(best_params_initial['Logistic Regression']['logreg__C'], 0.5, 0.5)\n", " },\n", " 'SVM': {\n", " 'svc__C': get_fine_tuned_range(best_params_initial['SVM']['svc__C'], 0.5, 0.5),\n", " 'svc__gamma': [best_params_initial['SVM']['svc__gamma']]\n", " },\n", " 'Random Forest': {\n", " 'rf__n_estimators': get_fine_tuned_range(best_params_initial['Random Forest']['rf__n_estimators'], 50, 50),\n", " 'rf__max_depth': get_fine_tuned_range(best_params_initial['Random Forest']['rf__max_depth'], 5, 5),\n", " 'rf__min_samples_split': get_fine_tuned_range(best_params_initial['Random Forest']['rf__min_samples_split'], 1, 1),\n", " 'rf__min_samples_leaf': get_fine_tuned_range(best_params_initial['Random Forest']['rf__min_samples_leaf'], 1, 1),\n", " 'rf__bootstrap': [best_params_initial['Random Forest']['rf__bootstrap']]\n", " },\n", " 'XGBoost': {\n", " 'xgb__n_estimators': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__n_estimators'], 50, 50),\n", " 'xgb__learning_rate': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__learning_rate'], 0.05, 0.05),\n", " 'xgb__max_depth': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__max_depth'], 1, 1),\n", " 'xgb__min_child_weight': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__min_child_weight'], 1, 1),\n", " 'xgb__gamma': get_fine_tuned_range(best_params_initial['XGBoost']['xgb__gamma'], 0.05, 0.05),\n", " 'xgb__subsample': [best_params_initial['XGBoost']['xgb__subsample']],\n", " 'xgb__colsample_bytree': [best_params_initial['XGBoost']['xgb__colsample_bytree']]\n", " },\n", " 'LightGBM': {\n", " 'lgbm__num_leaves': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__num_leaves'], 5, 5),\n", " 'lgbm__max_depth': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__max_depth'], 1, 1),\n", " 'lgbm__learning_rate': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__learning_rate'], 0.05, 0.05),\n", " 'lgbm__n_estimators': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__n_estimators'], 50, 50),\n", " 'lgbm__min_child_samples': get_fine_tuned_range(best_params_initial['LightGBM']['lgbm__min_child_samples'], 5, 5),\n", " 'lgbm__subsample': [best_params_initial['LightGBM']['lgbm__subsample']]\n", " }\n", "}\n", "\n", "# Perform GridSearchCV for each model\n", "best_params_fine_tuned = {}\n", "for model_name, pipeline in pipelines.items():\n", " print(f\"Fine-tuning hyperparameters for {model_name}...\")\n", " grid_search = GridSearchCV(pipeline, param_grid=fine_tuned_param_grids[model_name], \n", " cv=5, scoring='roc_auc', n_jobs=-1)\n", " grid_search.fit(X_train, y_train)\n", " best_params_fine_tuned[model_name] = grid_search.best_params_\n", " print(f\"Fine-tuned best parameters for {model_name}: {grid_search.best_params_}\")\n", " print(f\"Fine-tuned best AUC for {model_name}: {grid_search.best_score_:.4f}\")\n", "\n", "# Remove prefix from best parameters for each model\n", "def remove_prefix(best_params, prefix):\n", " return {key.split(f'{prefix}__')[1]: value for key, value in best_params.items()}\n", "\n", "best_params_processed = {\n", " 'Logistic Regression': remove_prefix(best_params_fine_tuned['Logistic Regression'], 'logreg'),\n", " 'SVM': remove_prefix(best_params_fine_tuned['SVM'], 'svc'),\n", " 'Random Forest': remove_prefix(best_params_fine_tuned['Random Forest'], 'rf'),\n", " 'XGBoost': remove_prefix(best_params_fine_tuned['XGBoost'], 'xgb'),\n", " 'LightGBM': remove_prefix(best_params_fine_tuned['LightGBM'], 'lgbm')\n", "}\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e6cd60b6", "metadata": {}, "outputs": [], "source": [ "from IPython.display import display, HTML\n", "\n", "# Define a function to calculate and plot metrics\n", "def calculate_metrics(models, X_test, y_test):\n", " metrics = {'Model': [], 'AUC': [], 'Accuracy': [], 'Recall': [], 'Precision': []}\n", " plt.figure(figsize=(14, 10), dpi=300)\n", " \n", " for name, model in models.items():\n", " model.fit(X_train, y_train)\n", " y_pred_proba = model.predict_proba(X_test)[:, 1]\n", " auc = roc_auc_score(y_test, y_pred_proba)\n", " fpr, tpr, _ = roc_curve(y_test, y_pred_proba)\n", " \n", " plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.4f})')\n", " \n", " # Calculate and store metrics\n", " y_pred = model.predict(X_test)\n", " accuracy = accuracy_score(y_test, y_pred)\n", " recall = recall_score(y_test, y_pred)\n", " precision = precision_score(y_test, y_pred)\n", " \n", " metrics['Model'].append(name)\n", " metrics['AUC'].append(auc)\n", " metrics['Accuracy'].append(accuracy)\n", " metrics['Recall'].append(recall)\n", " metrics['Precision'].append(precision)\n", " \n", " plt.plot([0, 1], [0, 1], 'k--')\n", " plt.xlabel('False Positive Rate')\n", " plt.ylabel('True Positive Rate')\n", " plt.title('The AUC-ROC Curves of 5 ML Models')\n", " plt.legend(loc='lower right')\n", " plt.grid(True)\n", " plt.tight_layout()\n", " plt.show()\n", " \n", " metrics_df = pd.DataFrame(metrics)\n", " return metrics_df\n", "\n", "# Models with fine-tuned parameters\n", "fine_tuned_models = {\n", " 'Logistic Regression': Pipeline([\n", " ('preprocessor', preprocessor_standard),\n", " ('logreg', LogisticRegression(**best_params_processed['Logistic Regression'], max_iter=10000))\n", " ]),\n", " 'SVM': Pipeline([\n", " ('preprocessor', preprocessor_standard),\n", " ('svc', SVC(**best_params_processed['SVM'], probability=True))\n", " ]),\n", " 'Random Forest': Pipeline([\n", " ('preprocessor', preprocessor_no_transform),\n", " ('rf', RandomForestClassifier(**best_params_processed['Random Forest']))\n", " ]),\n", " 'XGBoost': Pipeline([\n", " ('preprocessor', preprocessor_no_transform),\n", " ('xgb', XGBClassifier(**best_params_processed['XGBoost'], use_label_encoder=False, eval_metric='logloss'))\n", " ]),\n", " 'LightGBM': Pipeline([\n", " ('preprocessor', preprocessor_no_transform),\n", " ('lgbm', LGBMClassifier(**best_params_processed['LightGBM']))\n", " ])\n", "}\n", "\n", "# Calculate metrics and plot AUC-ROC curves\n", "metrics_df = calculate_metrics(fine_tuned_models, X_test, y_test)\n", "\n", "# Display the metrics in a table\n", "metrics_df_display = metrics_df.set_index('Model')\n", "display(HTML(metrics_df_display.to_html()))\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "62f68769", "metadata": {}, "outputs": [], "source": [ "pip install shap lightgbm\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f2e07572", "metadata": {}, "outputs": [], "source": [ "import shap\n", "import lightgbm as lgb\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "\n", "\n", "# 使用最佳参数重新训练LightGBM模型\n", "best_params_lgbm = best_params_processed['LightGBM']\n", "\n", "# 创建并训练LightGBM模型\n", "lgbm_model = LGBMClassifier(**best_params_lgbm)\n", "lgbm_model.fit(X_train, y_train)\n", "\n", "# 创建SHAP解释器\n", "explainer = shap.TreeExplainer(lgbm_model)\n", "shap_values = explainer.shap_values(X_test)\n", "\n", "# 创建图形对象\n", "fig, ax = plt.subplots(figsize=(8, 8), dpi=300)\n", "shap.summary_plot(shap_values, X_test, plot_type=\"bar\", feature_names=X_test.columns, show=False)\n", "plt.title('Feature Importance (Bar Plot)', fontsize=16)\n", "plt.savefig('shap_bar_plot.png', bbox_inches='tight')\n", "plt.close(fig)\n", "\n", "fig, ax = plt.subplots(figsize=(8, 8), dpi=300)\n", "shap.summary_plot(shap_values, X_test, feature_names=X_test.columns, show=False)\n", "plt.title('Feature Importance (Bee Swarm Plot)', fontsize=16)\n", "plt.savefig('shap_bee_swarm_plot.png', bbox_inches='tight')\n", "plt.close(fig)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1945a2d9", "metadata": {}, "outputs": [], "source": [ "from PIL import Image, ImageDraw, ImageFont\n", "\n", "# 读取保存的图像\n", "bar_plot = Image.open('shap_bar_plot.png')\n", "bee_swarm_plot = Image.open('shap_bee_swarm_plot.png')\n", "\n", "# 创建并排拼接的图像\n", "total_width = bar_plot.width + bee_swarm_plot.width\n", "max_height = max(bar_plot.height, bee_swarm_plot.height)\n", "\n", "new_img = Image.new('RGB', (total_width, max_height), (255, 255, 255))\n", "new_img.paste(bar_plot, (0, 0))\n", "new_img.paste(bee_swarm_plot, (bar_plot.width, 0))\n", "\n", "# 添加A和B标记\n", "draw = ImageDraw.Draw(new_img)\n", "font = ImageFont.truetype(\"arial\", 100)\n", "\n", "draw.text((20, 20), \"A\", fill=\"black\", font=font)\n", "draw.text((bar_plot.width + 20, 20), \"B\", fill=\"black\", font=font)\n", "\n", "# 保存并显示最终的拼接图像\n", "new_img.save('shap_combined_plot.png')\n", "new_img.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "56050b6a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }