#!/usr/bin/env python
# coding: utf-8

# In[40]:


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (must precede the IterativeImputer import)
from sklearn.impute import IterativeImputer

# Extracted colors
left_color_hex = '#72b6a1'  # green
right_color_hex = '#e99675'  # orange

# Set global font to Times New Roman and increase font sizes
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman']
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 12
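# (If Times New Roman is not installed, matplotlib falls back to another
# available serif font and emits a findfont warning.)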

# Load the dataset
file_path = 'S4.Raw Data.xlsx'
data = pd.read_excel(file_path)
target = 'T2DM'

# Handle NaN values using IterativeImputer (round-robin, model-based imputation)
imputer = IterativeImputer(random_state=42)
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
# fit_transform returns floats for every column; round the target back to a
# binary 0/1 label so counts and axis labels render as integers
data_imputed[target] = data_imputed[target].round().astype(int)
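# Caveat: the imputer is fit on all rows before the train/test split, so the
# eventual test set informs the imputation model (a mild form of leakage).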

# Report original distribution of the target data
print("Original distribution of the target data (T2DM):")
print(data_imputed[target].value_counts())

# Plot original distribution of the target data
plt.figure(figsize=(14, 6), dpi=300)  # Increase figure size and DPI for better clarity

plt.subplot(1, 2, 1)
sns.countplot(x=target, data=data_imputed, palette='Set2')
plt.title('Original Distribution in Raw Dataset')
plt.xlabel('T2DM')
plt.ylabel('Counts')
plt.grid(True, linestyle='--', alpha=0.6)

# Annotate counts on the original distribution plot
for p in plt.gca().patches:
    plt.gca().annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2, p.get_height()),
                       ha='center', va='bottom', fontsize=10, color='black')

# Exclude columns 'OGTT', 'FPG', 'T2DM_Report' from features
exclude_columns = ['OGTT', 'FPG', 'T2DM_Report']
X = data_imputed.drop(columns=[target] + exclude_columns)
y = data_imputed[target]
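# (OGTT, FPG, and T2DM_Report are presumably dropped because they measure or
# report the diagnosis directly, which would leak the label into the features.)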

# Count the number of samples in each class
class_counts = y.value_counts()
minority_class_count = class_counts.min()

# Undersample the majority class down to the minority class size (for a binary
# target this is equivalent to sampling_strategy='auto')
sampling_strategy = {class_counts.idxmin(): minority_class_count, class_counts.idxmax(): minority_class_count}

rus = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
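# RandomUnderSampler keeps the minority class as-is and drops a random subset
# of the majority class, so both classes end up with minority_class_count rows.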

# Report distribution after undersampling
print("\nDistribution after RandomUnderSampling:")
print(pd.Series(y_resampled).value_counts())

# Create a DataFrame with the resampled data
resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
resampled_data[target] = y_resampled

# Plot the distribution of the target data after applying Random UnderSampler
plt.subplot(1, 2, 2)
sns.countplot(x=target, data=resampled_data, palette='Set2')
plt.title('Distribution After RandomUnderSampling in Raw Dataset')
plt.xlabel('T2DM')
plt.ylabel('Counts')
plt.grid(True, linestyle='--', alpha=0.6)

# Annotate counts on the resampled distribution plot
for p in plt.gca().patches:
    plt.gca().annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2, p.get_height()),
                       ha='center', va='bottom', fontsize=10, color='black')

# Add labels "A" and "B" to the plots
plt.subplot(1, 2, 1)
plt.text(-0.1, 1.05, 'A', transform=plt.gca().transAxes, fontsize=20, fontweight='bold')

plt.subplot(1, 2, 2)
plt.text(-0.1, 1.05, 'B', transform=plt.gca().transAxes, fontsize=20, fontweight='bold')

# Show the plots
plt.tight_layout()
plt.show()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
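# Note: stratify=y_resampled could be passed here to guarantee the exact 50/50
# balance in both splits; after undersampling, a random split is already close.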

# Apply RF-RFE for feature selection
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=rf, n_features_to_select=1, step=1)
rfe.fit(X_train, y_train)
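# With n_features_to_select=1 and step=1, RFE drops one feature per iteration,
# so ranking_ is a complete ordering of all features (1 = most important).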

# Get feature rankings
ranking = rfe.ranking_
features = X.columns
ranked_features = sorted(zip(ranking, features))

# Determine the optimal number of features by maximizing AUC
auc_scores = []
for i in range(1, len(features) + 1):
    selected_features = [f for rank, f in ranked_features[:i]]
    rf.fit(X_train[selected_features], y_train)
    y_pred = rf.predict_proba(X_test[selected_features])[:, 1]
    auc = roc_auc_score(y_test, y_pred)
    auc_scores.append(auc)

optimal_num_features = auc_scores.index(max(auc_scores)) + 1
selected_features = [f for rank, f in ranked_features[:optimal_num_features]]

# Report the selected features
print("\nOptimal number of features:", optimal_num_features)
print("Selected features:", selected_features)

# Plot AUC scores vs. number of features
plt.figure(figsize=(10, 6), dpi=300)
plt.plot(range(1, len(features) + 1), auc_scores, marker='o', color=left_color_hex)
plt.xlabel('Number of Features')
plt.ylabel('AUC Score')
plt.title('AUC Scores vs. Number of Features')
plt.axvline(optimal_num_features, color='red', linestyle='--')
plt.text(optimal_num_features - 3.2, max(auc_scores) + 0.01, f'Optimal Number: {optimal_num_features}', color='red', ha='left')
plt.grid(True, linestyle='--', alpha=0.6)
plt.xticks(range(1, len(features) + 1))  # Ensure x-axis has integer ticks
plt.tight_layout()
plt.show()

