Delete model_hyper_param_tuning.py

74bad1aa · sm1524 · b25b85a1 · b25b85a1
Commit 74bad1aa authored 1 month ago by sm1524
--- a/model_hyper_param_tuning.py
+++ b/model_hyper_param_tuning.py
-"""
-This script performs data preprocessing, model training, hyperparameter optimization,
-and evaluation for Acute Kidney Injury (AKI) prediction using a neural network.
-
-**Main Components:**
-1. **Preprocessing**: Converts raw data into a format suitable for model training. Includes:
-   - Handling missing values.
-   - Calculating statistical features (mean, median, std deviation, etc.).
-   - Normalizing the data.
-   - Balancing class distribution (oversampling the minority class). This helps training
-   achieve better accuracy and F3 score.
-
-2. **Model Training**: Defines a neural network (AKINet) and trains it using cross-validation
-   and hyperparameter optimization (via Optuna).
-
-3. **Hyperparameter Optimization**:
-   - Explores the best configuration of hidden layers, learning rate, batch size, and thresholds.
-   - Uses 5-fold cross-validation for robust evaluation.
-
-4. **Splitting Data**:
-   - Splits the dataset into train/test subsets: 80% for training and 20% for testing.
-   - 5-fold cross-validation further splits the training data into folds, where 80% is used for actual training
-   and 20% is used for validation. The validation set serves to assess whether the performance of the model is still
-   improving with more epochs, so that early stopping can find the point at which the model generalizes best. This is
-   repeated across all folds for a given hyperparameter set; the F3 score is averaged over the folds, and accuracy
-   and F3 score are also assessed on the held-out test set.
-
-5. **Outputs**:
-   - Saves the best hyperparameters and normalization constants as JSON files.
-   - Outputs the trained model.
-"""
-
-
-
-
-
-import torch
-import torch.nn as nn
-from torch.optim import Adam
-from torch.utils.data import DataLoader, Dataset
-from sklearn.model_selection import train_test_split, StratifiedKFold
-from sklearn.metrics import fbeta_score, accuracy_score
-import optuna
-import numpy as np
-import pandas as pd
-import json
-
-def save_dict_to_json(dictionary, filename):
-    """
-    Write a dictionary to disk as JSON indented by four spaces.
-
-    Args:
-        dictionary (dict): Mapping to serialize; it is handed to the ``json`` module unchanged.
-        filename (str): Path of the JSON file to create or overwrite.
-    """
-
-    with open(filename, 'w') as handle:
-        handle.write(json.dumps(dictionary, indent=4))
-
-def _creatinine_features(dates, values):
-    """
-    Derive the creatinine features of one patient from paired test dates and results.
-
-    The most recent test is the "latest" one (the first one wins on equal dates); every
-    other test taken at most 365 days before it counts as a "previous" value.
-
-    Args:
-        dates (list): Test dates (pandas timestamps, NaT for missing).
-        values (list): Test results, aligned position by position with ``dates``.
-
-    Returns:
-        tuple | None: ``(latest_value, median_previous, mean_previous, std_dev_previous,
-        abs_percentage_diff)``, or ``None`` when no test has both a date and a result.
-    """
-    tests = [(d, float(v)) for d, v in zip(dates, values) if pd.notna(d) and pd.notna(v)]
-    if not tests:
-        return None
-
-    # Fix the latest test first, so the 365-day window is measured from the true
-    # latest date and not from whichever date happened to be seen so far.
-    latest_idx = max(range(len(tests)), key=lambda i: tests[i][0])
-    latest_date, latest_value = tests[latest_idx]
-    previous_values = [
-        v for i, (d, v) in enumerate(tests)
-        if i != latest_idx and (latest_date - d).days <= 365
-    ]
-
-    if previous_values:
-        series = pd.Series(previous_values)
-        median_previous = float(series.median())
-        mean_previous = float(series.mean())
-        std_dev_previous = float(series.std(ddof=0))
-    else:
-        # Without history the latest value stands in for the baseline.
-        median_previous = latest_value
-        mean_previous = latest_value
-        std_dev_previous = 0.0
-
-    # A zero baseline would divide by zero; report "no relative change" instead.
-    if mean_previous == 0:
-        abs_percentage_diff = 0.0
-    else:
-        abs_percentage_diff = abs((latest_value - mean_previous) / mean_previous)
-
-    return latest_value, median_previous, mean_previous, std_dev_previous, abs_percentage_diff
-
-
-def preprocessor(filename):
-    """
-    Preprocess the dataset by calculating statistical creatinine features,
-    normalizing data, and balancing classes.
-
-    Rows without any creatinine test that has both a date and a result are skipped,
-    because no feature can be computed for them.
-
-    NOTE(review): normalization and oversampling are applied to the whole dataset here,
-    while run_optuna_with_fixed_test splits train/test afterwards, so noisy copies of
-    minority rows can end up in the test set. Confirm whether that is intended.
-
-    Args:
-        filename (str): Path to the CSV file. It must contain 'age', 'sex', 'aki' and
-            paired 'creatinine_date_*' / 'creatinine_result_*' columns; date and result
-            columns are paired by their order of appearance.
-
-    Returns:
-        tuple: Processed dataframe and normalization constants
-        (``{column: {'mean': float, 'std': float}}``).
-    """
-    # Load the dataset
-    df = pd.read_csv(filename)
-
-    # Identify columns with creatinine test dates and results
-    date_columns = [col for col in df.columns if 'creatinine_date_' in col]
-    result_columns = [col for col in df.columns if 'creatinine_result_' in col]
-
-    # Convert date columns to datetime format
-    for col in date_columns:
-        df[col] = pd.to_datetime(df[col], errors='coerce')  # Handle invalid dates as NaT
-
-    new_rows = []
-    for _, row in df.iterrows():
-        features = _creatinine_features(
-            [row[col] for col in date_columns],
-            [row[col] for col in result_columns],
-        )
-        if features is None:
-            continue  # no usable creatinine test for this patient
-
-        sex = 1 if str(row['sex']).lower() == 'm' else 0  # Convert 'sex' to binary
-        aki = 1 if str(row['aki']).lower() == 'y' else 0  # Convert 'aki' to binary
-        new_rows.append([row['age'], sex, aki, *features])
-
-    # Create a new DataFrame with processed features
-    new_df = pd.DataFrame(new_rows, columns=[
-        'age', 'sex', 'aki', 'latest_creatinine_value',
-        'median_previous', 'mean_previous', 'std_dev_previous',
-        'abs_percentage_diff'
-    ])
-
-    numeric_columns = [
-        'age', 'latest_creatinine_value', 'median_previous',
-        'mean_previous', 'std_dev_previous', 'abs_percentage_diff'
-    ]
-
-    # Normalize numeric features and save normalization constants
-    normalization_constants = {}
-    for col in numeric_columns:
-        mean = float(new_df[col].mean())
-        std = float(new_df[col].std())
-        if not std > 0:
-            # Constant column (std == 0) or too few rows (std is NaN): only center it,
-            # otherwise the division would fill the column with NaN/inf.
-            std = 1.0
-        normalization_constants[col] = {'mean': mean, 'std': std}
-        new_df[col] = (new_df[col] - mean) / std
-
-    # Handle class imbalance via oversampling and Gaussian noise addition.
-    aki_counts = new_df['aki'].value_counts()
-    print(f"Class counts: {aki_counts}")
-    imbalance_ratio = aki_counts.min() / aki_counts.max()
-
-    if imbalance_ratio < 0.5:
-        print("Class imbalance detected. Oversampling the minority class...")
-        minority_class = new_df[new_df['aki'] == aki_counts.idxmin()]
-        num_samples = int(aki_counts.max() - aki_counts.min())
-        oversampled_minority = minority_class.sample(n=num_samples, replace=True, random_state=42).copy()
-
-        # Add Gaussian noise so the duplicated rows are not exact copies. The generator
-        # is seeded like the sampling above, which keeps the output reproducible.
-        rng = np.random.default_rng(42)
-        noise = rng.normal(0, 0.5, size=(len(oversampled_minority), len(numeric_columns)))
-        oversampled_minority[numeric_columns] += noise
-
-        new_df = pd.concat([new_df, oversampled_minority], ignore_index=True)
-
-    return new_df, normalization_constants
-
-
-
-
-
-
-# Dataset Class
-class AKIDataset(Dataset):
-    """
-    PyTorch Dataset that wraps a dataframe of AKI features and labels.
-    """
-
-    def __init__(self, data):
-        """
-        Keep the dataframe and split it into a feature matrix and a label vector.
-
-        Args:
-            data (pd.DataFrame): Dataframe with an 'aki' label column; every other
-                column is treated as a feature.
-        """
-
-        self.data = data
-        self.labels = data['aki'].values
-        self.features = data.drop(columns=['aki']).values
-
-    def __len__(self):
-        """Return the number of rows in the wrapped dataframe."""
-        return len(self.data)
-
-    def __getitem__(self, idx):
-        """Return the (features, label) pair at position ``idx`` as float32 tensors."""
-        feature_tensor = torch.tensor(self.features[idx], dtype=torch.float32)
-        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float32)
-        return feature_tensor, label_tensor
-
-# Neural Network Class
-class AKINet(nn.Module):
-    """
-    Neural network for AKI prediction.
-
-    A stack of ``num_hidden_layers`` Linear+ReLU layers followed by a single-unit
-    Linear layer and a Sigmoid, so the output is one value in [0, 1] per input row.
-    """
-
-    def __init__(self, input_size, num_hidden_layers, hidden_layer_size):
-        """
-        Initialize the network structure.
-
-        Args:
-            input_size (int): Number of input features.
-            num_hidden_layers (int): Number of hidden layers. With 0 the network is a
-                single Linear+Sigmoid layer applied directly to the input.
-            hidden_layer_size (int): Size of each hidden layer.
-        """
-
-        super().__init__()
-        layers = []
-        # Track the width feeding the next layer so that the output layer is wired
-        # correctly even when there are no hidden layers at all.
-        in_features = input_size
-        for _ in range(num_hidden_layers):
-            layers.append(nn.Linear(in_features, hidden_layer_size))
-            layers.append(nn.ReLU())
-            in_features = hidden_layer_size
-        layers.append(nn.Linear(in_features, 1))
-        layers.append(nn.Sigmoid())
-        self.net = nn.Sequential(*layers)
-
-    def forward(self, x):
-        """Run ``x`` through the network and return the sigmoid outputs."""
-        return self.net(x)
-
-# Training and Validation Function
-def train_and_validate(model, train_loader, val_loader, criterion, optimizer, device, threshold, patience, max_epochs):
-    """
-    Train and validate the model with early stopping on the validation F3 score.
-
-    After training, the model is restored to the weights of the epoch with the best
-    validation F3 score (if any epoch scored above 0).
-
-    Args:
-        model (nn.Module): Neural network model.
-        train_loader (DataLoader): DataLoader for training data.
-        val_loader (DataLoader): DataLoader for validation data.
-        criterion (Loss): Loss function.
-        optimizer (Optimizer): Optimizer.
-        device (torch.device): Device to run the training on.
-        threshold (float): Classification threshold.
-        patience (int): Number of consecutive non-improving epochs before stopping.
-        max_epochs (int): Maximum number of epochs.
-
-    Returns:
-        tuple: Best validation F3 score and the validation accuracy of that same epoch.
-        If no epoch reaches an F3 above 0, the accuracy of the last epoch is returned;
-        with ``max_epochs == 0`` the result is ``(0, 0.0)``.
-    """
-    import copy  # local import: only needed to snapshot the best weights
-
-    model.to(device)
-    best_val_f3 = 0
-    best_val_acc = 0.0
-    best_state = None
-    patience_counter = 0
-    for epoch in range(max_epochs):
-        # Training
-        model.train()
-        for x_batch, y_batch in train_loader:
-            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
-            optimizer.zero_grad()
-            # reshape(-1) keeps a 1-d tensor even for a batch of one; a bare squeeze()
-            # would yield a 0-d tensor that no longer matches y_batch.
-            preds = model(x_batch).reshape(-1)
-            loss = criterion(preds, y_batch)
-            loss.backward()
-            optimizer.step()
-
-        # Validation
-        model.eval()
-        val_preds, val_labels = [], []
-        val_loss = 0
-        with torch.no_grad():
-            for x_batch, y_batch in val_loader:
-                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
-                preds = model(x_batch).reshape(-1)
-                val_loss += criterion(preds, y_batch).item()
-                val_preds.extend((preds > threshold).cpu().numpy())
-                val_labels.extend(y_batch.cpu().numpy())
-
-        # Metrics
-        val_f3 = fbeta_score(val_labels, val_preds, beta=3)
-        val_acc = accuracy_score(val_labels, val_preds)
-
-        # The validation metrics and the summed validation loss are printed for monitoring
-        print(f"Epoch {epoch + 1}/{max_epochs} - Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F3: {val_f3:.4f}")
-
-        # Early Stopping
-        if val_f3 > best_val_f3:  # the validation F3 score improved, so training continues
-            best_val_f3 = val_f3  # saving the best F3 score for this training run
-            best_val_acc = val_acc  # accuracy reported alongside the best F3
-            best_state = copy.deepcopy(model.state_dict())
-            patience_counter = 0
-        else:
-            patience_counter += 1
-            if best_state is None:
-                # No epoch has improved on the initial F3 of 0 yet: report the latest accuracy.
-                best_val_acc = val_acc
-
-        if patience_counter >= patience:  # no improvement for `patience` consecutive epochs
-            print("Early stopping triggered.")
-            break
-
-    # Leave the model at its best validation epoch, not `patience` epochs past it.
-    if best_state is not None:
-        model.load_state_dict(best_state)
-
-    return best_val_f3, best_val_acc
-
-# Modified Optuna Objective Function for Cross-Validation
-def _evaluate_on_loader(model, loader, device, threshold):
-    """Return (F3 score, accuracy) of ``model`` on ``loader`` at the given threshold."""
-    model.eval()
-    all_preds, all_labels = [], []
-    with torch.no_grad():
-        for x_batch, y_batch in loader:
-            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
-            # reshape(-1) keeps a 1-d tensor even when the last batch holds one sample.
-            preds = model(x_batch).reshape(-1)
-            all_preds.extend((preds > threshold).cpu().numpy())
-            all_labels.extend(y_batch.cpu().numpy())
-    return fbeta_score(all_labels, all_preds, beta=3), accuracy_score(all_labels, all_preds)
-
-
-def objective(trial, train_data, test_data, device):
-    """
-    Optuna objective: score one hyperparameter set with 5-fold cross-validation.
-
-    Samples the network depth and width, learning rate, batch size and classification
-    threshold from the trial, trains one model per stratified fold of ``train_data``,
-    and evaluates every fold model on ``test_data``. The test metrics are averaged
-    over the fold models and stored as trial user attributes ('average_f3', 'test_f3',
-    'test_accuracy'); they do not influence the optimization.
-
-    Args:
-        trial (optuna.trial.Trial): Trial used to sample the hyperparameters.
-        train_data (pd.DataFrame): Training data with an 'aki' label column.
-        test_data (pd.DataFrame): Held-out test data with the same columns.
-        device (torch.device): Device to run the training on.
-
-    Returns:
-        float: Average of the best validation F3 scores across the folds.
-    """
-    # Hyperparameters to search over
-    num_hidden_layers = trial.suggest_int("num_hidden_layers", 1, 5)
-    hidden_layer_size = trial.suggest_categorical("hidden_layer_size", [32, 64, 128, 256, 512, 1024])
-    learning_rate = trial.suggest_categorical("learning_rate", [0.001, 0.005, 0.01, 0.02, 0.05])
-    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256, 512, 1024])
-    threshold = trial.suggest_categorical("threshold", [0.25, 0.375, 0.5, 0.625, 0.75])
-
-    test_loader = DataLoader(AKIDataset(test_data), batch_size=batch_size, shuffle=False)
-
-    # 5-Fold Cross-Validation
-    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
-    f3_scores = []
-    test_f3_scores, test_acc_scores = [], []
-    for fold, (train_idx, val_idx) in enumerate(skf.split(train_data, train_data['aki'])):
-        # Split into train and validation sets for the current fold (80/20 with 5 folds)
-        train_fold = train_data.iloc[train_idx]
-        val_fold = train_data.iloc[val_idx]
-
-        # Create DataLoaders
-        train_loader = DataLoader(AKIDataset(train_fold), batch_size=batch_size, shuffle=True)
-        val_loader = DataLoader(AKIDataset(val_fold), batch_size=batch_size, shuffle=False)
-
-        # Initialize model, criterion, and optimizer
-        model = AKINet(input_size=train_fold.shape[1] - 1, num_hidden_layers=num_hidden_layers, hidden_layer_size=hidden_layer_size)
-        criterion = nn.BCELoss()
-        optimizer = Adam(model.parameters(), lr=learning_rate)
-
-        # Train and validate
-        best_f3, _ = train_and_validate(model, train_loader, val_loader, criterion, optimizer, device, threshold, patience=20, max_epochs=500)
-        f3_scores.append(best_f3)
-
-        # Score this fold's model on the fixed test set, so the reported test metrics
-        # reflect all five models and not only whichever fold was trained last.
-        fold_test_f3, fold_test_acc = _evaluate_on_loader(model, test_loader, device, threshold)
-        test_f3_scores.append(fold_test_f3)
-        test_acc_scores.append(fold_test_acc)
-
-    average_f3 = float(np.mean(f3_scores))
-    test_f3 = float(np.mean(test_f3_scores))
-    test_acc = float(np.mean(test_acc_scores))
-
-    # Print results for the current trial
-    print(f"Trial: {trial.number}, Avg F3 (CV): {average_f3:.4f}, Test F3: {test_f3:.4f}, Test Accuracy: {test_acc:.4f}")
-
-    # Record the metrics on the trial; only the CV average drives the optimization
-    trial.set_user_attr("average_f3", average_f3)
-    trial.set_user_attr("test_f3", test_f3)
-    trial.set_user_attr("test_accuracy", test_acc)
-
-    return average_f3
-
-# Run Optuna with Fixed Train-Test Split
-def run_optuna_with_fixed_test(data, n_trials=10):
-    """
-    Tune hyperparameters with Optuna on a fixed, stratified 80/20 train/test split.
-
-    Args:
-        data (pd.DataFrame): Preprocessed data containing an 'aki' label column.
-        n_trials (int): Number of optimization trials.
-
-    Returns:
-        optuna.trial.FrozenTrial: Best trial object.
-    """
-
-    # Use the GPU when one is available, otherwise fall back to the CPU.
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    # The same stratified split is reused by every trial.
-    train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['aki'], random_state=42)
-
-    def _run_trial(trial):
-        # Bind the fixed split and device so Optuna only has to supply the trial.
-        return objective(trial, train_data, test_data, device)
-
-    # The study maximizes the value returned by the objective (average CV F3 score).
-    study = optuna.create_study(direction="maximize", study_name="AKI Optimization")
-    study.optimize(_run_trial, n_trials=n_trials)
-
-    # Report the winning trial
-    winner = study.best_trial
-    print("\nBest trial:")
-    print(f"Hyperparameters: {winner.params}")
-    print(f"Average F3 Score (CV): {winner.value:.4f}")
-    print(f"Test F3 Score: {winner.user_attrs['test_f3']:.4f}")
-    print(f"Test Accuracy: {winner.user_attrs['test_accuracy']:.4f}")
-    return winner
-
-
-
-if __name__ == '__main__':
-    # Step 1: turn the raw CSV into normalized, class-balanced features.
-    print("Preprocessing the dataset...")
-    processed_data, normalization_constants = preprocessor('training.csv')
-    print("Dataset preprocessing completed.")
-
-    # Step 2: persist the per-column mean/std used for normalization.
-    save_dict_to_json(normalization_constants, 'normalization_constants.json')
-    print("Normalization constants saved to 'normalization_constants.json'.")
-
-    # Step 3: search for the best hyperparameters.
-    print("Starting hyperparameter optimization with Optuna...")
-    best_trial = run_optuna_with_fixed_test(processed_data, n_trials=10)
-
-    # Step 4: store the winning hyperparameters together with their metrics.
-    best_hyperparameters = best_trial.params
-    best_hyperparameters.update(
-        average_f3=best_trial.value,
-        test_f3=best_trial.user_attrs['test_f3'],
-        test_accuracy=best_trial.user_attrs['test_accuracy'],
-    )
-    save_dict_to_json(best_hyperparameters, 'best_hyperparameters.json')
-    print("Best hyperparameters saved to 'best_hyperparameters.json'.")
\ No newline at end of file