"""
This script preprocesses and combines training and testing datasets, trains the final model using the best hyperparameters, and saves the trained model and normalization constants.

**Main Components:**
1. **Data Preprocessing**:
   - Both `training.csv` and `test.csv` are preprocessed similarly to the initial preprocessing in the hyperparameter tuning phase.
   - Features such as mean, median, standard deviation, and absolute percentage difference are calculated.
   - The datasets are concatenated and normalized using computed normalization constants, which are saved as `final_normalization_constants.json`.

2. **Oversampling**:
   - The minority class in the combined dataset is oversampled to balance class distribution.
   - Gaussian noise is added to the oversampled data to improve generalization.

3. **Model Training**:
   - The final model is trained on the combined dataset using the best hyperparameters obtained during hyperparameter tuning.
   - A 90% train and 10% validation split is used for early stopping.
   - The model with the best validation F3 score is saved as `docker_model.pkl`.

**Outputs**:
   - Saves the final normalization constants and the trained model.
"""


import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.utils.data import DataLoader

from model_hyper_param_tuning import AKIDataset, AKINet


def load_saved_parameters():
    """
    Load the best hyperparameters saved during the tuning phase.

    Returns:
        dict: The best hyperparameters.
    """

    with open('best_hyperparameters.json', 'r') as f:
        best_hyperparameters = json.load(f)
    return best_hyperparameters


def full_data_preprocessor(filename1, filename2):
    """
    Preprocess and combine two datasets.

    Args:
        filename1 (str): Path to the first dataset (e.g., training.csv).
        filename2 (str): Path to the second dataset (e.g., test.csv).

    Returns:
        pd.DataFrame: Combined, preprocessed dataset.
    """


    # Load the first dataset
    df = pd.read_csv(filename1)

    # Identify columns with creatinine test dates and results

    date_columns = [col for col in df.columns if 'creatinine_date_' in col]
    result_columns = [col for col in df.columns if 'creatinine_result_' in col]

    # Convert date columns to datetime format
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')  # Handle invalid dates as NaT
    new_rows = []
    for _, row in df.iterrows():
        age = row['age']
        sex = 1 if str(row['sex']).lower() == 'm' else 0 # Convert 'sex' to binary
        aki = 1 if str(row['aki']).lower() == 'y' else 0 # Convert 'aki' to binary
        latest_date = None
        latest_value = None
        previous_values = []

        # Extract the latest test result and previous results within the last 365 days
        for date_col, result_col in zip(date_columns, result_columns):
            if pd.notna(row[date_col]):
                if latest_date is None or row[date_col] > latest_date:
                    if latest_value is not None:
                        if (latest_date - row[date_col]).days <= 365:
                            previous_values.append(latest_value)
                    latest_date = row[date_col]
                    latest_value = row[result_col]
                else:
                    if (latest_date - row[date_col]).days <= 365:
                        previous_values.append(row[result_col])
        # Calculate statistical features (mean, median, standard deviation) of previous values
        if previous_values:
            median_previous = float(pd.Series(previous_values).median())
            mean_previous = float(pd.Series(previous_values).mean())
            std_dev_previous = float(pd.Series(previous_values).std(ddof=0))
        else:
            median_previous = latest_value
            mean_previous = latest_value
            std_dev_previous = 0.0

        # Compute the absolute percentage difference
        abs_percentage_diff = abs((latest_value - mean_previous) / mean_previous)

        # Append the processed row
        new_rows.append(
            [age, sex, aki, latest_value, median_previous, mean_previous, std_dev_previous, abs_percentage_diff])

    # Create a new DataFrame with processed features
    new_df = pd.DataFrame(new_rows, columns=['age', 'sex', 'aki', 'latest_creatinine_value',
                                             'median_previous', 'mean_previous', 'std_dev_previous',
                                             'abs_percentage_diff'])
    # Read the second dataset (e.g. test.csv)
    df = pd.read_csv(filename2)

    # Identify columns with creatinine test dates and results
    date_columns = [col for col in df.columns if 'creatinine_date_' in col]
    result_columns = [col for col in df.columns if 'creatinine_result_' in col]

    # Convert date columns to datetime format
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')  # Handle invalid dates as NaT
    new_rows = []
    for _, row in df.iterrows():
        age = row['age']
        sex = 1 if str(row['sex']).lower() == 'm' else 0 # Convert 'sex' to binary
        aki = 1 if str(row['aki']).lower() == 'y' else 0 # Convert 'aki' to binary

        latest_date = None
        latest_value = None
        previous_values = []

        # Extract the latest test result and previous results within the last 365 days
        for date_col, result_col in zip(date_columns, result_columns):
            if pd.notna(row[date_col]):
                if latest_date is None or row[date_col] > latest_date:
                    if latest_value is not None:
                        if (latest_date - row[date_col]).days <= 365:
                            previous_values.append(latest_value)
                    latest_date = row[date_col]
                    latest_value = row[result_col]
                else:
                    if (latest_date - row[date_col]).days <= 365:
                        previous_values.append(row[result_col])

        # Calculate statistical features (mean, median, standard deviation) of previous values
        if previous_values:
            median_previous = float(pd.Series(previous_values).median())
            mean_previous = float(pd.Series(previous_values).mean())
            std_dev_previous = float(pd.Series(previous_values).std(ddof=0))
        else:
            median_previous = latest_value
            mean_previous = latest_value
            std_dev_previous = 0.0

        # Compute the absolute percentage difference
        abs_percentage_diff = abs((latest_value - mean_previous) / mean_previous)

        # Append the processed row
        new_rows.append(
            [age, sex, aki, latest_value, median_previous, mean_previous, std_dev_previous, abs_percentage_diff])

    # Create a new DataFrame with processed features
    new_df2 = pd.DataFrame(new_rows, columns=['age', 'sex', 'aki', 'latest_creatinine_value',
                                             'median_previous', 'mean_previous', 'std_dev_previous',
                                             'abs_percentage_diff'])

    # Combine the two dataframes into a single large training dataset
    df_final = pd.concat([new_df, new_df2], ignore_index=True)

    # Normalize numeric features and save normalization constants
    normalization_constants = {}
    for col in ['age', 'latest_creatinine_value', 'median_previous', 'mean_previous', 'std_dev_previous',
                'abs_percentage_diff']:
        mean = df_final[col].mean()
        std = df_final[col].std()
        normalization_constants[col] = {'mean': mean, 'std': std}
        df_final[col] = (df_final[col] - mean) / std

    # Save the normalization constants so that any future test set can be normalized
    # with the same constants.
    with open('final_normalization_constants.json', 'w') as f:
        json.dump(normalization_constants, f)
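    # The saved JSON maps each feature name to its constants,
    # e.g. {"age": {"mean": <mean>, "std": <std>}, ...}.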


    # Handle class imbalance via oversampling and Gaussian noise addition.
    aki_counts = df_final['aki'].value_counts()
    print(f"Class counts: {aki_counts}")
    imbalance_ratio = aki_counts.min() / aki_counts.max()
    if imbalance_ratio < 0.5:
        print("Class imbalance detected. Oversampling the minority class...")
        minority_class = df_final[df_final['aki'] == aki_counts.idxmin()]
        num_samples = aki_counts.max() - aki_counts.min()
        oversampled_minority = minority_class.sample(n=num_samples, replace=True, random_state=42)
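        # Drawing (max - min) extra minority rows with replacement makes the two classes exactly balanced.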

        # Add Gaussian noise to allow the model to generalise better
        numeric_columns = ['age', 'latest_creatinine_value', 'median_previous', 'mean_previous', 'std_dev_previous',
                           'abs_percentage_diff']
        noise = np.random.normal(0, 0.5, size=oversampled_minority[numeric_columns].shape)
        oversampled_minority[numeric_columns] += noise
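        # Note: normalization has already been applied above, so the noise standard
        # deviation of 0.5 is expressed in z-score units of each feature.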

        df_final = pd.concat([df_final, oversampled_minority], ignore_index=True)
    return df_final



def train_final_model_on_combined_data(data, best_hyperparameters, device):
    """
    Train the final model on the combined dataset.

    Args:
        data (pd.DataFrame): Combined and preprocessed dataset.
        best_hyperparameters (dict): Best hyperparameters from tuning.
        device (torch.device): Device for training.

    Returns:
        nn.Module: Trained model.
    """

    # Split data into training and validation sets
    train_data, val_data = train_test_split(
        data, test_size=0.1, stratify=data['aki'], random_state=42
    )  # 90% train, 10% validation

    # Prepare DataLoaders
    batch_size = best_hyperparameters['batch_size']
    train_loader = DataLoader(
        AKIDataset(train_data), batch_size=batch_size, shuffle=True
    )
    val_loader = DataLoader(
        AKIDataset(val_data), batch_size=batch_size, shuffle=False
    )

    # Initialize model, criterion, and optimizer
    model = AKINet(
        input_size=train_data.shape[1] - 1,
        num_hidden_layers=best_hyperparameters['num_hidden_layers'],
        hidden_layer_size=best_hyperparameters['hidden_layer_size']
    )
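    # input_size counts the feature columns only: the eight columns of the combined
    # DataFrame minus the 'aki' label leave seven input features.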
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=best_hyperparameters['learning_rate'])

    # Training loop with early stopping
    best_val_f3 = 0
    patience_counter = 0
    patience = 20
    max_epochs = 500

    for epoch in range(max_epochs):
        # Training
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            preds = model(x_batch).squeeze()
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                preds = model(x_batch).squeeze()
                val_preds.extend((preds > best_hyperparameters['threshold']).cpu().numpy())
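                # The decision threshold chosen during hyperparameter tuning is reused here
                # to binarize the model's probability outputs.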
                val_labels.extend(y_batch.cpu().numpy())

        # Early Stopping
        # Calculate metrics on the validation set for F3 score
        val_f3 = fbeta_score(val_labels, val_preds, beta=3)
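        # With beta=3, recall is weighted nine times (beta**2) more heavily than precision.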
        if val_f3 > best_val_f3:  # if the validation F3 score improves, reset patience and checkpoint the model
            best_val_f3 = val_f3
            patience_counter = 0
            torch.save(model.state_dict(), "docker_model.pkl")  # Save the best model
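            # Only the state_dict (weights) is saved, so loading it later requires
            # re-instantiating AKINet with the same architecture hyperparameters.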
        else:
            patience_counter += 1

        if patience_counter >= patience:  # stop training if validation F3 has not improved for `patience` epochs
            print("Early stopping triggered.")
            break

    print(f"Final Validation F3 Score on Combined Data: {best_val_f3:.4f}")
    return model
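

# A minimal sketch (not called anywhere in this script) of how the saved artifacts could be
# reloaded for inference. It assumes the seven input features produced by
# full_data_preprocessor and the architecture settings stored in best_hyperparameters.json.
def load_final_model(model_path="docker_model.pkl",
                     hyperparameters_path="best_hyperparameters.json",
                     device=torch.device("cpu")):
    """Rebuild AKINet with the saved architecture and load the trained weights."""
    with open(hyperparameters_path, 'r') as f:
        params = json.load(f)
    model = AKINet(
        input_size=7,  # age, sex, latest_creatinine_value, median/mean/std of previous values, abs_percentage_diff
        num_hidden_layers=params['num_hidden_layers'],
        hidden_layer_size=params['hidden_layer_size']
    )
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    return model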


if __name__ == '__main__':
    # Load best hyperparameters
    best_hyperparameters = load_saved_parameters()
    # Concatenate and preprocess training.csv and test.csv
    print("Combining and preprocessing training.csv and test.csv...")
    data = full_data_preprocessor('training.csv', 'test.csv')
    # Train the final model
    print("Training the final model on the combined dataset...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_final_model_on_combined_data(data, best_hyperparameters, device)
    print("Final model saved as 'docker_model.pkl' with normalization constants.")