"""
Final model training script.

This script preprocesses and combines the training and testing datasets, trains
the final model using the best hyperparameters, and saves the trained model and
the normalization constants.

**Main Components:**
1. **Data Preprocessing**:
   - Both `training.csv` and `test.csv` are preprocessed in the same way as in the
     initial preprocessing of the hyperparameter tuning phase.
   - Features such as the mean, median, standard deviation, and absolute percentage
     difference are calculated.
   - The datasets are concatenated and normalized using computed normalization
     constants, which are saved as `final_normalization_constants.json`.

2. **Oversampling**:
   - The minority class in the combined dataset is oversampled to balance the
     class distribution.
   - Gaussian noise is added to the oversampled data to improve generalization.

3. **Model Training**:
   - The final model is trained on the combined dataset using the best
     hyperparameters obtained during hyperparameter tuning.
   - A 90% train / 10% validation split is used for early stopping.
   - The model with the best validation F3 score is saved as `docker_model.pkl`.

4. **Outputs**:
   - Saves the final normalization constants and the trained model.
"""

import copy
import inspect
import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.utils.data import DataLoader

from model_hyper_param_tuning import AKIDataset, AKINet

def load_saved_parameters():
    """Load the best hyperparameters saved during the tuning phase.

    Reads ``best_hyperparameters.json`` from the current working directory.

    Returns:
        dict: The best hyperparameters.

    Raises:
        FileNotFoundError: If ``best_hyperparameters.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open('best_hyperparameters.json', 'r', encoding='utf-8') as f:
        best_hyperparameters = json.load(f)
    return best_hyperparameters

# Columns produced for every patient row, in output order.
_PROCESSED_COLUMNS = ['age', 'sex', 'aki', 'latest_creatinine_value',
                      'median_previous', 'mean_previous', 'std_dev_previous',
                      'abs_percentage_diff']

# Continuous features that are z-score normalised and perturbed when oversampling.
# NOTE(review): the tail of this list was truncated in the text this block was
# rebuilt from; 'abs_percentage_diff' is assumed to be the missing last entry —
# confirm against the preprocessing used in the tuning script.
_NUMERIC_COLUMNS = ['age', 'latest_creatinine_value', 'median_previous',
                    'mean_previous', 'std_dev_previous', 'abs_percentage_diff']

# Only results taken at most this many days before the latest test count as
# "previous" values.
_LOOKBACK_DAYS = 365


def _extract_creatinine_features(df):
    """Turn one raw patient table into the per-patient feature table.

    Args:
        df (pd.DataFrame): Raw table with ``age``, ``sex``, ``aki`` and paired
            ``creatinine_date_*`` / ``creatinine_result_*`` columns. The date
            columns are converted to datetimes in place.

    Returns:
        pd.DataFrame: One row per patient that has at least one dated result,
        with the columns listed in ``_PROCESSED_COLUMNS`` (not yet normalised).
    """
    # Identify columns with creatinine test dates and results. The two lists
    # are paired positionally, so they rely on the columns appearing in the
    # same order in the file.
    date_columns = [col for col in df.columns if 'creatinine_date_' in col]
    result_columns = [col for col in df.columns if 'creatinine_result_' in col]

    # Convert date columns to datetime format; invalid dates become NaT.
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

    rows = []
    for _, row in df.iterrows():
        sex = 1 if str(row['sex']).lower() == 'm' else 0  # 'm' -> 1, anything else -> 0
        aki = 1 if str(row['aki']).lower() == 'y' else 0  # 'y' -> 1, anything else -> 0

        # Keep only tests that have both a valid date and a result.
        measurements = [
            (row[date_col], float(row[result_col]))
            for date_col, result_col in zip(date_columns, result_columns)
            if pd.notna(row[date_col]) and pd.notna(row[result_col])
        ]
        if not measurements:
            # Without any measurement no feature can be computed; skip the row
            # instead of failing on arithmetic with None.
            continue

        # The first measurement carrying the maximal date is the latest one.
        latest_idx = max(range(len(measurements)), key=lambda i: measurements[i][0])
        latest_date, latest_value = measurements[latest_idx]

        # The look-back window is measured from the final latest date, so a
        # result older than the window is never counted as "previous".
        previous_values = [
            value
            for i, (date, value) in enumerate(measurements)
            if i != latest_idx and (latest_date - date).days <= _LOOKBACK_DAYS
        ]

        # Statistical features of the previous values; with no history the
        # latest value stands in for the baseline.
        if previous_values:
            history = pd.Series(previous_values, dtype=float)
            median_previous = float(history.median())
            mean_previous = float(history.mean())
            std_dev_previous = float(history.std(ddof=0))
        else:
            median_previous = latest_value
            mean_previous = latest_value
            std_dev_previous = 0.0

        # Absolute relative change against the baseline; a zero baseline would
        # divide by zero, so it is reported as "no change".
        if mean_previous != 0:
            abs_percentage_diff = abs((latest_value - mean_previous) / mean_previous)
        else:
            abs_percentage_diff = 0.0

        rows.append([row['age'], sex, aki, latest_value, median_previous,
                     mean_previous, std_dev_previous, abs_percentage_diff])

    return pd.DataFrame(rows, columns=_PROCESSED_COLUMNS)


def full_data_preprocessor(filename, filename2):
    """Preprocess and combine two datasets.

    Both files go through the same feature extraction, are concatenated,
    z-score normalised, and finally rebalanced by oversampling the minority
    class with added Gaussian noise.

    Side effects:
        Writes the per-column normalisation constants to
        ``final_normalization_constants.json`` in the working directory and
        prints the class counts.

    Args:
        filename (str): Path to the first dataset (e.g., training.csv).
        filename2 (str): Path to the second dataset (e.g., test.csv).

    Returns:
        pd.DataFrame: Combined, preprocessed dataset.
    """
    # Combine the two tables into a single large training dataset.
    frames = [_extract_creatinine_features(pd.read_csv(path))
              for path in (filename, filename2)]
    df_final = pd.concat(frames, ignore_index=True)

    # Normalize numeric features and record the constants that were used.
    normalization_constants = {}
    for col in _NUMERIC_COLUMNS:
        mean = float(df_final[col].mean())
        std = float(df_final[col].std())
        if not std > 0:
            # A constant column (std == 0) or a single row (std is NaN) would
            # turn the whole feature into NaN; centre it without rescaling.
            std = 1.0
        normalization_constants[col] = {'mean': mean, 'std': std}
        df_final[col] = (df_final[col] - mean) / std

    # Save the constants so that future data can be normalised identically.
    with open('final_normalization_constants.json', 'w') as f:
        json.dump(normalization_constants, f)

    # Handle class imbalance via oversampling and Gaussian noise addition.
    aki_counts = df_final['aki'].value_counts()
    print(f"Class counts: {aki_counts}")
    imbalance_ratio = aki_counts.min() / aki_counts.max()
    if imbalance_ratio < 0.5:
        print("Class imbalance detected. Oversampling the minority class...")
        minority_class = df_final[df_final['aki'] == aki_counts.idxmin()]
        num_samples = int(aki_counts.max() - aki_counts.min())
        oversampled_minority = minority_class.sample(
            n=num_samples, replace=True, random_state=42
        ).copy()

        # Add Gaussian noise so the duplicated rows are not exact copies. The
        # generator is seeded like the sampling above so runs are reproducible.
        rng = np.random.default_rng(42)
        noise = rng.normal(0.0, 0.5, size=(len(oversampled_minority), len(_NUMERIC_COLUMNS)))
        oversampled_minority[_NUMERIC_COLUMNS] = (
            oversampled_minority[_NUMERIC_COLUMNS].to_numpy() + noise
        )

        df_final = pd.concat([df_final, oversampled_minority], ignore_index=True)
    return df_final

def train_final_model_on_combined_data(data, best_hyperparameters, device):
    """Train the final model on the combined dataset.

    The data is split 90/10 (stratified on ``aki``) and the model is trained
    with Adam and binary cross-entropy. After every epoch the F3 score on the
    validation split is computed; each improvement is written to
    ``docker_model.pkl`` and training stops once the score has not improved
    for 20 consecutive epochs (or after 500 epochs).

    Args:
        data (pd.DataFrame): Combined and preprocessed dataset.
        best_hyperparameters (dict): Best hyperparameters from tuning. The keys
            ``batch_size``, ``learning_rate`` and ``threshold`` are read here.
        device (torch.device): Device for training.

    Returns:
        nn.Module: Trained model, carrying the weights of the best validation
        epoch when at least one epoch improved on the initial score.
    """
    # Split data into training and validation sets (90% train, 10% validation).
    train_data, val_data = train_test_split(
        data, test_size=0.1, stratify=data['aki'], random_state=42
    )

    # Prepare DataLoaders.
    batch_size = best_hyperparameters['batch_size']
    train_loader = DataLoader(
        AKIDataset(train_data), batch_size=batch_size, shuffle=True
    )
    val_loader = DataLoader(
        AKIDataset(val_data), batch_size=batch_size, shuffle=False
    )

    # NOTE(review): the constructor arguments after ``input_size`` were lost in
    # the text this block was rebuilt from. Instead of guessing their names,
    # every hyperparameter whose key matches a constructor parameter of AKINet
    # is forwarded — confirm against the AKINet definition in the tuning module.
    accepted = inspect.signature(AKINet).parameters
    model_kwargs = {
        key: value
        for key, value in best_hyperparameters.items()
        if key in accepted and key != 'input_size'
    }

    # Initialize model, criterion, and optimizer. Every column except the
    # label is treated as an input feature.
    model = AKINet(
        input_size=train_data.shape[1] - 1,
        **model_kwargs,
    ).to(device)
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=best_hyperparameters['learning_rate'])
    threshold = best_hyperparameters['threshold']

    # Training loop with early stopping.
    best_val_f3 = 0
    best_state = None
    patience_counter = 0
    patience = 20
    max_epochs = 500

    for _epoch in range(max_epochs):
        # Training
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            # reshape(-1) instead of squeeze(): squeeze() collapses a final
            # batch of size 1 to a 0-d tensor, which no longer matches the
            # target shape.
            preds = model(x_batch).reshape(-1)
            loss = criterion(preds, y_batch.reshape(-1))
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                preds = model(x_batch).reshape(-1)
                val_preds.extend((preds > threshold).cpu().numpy())
                val_labels.extend(y_batch.reshape(-1).cpu().numpy())

        # Early stopping on the validation F3 score.
        val_f3 = fbeta_score(val_labels, val_preds, beta=3)
        if val_f3 > best_val_f3:
            # The score improved: remember and persist these weights.
            best_val_f3 = val_f3
            patience_counter = 0
            best_state = copy.deepcopy(model.state_dict())
            # NOTE(review): the first argument of the original save call was
            # lost; saving the state_dict is assumed. If the consumer of
            # docker_model.pkl expects a pickled module, save `model` instead.
            torch.save(best_state, "docker_model.pkl")  # Save the best model
        else:
            patience_counter += 1

        if patience_counter >= patience:
            # No improvement for `patience` consecutive epochs.
            print("Early stopping triggered.")
            break

    # Hand back the weights that were saved, not those of the last epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    print(f"Final Validation F3 Score on Combined Data: {best_val_f3:.4f}")
    return model

if __name__ == '__main__':
    # Script entry point: load the tuned hyperparameters, build the combined
    # dataset, then train and save the final model.

    # Train on the GPU when one is available, otherwise on the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Hyperparameters chosen during the tuning phase.
    best_hyperparameters = load_saved_parameters()

    # Merge and preprocess both CSV files into one training table.
    print("Combining and preprocessing train.csv and test.csv...")
    data = full_data_preprocessor('training.csv', 'test.csv')

    # Fit the final model; the best checkpoint is written during training.
    print("Training the final model on the combined dataset...")
    train_final_model_on_combined_data(data, best_hyperparameters, device)

    print("Final model saved as 'docker_model.pkl' with normalization constants.")