# Final_Model_Training.py
"""
This script preprocesses and combines training and testing datasets, trains the final model using the best hyperparameters, and saves the trained model and normalization constants.
**Main Components:**
1. **Data Preprocessing**:
- Both `training.csv` and `test.csv` are preprocessed similarly to the initial preprocessing in the hyperparameter tuning phase.
- Features such as mean, median, standard deviation, and absolute percentage difference are calculated.
- The datasets are concatenated and normalized using computed normalization constants, which are saved as `final_normalization_constants.json`.
2. **Oversampling**:
- The minority class in the combined dataset is oversampled to balance class distribution.
- Gaussian noise is added to the oversampled data to improve generalization.
3. **Model Training**:
- The final model is trained on the combined dataset using the best hyperparameters obtained during hyperparameter tuning.
- A 90% train and 10% validation split is used for early stopping.
- The model with the best validation F3 score is saved as `docker_model.pkl`.
**Outputs**:
- Saves the final normalization constants and the trained model.
"""
from model_hyper_param_tuning import AKIDataset, AKINet
import json
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import fbeta_score
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.optim import Adam
import numpy as np
def load_saved_parameters():
"""
Load the best hyperparameters saved during the tuning phase.
Returns:
dict: The best hyperparameters.
"""
with open('best_hyperparameters.json', 'r') as f:
best_hyperparameters = json.load(f)
return best_hyperparameters
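# For reference, `best_hyperparameters.json` is expected to contain the keys used
# throughout this script; the values below are illustrative only (the real ones
# come from the tuning run):
#     {"batch_size": 64, "num_hidden_layers": 2, "hidden_layer_size": 32,
#      "learning_rate": 0.001, "threshold": 0.5}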
def _preprocess_file(filename):
    """
    Preprocess a single raw dataset into per-patient feature rows.
    Args:
        filename (str): Path to a raw dataset (e.g., training.csv or test.csv).
    Returns:
        pd.DataFrame: Preprocessed dataset.
    """
    # Load the dataset
    df = pd.read_csv(filename)
    # Identify columns with creatinine test dates and results
    date_columns = [col for col in df.columns if 'creatinine_date_' in col]
    result_columns = [col for col in df.columns if 'creatinine_result_' in col]
    # Convert date columns to datetime format
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')  # Handle invalid dates as NaT
    new_rows = []
    for _, row in df.iterrows():
        age = row['age']
        sex = 1 if str(row['sex']).lower() == 'm' else 0  # Convert 'sex' to binary
        aki = 1 if str(row['aki']).lower() == 'y' else 0  # Convert 'aki' to binary
        latest_date = None
        latest_value = None
        previous_values = []
        # Track the most recent test as the "latest" result; every other test
        # taken within 365 days of the latest is collected as a previous value
        for date_col, result_col in zip(date_columns, result_columns):
            if pd.notna(row[date_col]):
                if latest_date is None or row[date_col] > latest_date:
                    # A newer test was found: demote the old latest into the
                    # history if it falls within 365 days of the newer test
                    if latest_value is not None and (row[date_col] - latest_date).days <= 365:
                        previous_values.append(latest_value)
                    latest_date = row[date_col]
                    latest_value = row[result_col]
                elif (latest_date - row[date_col]).days <= 365:
                    previous_values.append(row[result_col])
        # Calculate statistical features (mean, median, standard deviation) of previous values
        if previous_values:
            median_previous = float(pd.Series(previous_values).median())
            mean_previous = float(pd.Series(previous_values).mean())
            std_dev_previous = float(pd.Series(previous_values).std(ddof=0))
        else:
            median_previous = latest_value
            mean_previous = latest_value
            std_dev_previous = 0.0
        # Compute the absolute percentage difference; guard against a zero (or
        # missing) mean, in which case the difference is defined as 0.0
        abs_percentage_diff = abs((latest_value - mean_previous) / mean_previous) if mean_previous else 0.0
        # Append the processed row
        new_rows.append(
            [age, sex, aki, latest_value, median_previous, mean_previous, std_dev_previous, abs_percentage_diff])
    # Create a new DataFrame with processed features
    return pd.DataFrame(new_rows, columns=['age', 'sex', 'aki', 'latest_creatinine_value',
                                           'median_previous', 'mean_previous', 'std_dev_previous',
                                           'abs_percentage_diff'])
def full_data_preprocessor(filename1, filename2):
    """
    Preprocess and combine two datasets.
    Args:
        filename1 (str): Path to the first dataset (e.g., training.csv).
        filename2 (str): Path to the second dataset (e.g., test.csv).
    Returns:
        pd.DataFrame: Combined, preprocessed dataset.
    """
    # Preprocess both datasets identically, then combine them into one large training dataset
    df_final = pd.concat([_preprocess_file(filename1), _preprocess_file(filename2)],
                         ignore_index=True)
# Normalize numeric features and save normalization constants
normalization_constants = {}
for col in ['age', 'latest_creatinine_value', 'median_previous', 'mean_previous', 'std_dev_previous',
'abs_percentage_diff']:
mean = df_final[col].mean()
std = df_final[col].std()
normalization_constants[col] = {'mean': mean, 'std': std}
df_final[col] = (df_final[col] - mean) / std
    # Save the normalization constants so that any future test set can be
    # normalized with the same statistics
with open('final_normalization_constants.json', 'w') as f:
json.dump(normalization_constants, f)
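    # For reference, an inference script could re-apply these constants like so
    # (illustrative sketch; `test_df` is a hypothetical preprocessed frame):
    #     with open('final_normalization_constants.json') as f:
    #         constants = json.load(f)
    #     for col, c in constants.items():
    #         test_df[col] = (test_df[col] - c['mean']) / c['std']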
# Handle class imbalance via oversampling and Gaussian noise addition.
aki_counts = df_final['aki'].value_counts()
print(f"Class counts: {aki_counts}")
imbalance_ratio = aki_counts.min() / aki_counts.max()
if imbalance_ratio < 0.5:
print("Class imbalance detected. Oversampling the minority class...")
minority_class = df_final[df_final['aki'] == aki_counts.idxmin()]
num_samples = aki_counts.max() - aki_counts.min()
oversampled_minority = minority_class.sample(n=num_samples, replace=True, random_state=42)
        # Add Gaussian noise so the oversampled rows are not exact duplicates, helping the model generalize
numeric_columns = ['age', 'latest_creatinine_value', 'median_previous', 'mean_previous', 'std_dev_previous',
'abs_percentage_diff']
noise = np.random.normal(0, 0.5, size=oversampled_minority[numeric_columns].shape)
oversampled_minority[numeric_columns] += noise
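        # (The noise is drawn after z-score normalization, so sigma=0.5 is half
        # a standard deviation in normalized feature units.)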
df_final = pd.concat([df_final, oversampled_minority], ignore_index=True)
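        # After this step both classes have equal counts, since exactly
        # (max - min) minority rows were added.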
return df_final
def train_final_model_on_combined_data(data, best_hyperparameters, device):
"""
Train the final model on the combined dataset.
Args:
data (pd.DataFrame): Combined and preprocessed dataset.
best_hyperparameters (dict): Best hyperparameters from tuning.
device (torch.device): Device for training.
Returns:
nn.Module: Trained model.
"""
# Split data into training and validation sets
train_data, val_data = train_test_split(
data, test_size=0.1, stratify=data['aki'], random_state=42
) # 90% train, 10% validation
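    # stratify=data['aki'] preserves the AKI/non-AKI class ratio in both splits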
# Prepare DataLoaders
batch_size = best_hyperparameters['batch_size']
train_loader = DataLoader(
AKIDataset(train_data), batch_size=batch_size, shuffle=True
)
val_loader = DataLoader(
AKIDataset(val_data), batch_size=batch_size, shuffle=False
)
# Initialize model, criterion, and optimizer
model = AKINet(
input_size=train_data.shape[1] - 1,
num_hidden_layers=best_hyperparameters['num_hidden_layers'],
hidden_layer_size=best_hyperparameters['hidden_layer_size']
)
model.to(device)
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=best_hyperparameters['learning_rate'])
# Training loop with early stopping
best_val_f3 = 0
patience_counter = 0
patience = 20
max_epochs = 500
for epoch in range(max_epochs):
# Training
model.train()
for x_batch, y_batch in train_loader:
x_batch, y_batch = x_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
            preds = model(x_batch).squeeze(-1)  # squeeze only the output dim so a batch of size 1 keeps its batch dim
loss = criterion(preds, y_batch)
loss.backward()
optimizer.step()
# Validation
model.eval()
val_preds, val_labels = [], []
with torch.no_grad():
for x_batch, y_batch in val_loader:
x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                preds = model(x_batch).squeeze(-1)
val_preds.extend((preds > best_hyperparameters['threshold']).cpu().numpy())
val_labels.extend(y_batch.cpu().numpy())
# Early Stopping
# Calculate metrics on the validation set for F3 score
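        # (beta=3 treats recall as three times as important as precision, so missed AKI cases are penalized most)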
val_f3 = fbeta_score(val_labels, val_preds, beta=3)
if val_f3 > best_val_f3: # if the validation F3 score improves across epochs the training continues
best_val_f3 = val_f3
patience_counter = 0
torch.save(model.state_dict(), "docker_model.pkl") # Save the best model
else:
patience_counter += 1
if patience_counter >= patience: # if the validation F3 score does not improve for 20 epochs the training stops
print("Early stopping triggered.")
break
print(f"Final Validation F3 Score on Combined Data: {best_val_f3:.4f}")
return model
if __name__ == '__main__':
# Load best hyperparameters
best_hyperparameters = load_saved_parameters()
    # Concatenate and preprocess training.csv and test.csv
    print("Combining and preprocessing training.csv and test.csv...")
data = full_data_preprocessor('training.csv', 'test.csv')
# Train the final model
print("Training the final model on the combined dataset...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_final_model_on_combined_data(data, best_hyperparameters, device)
print("Final model saved as 'docker_model.pkl' with normalization constants.")