Skip to content
Snippets Groups Projects
Commit 74bad1aa authored by sm1524's avatar sm1524
Browse files

Delete model_hyper_param_tuning.py

parent b25b85a1
No related branches found
No related tags found
No related merge requests found
"""
This script performs data preprocessing, model training, hyperparameter optimization,
and evaluation for Acute Kidney Injury (AKI) prediction using a neural network.
**Main Components:**
1. **Preprocessing**: Converts raw data into a format suitable for model training. Includes:
- Handling missing values.
- Calculating statistical features (mean, median, std deviation, etc.).
- Normalizing the data.
- Balancing class distribution (oversampling the minority class). This helps training
achieve better accuracy and a better F3 score.
2. **Model Training**: Defines a neural network (AKINet) and trains it using cross-validation
and hyperparameter optimization (via Optuna).
3. **Hyperparameter Optimization**:
- Explores the best configuration of hidden layers, learning rate, batch size, and thresholds.
- Uses 5-fold cross-validation for robust evaluation.
4. **Splitting Data**:
- Splits the dataset into train/test subsets: 80% for training and 20% for testing.
- 5-fold cross-validation further splits the training data into folds, where 80% is used for actual training
and 20% for validation. The validation fold is used to assess whether the model is still improving with
more epochs, so that early stopping can find the point at which the model generalizes best. This is repeated
across all folds for a given hyperparameter set, and the F3 score averaged over the folds is the value that is
optimized. Accuracy and F3 score are additionally reported on the held-out test set.
5. **Outputs**:
- Saves the best hyperparameters and normalization constants as JSON files.
- Outputs the trained model.
"""
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import fbeta_score, accuracy_score
import optuna
import numpy as np
import pandas as pd
import json
def save_dict_to_json(dictionary, filename):
    """
    Write a dictionary to disk as JSON indented by four spaces.

    Args:
        dictionary (dict): JSON-serialisable mapping to persist.
        filename (str): Destination path; the file is opened in write mode,
            so existing content is replaced.
    """
    with open(filename, mode='w') as handle:
        json.dump(dictionary, fp=handle, indent=4)
def _creatinine_features(row, date_columns, result_columns, window_days=365):
    """
    Derive the creatinine features for a single patient row.

    Args:
        row (pd.Series): One row of the raw dataframe (dates already parsed).
        date_columns (list[str]): Names of the test-date columns.
        result_columns (list[str]): Names of the test-result columns, in the
            same order as ``date_columns``.
        window_days (int): Previous tests older than this many days before
            the latest test are ignored.

    Returns:
        tuple | None: ``(latest_value, median_previous, mean_previous,
        std_dev_previous, abs_percentage_diff)``, or ``None`` when the row has
        no test with both a valid date and a result.
    """
    # Keep only tests that have both a parsable date and a recorded result.
    tests = [
        (row[date_col], float(row[result_col]))
        for date_col, result_col in zip(date_columns, result_columns)
        if pd.notna(row[date_col]) and pd.notna(row[result_col])
    ]
    if not tests:
        return None

    # max() returns the first of several equally recent tests, so a later
    # test on the same date counts as a "previous" value.
    latest_index = max(range(len(tests)), key=lambda i: tests[i][0])
    latest_date, latest_value = tests[latest_index]

    # The window is measured from the final latest date, so every earlier test
    # is judged against the same reference point regardless of column order.
    previous_values = [
        value
        for index, (test_date, value) in enumerate(tests)
        if index != latest_index and (latest_date - test_date).days <= window_days
    ]

    if previous_values:
        previous = pd.Series(previous_values)
        median_previous = float(previous.median())
        mean_previous = float(previous.mean())
        std_dev_previous = float(previous.std(ddof=0))
    else:
        # No history: the latest value stands in for the baseline.
        median_previous = latest_value
        mean_previous = latest_value
        std_dev_previous = 0.0

    if mean_previous == 0:
        # Relative change is undefined for a zero baseline; use 0.0 so that no
        # inf/NaN leaks into the normalisation step.
        abs_percentage_diff = 0.0
    else:
        abs_percentage_diff = abs((latest_value - mean_previous) / mean_previous)

    return latest_value, median_previous, mean_previous, std_dev_previous, abs_percentage_diff


def preprocessor(filename):
    """
    Preprocess the dataset by handling missing values, calculating statistical features,
    normalizing data, and balancing classes.

    Rows without any usable creatinine result (valid date and value) are
    skipped. Columns whose standard deviation is zero or undefined are
    normalized with a standard deviation of 1.0, and that value is what is
    stored in the returned constants.

    NOTE(review): normalization and oversampling are applied to the whole
    file, before the train/test split done in run_optuna_with_fixed_test, so
    oversampled copies of a patient can land on both sides of that split.
    Confirm whether this is intended.

    Args:
        filename (str): Path to the CSV file.

    Returns:
        tuple: Processed dataframe and normalization constants
        (``{column: {'mean': float, 'std': float}}``).
    """
    # Load the dataset
    df = pd.read_csv(filename)

    # Identify columns with creatinine test dates and results
    date_columns = [col for col in df.columns if 'creatinine_date_' in col]
    result_columns = [col for col in df.columns if 'creatinine_result_' in col]

    # Convert date columns to datetime format; invalid dates become NaT
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

    new_rows = []
    skipped_rows = 0
    for _, row in df.iterrows():
        features = _creatinine_features(row, date_columns, result_columns)
        if features is None:
            skipped_rows += 1
            continue
        sex = 1 if str(row['sex']).lower() == 'm' else 0  # Convert 'sex' to binary
        aki = 1 if str(row['aki']).lower() == 'y' else 0  # Convert 'aki' to binary
        new_rows.append([row['age'], sex, aki, *features])
    if skipped_rows:
        print(f"Skipped {skipped_rows} row(s) without any usable creatinine result.")

    # Create a new DataFrame with processed features
    new_df = pd.DataFrame(new_rows, columns=[
        'age', 'sex', 'aki', 'latest_creatinine_value',
        'median_previous', 'mean_previous', 'std_dev_previous',
        'abs_percentage_diff'
    ])

    numeric_columns = [
        'age', 'latest_creatinine_value', 'median_previous',
        'mean_previous', 'std_dev_previous', 'abs_percentage_diff'
    ]

    # Normalize numeric features and save normalization constants
    normalization_constants = {}
    for col in numeric_columns:
        mean = float(new_df[col].mean())
        std = float(new_df[col].std())
        if not np.isfinite(std) or std == 0.0:
            # A constant (or single-row) column would otherwise divide by
            # zero/NaN and turn the whole feature into NaN.
            std = 1.0
        normalization_constants[col] = {'mean': mean, 'std': std}
        new_df[col] = (new_df[col] - mean) / std

    # Handle class imbalance via oversampling and Gaussian noise addition.
    aki_counts = new_df['aki'].value_counts()
    print(f"Class counts: {aki_counts}")
    imbalance_ratio = aki_counts.min() / aki_counts.max()
    if imbalance_ratio < 0.5:
        print("Class imbalance detected. Oversampling the minority class...")
        minority_class = new_df[new_df['aki'] == aki_counts.idxmin()]
        num_samples = int(aki_counts.max() - aki_counts.min())
        oversampled_minority = minority_class.sample(
            n=num_samples, replace=True, random_state=42
        ).copy()
        # Add Gaussian noise so the duplicates are not exact copies. The
        # generator is seeded like the sampling above so that repeated runs
        # produce the same dataset.
        rng = np.random.default_rng(42)
        noise = rng.normal(0, 0.5, size=oversampled_minority[numeric_columns].shape)
        oversampled_minority[numeric_columns] = oversampled_minority[numeric_columns] + noise
        new_df = pd.concat([new_df, oversampled_minority], ignore_index=True)

    return new_df, normalization_constants
# Dataset Class
class AKIDataset(Dataset):
    """
    Map-style PyTorch dataset wrapping a dataframe of AKI features and labels.
    """

    def __init__(self, data):
        """
        Split the dataframe into a feature matrix and a label vector.

        Args:
            data (pd.DataFrame): Dataframe with an 'aki' label column; every
                other column is treated as a feature.
        """
        self.data = data
        # Everything except the label column is a model input.
        self.features = data.drop(columns=['aki']).to_numpy()
        self.labels = data['aki'].to_numpy()

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the (features, label) pair at ``idx`` as float32 tensors."""
        feature_tensor = torch.tensor(self.features[idx], dtype=torch.float32)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.float32)
        return feature_tensor, label_tensor
# Neural Network Class
class AKINet(nn.Module):
    """
    Neural network for AKI prediction.

    A stack of ``Linear -> ReLU`` hidden layers followed by a single-unit
    ``Linear -> Sigmoid`` output, so the forward pass returns one value in
    [0, 1] per input row.
    """

    def __init__(self, input_size, num_hidden_layers, hidden_layer_size):
        """
        Initialize the network structure.

        Args:
            input_size (int): Number of input features.
            num_hidden_layers (int): Number of hidden layers. With 0 the
                network is a single linear layer followed by a sigmoid.
            hidden_layer_size (int): Size of each hidden layer.
        """
        super().__init__()
        layers = []
        # Track the width feeding the next layer explicitly so the output
        # layer is sized correctly even when there are no hidden layers.
        in_features = input_size
        for _ in range(num_hidden_layers):
            layers.append(nn.Linear(in_features, hidden_layer_size))
            layers.append(nn.ReLU())
            in_features = hidden_layer_size
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid outputs."""
        return self.net(x)
# Training and Validation Function
def train_and_validate(model, train_loader, val_loader, criterion, optimizer, device, threshold, patience, max_epochs):
    """
    Train and validate the model with early stopping on the validation F3 score.

    After every epoch the model is scored on the validation loader. Training
    stops once the F3 score has failed to improve for ``patience`` consecutive
    epochs or ``max_epochs`` is reached. When at least one epoch improved the
    score, the weights of the best epoch are loaded back into ``model`` before
    returning, so the model matches the reported F3 score.

    Args:
        model (nn.Module): Neural network model.
        train_loader (DataLoader): DataLoader for training data.
        val_loader (DataLoader): DataLoader for validation data.
        criterion (Loss): Loss function.
        optimizer (Optimizer): Optimizer.
        device (torch.device): Device to run the training on.
        threshold (float): Classification threshold.
        patience (int): Patience for early stopping.
        max_epochs (int): Maximum number of epochs.

    Returns:
        tuple: Best validation F3 score and the validation accuracy of that
        same epoch. If no epoch ever improved the F3 score, the accuracy of
        the last epoch run is returned (0.0 when no epoch was run).
    """
    import copy  # local import: only needed to snapshot the best weights

    model.to(device)
    best_val_f3 = 0
    best_val_acc = None
    best_state = None
    val_acc = 0.0
    patience_counter = 0
    for epoch in range(max_epochs):
        # Training
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            # reshape(-1) keeps a batch dimension even for a single-sample
            # batch; a bare squeeze() would yield a 0-d tensor there.
            preds = model(x_batch).reshape(-1)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_preds, val_labels = [], []
        val_loss = 0
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                preds = model(x_batch).reshape(-1)
                val_loss += criterion(preds, y_batch).item()
                val_preds.extend((preds > threshold).cpu().numpy())
                val_labels.extend(y_batch.cpu().numpy())

        # Metrics
        val_f3 = fbeta_score(val_labels, val_preds, beta=3)
        val_acc = accuracy_score(val_labels, val_preds)
        # The validation metrics and the summed validation loss are printed for monitoring
        print(f"Epoch {epoch + 1}/{max_epochs} - Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F3: {val_f3:.4f}")

        # Early Stopping
        if val_f3 > best_val_f3:
            # New best epoch: remember its score, accuracy and weights.
            best_val_f3 = val_f3
            best_val_acc = val_acc
            best_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:  # no improvement for `patience` epochs in a row
                print("Early stopping triggered.")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_f3, (val_acc if best_val_acc is None else best_val_acc)
# Modified Optuna Objective Function for Cross-Validation
def _evaluate_f3_and_accuracy(model, loader, device, threshold):
    """Return the (F3 score, accuracy) of ``model`` on ``loader`` at ``threshold``."""
    model.eval()
    predictions, labels = [], []
    with torch.no_grad():
        for x_batch, y_batch in loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            # reshape(-1) keeps a batch dimension even for a single-sample
            # batch; a bare squeeze() would yield a 0-d tensor there.
            preds = model(x_batch).reshape(-1)
            predictions.extend((preds > threshold).cpu().numpy())
            labels.extend(y_batch.cpu().numpy())
    return fbeta_score(labels, predictions, beta=3), accuracy_score(labels, predictions)


def objective(trial, train_data, test_data, device):
    """
    Optuna objective: score one hyperparameter set with 5-fold cross-validation.

    A fresh AKINet is trained on each stratified fold with early stopping, and
    the best validation F3 score of every fold is collected. The mean of those
    scores is the value Optuna optimizes. The model trained on the final fold
    is additionally evaluated on ``test_data``; those test metrics are only
    printed and stored as trial user attributes and do not influence the
    optimization.

    Args:
        trial (optuna.trial.Trial): Trial used to sample the hyperparameters.
        train_data (pd.DataFrame): Training data with an 'aki' label column.
        test_data (pd.DataFrame): Held-out test data with the same columns.
        device (torch.device): Device to run training and evaluation on.

    Returns:
        float: Average of the per-fold best validation F3 scores.
    """
    # Hyperparameters to search over
    num_hidden_layers = trial.suggest_int("num_hidden_layers", 1, 5)
    hidden_layer_size = trial.suggest_categorical("hidden_layer_size", [32, 64, 128, 256, 512, 1024])
    learning_rate = trial.suggest_categorical("learning_rate", [0.001, 0.005, 0.01, 0.02, 0.05])
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256, 512, 1024])
    threshold = trial.suggest_categorical("threshold", [0.25, 0.375, 0.5, 0.625, 0.75])

    # 5-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f3_scores = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(train_data, train_data['aki'])):
        # Each fold is again an 80/20 split of the training data
        train_fold = train_data.iloc[train_idx]
        val_fold = train_data.iloc[val_idx]

        # Create DataLoaders
        train_loader = DataLoader(AKIDataset(train_fold), batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(AKIDataset(val_fold), batch_size=batch_size, shuffle=False)

        # Initialize model, criterion, and optimizer; the 'aki' column is the
        # label, hence one fewer input than there are columns.
        model = AKINet(input_size=train_fold.shape[1] - 1, num_hidden_layers=num_hidden_layers, hidden_layer_size=hidden_layer_size)
        criterion = nn.BCELoss()
        optimizer = Adam(model.parameters(), lr=learning_rate)

        # Train and validate
        best_f3, _ = train_and_validate(model, train_loader, val_loader, criterion, optimizer, device, threshold, patience=20, max_epochs=500)
        f3_scores.append(best_f3)

    average_f3 = float(np.mean(f3_scores))

    # Evaluate on the fixed test set; `model` here is the one from the last fold.
    test_loader = DataLoader(AKIDataset(test_data), batch_size=batch_size, shuffle=False)
    test_f3, test_acc = _evaluate_f3_and_accuracy(model, test_loader, device, threshold)

    # Print results for the current trial
    print(f"Trial: {trial.number}, Avg F3 (CV): {average_f3:.4f}, Test F3: {test_f3:.4f}, Test Accuracy: {test_acc:.4f}")

    # Use the average F3 score across folds for Optuna optimization
    trial.set_user_attr("average_f3", average_f3)
    trial.set_user_attr("test_f3", test_f3)
    trial.set_user_attr("test_accuracy", test_acc)
    return average_f3
# Run Optuna with Fixed Train-Test Split
def run_optuna_with_fixed_test(data, n_trials=10):
    """
    Tune hyperparameters with Optuna against a fixed train/test split.

    Args:
        data (pd.DataFrame): Preprocessed data containing an 'aki' column.
        n_trials (int): Number of optimization trials.

    Returns:
        optuna.trial.FrozenTrial: Best trial object.
    """
    # Hold out a stratified 20% of the rows as the fixed test set
    train_data, test_data = train_test_split(
        data,
        test_size=0.2,
        stratify=data['aki'],
        random_state=42,
    )

    # Prefer the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def run_trial(trial):
        # Bind the fixed split and device so Optuna only supplies the trial.
        return objective(trial, train_data, test_data, device)

    # The study maximizes the objective, i.e. the cross-validated F3 score
    study = optuna.create_study(direction="maximize", study_name="AKI Optimization")
    study.optimize(run_trial, n_trials=n_trials)

    # Report the winning configuration
    winner = study.best_trial
    print("\nBest trial:")
    print(f"Hyperparameters: {winner.params}")
    print(f"Average F3 Score (CV): {winner.value:.4f}")
    print(f"Test F3 Score: {winner.user_attrs['test_f3']:.4f}")
    print(f"Test Accuracy: {winner.user_attrs['test_accuracy']:.4f}")
    return winner
if __name__ == '__main__':
    # Step 1: turn the raw CSV into normalized, class-balanced features
    print("Preprocessing the dataset...")
    processed_data, normalization_constants = preprocessor('training.csv')
    print("Dataset preprocessing completed.")

    # Step 2: persist the per-column mean/std used for normalization
    save_dict_to_json(normalization_constants, 'normalization_constants.json')
    print("Normalization constants saved to 'normalization_constants.json'.")

    # Step 3: hyperparameter search
    print("Starting hyperparameter optimization with Optuna...")
    best_trial = run_optuna_with_fixed_test(processed_data, n_trials=10)

    # Step 4: store the winning hyperparameters together with their scores
    best_hyperparameters = best_trial.params
    best_hyperparameters.update(
        average_f3=best_trial.value,
        test_f3=best_trial.user_attrs['test_f3'],
        test_accuracy=best_trial.user_attrs['test_accuracy'],
    )
    save_dict_to_json(best_hyperparameters, 'best_hyperparameters.json')
    print("Best hyperparameters saved to 'best_hyperparameters.json'.")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment