Skip to content
Snippets Groups Projects
Commit 69763092 authored by Albus L.'s avatar Albus L.
Browse files

Add README, justify model selection, add type checking and exception handling.

parent cc118185
No related branches found
No related tags found
No related merge requests found
SWEMLS_CW1
# SWEMLS_CW1: Training an AKI Prediction Model
## Overview
This project implements a machine learning pipeline to predict Acute Kidney Injury (AKI) from patient data using scikit-learn. The pipeline includes **feature extraction**, **model training**, and **inference**.
## Features
1. Feature extraction from tabular data.
2. Model training using a StackingClassifier with various ensemble models.
3. Inference and prediction export for unseen test datasets.
4. Evaluation metrics such as the F3 score and accuracy to assess model performance.
## Model Selection
The following models were considered; their scores are listed below:
- **RandomForestClassifier**: 0.9740
- **AdaBoostClassifier**: 0.9789
- **ExtraTreesClassifier**: 0.9832
- **HistGradientBoostingClassifier**: 0.9796
The final model, a **StackingClassifier**, combines the strengths of:
1. AdaBoostClassifier
2. ExtraTreesClassifier
3. HistGradientBoostingClassifier
The final estimator, `SGDClassifier`, is chosen for its simplicity and efficiency. This ensemble approach enhances prediction robustness by leveraging diverse learning strategies.
## Running the pipeline
```
docker build -t coursework1 .
docker run -v ${PWD}:/data coursework1
```
\ No newline at end of file
```
......@@ -1193,7 +1193,7 @@ n
n
n
n
n
y
n
n
n
......@@ -2290,7 +2290,7 @@ n
n
n
n
n
y
n
n
n
......@@ -2335,7 +2335,7 @@ n
n
n
y
n
y
n
n
n
......@@ -3175,7 +3175,7 @@ n
y
n
n
n
y
y
y
n
......@@ -3431,7 +3431,7 @@ n
y
y
n
n
y
n
n
n
......@@ -3679,7 +3679,7 @@ y
n
n
n
n
y
n
n
n
......@@ -3842,7 +3842,7 @@ n
n
n
n
n
y
n
n
n
......@@ -3933,7 +3933,7 @@ n
y
y
n
n
y
n
y
n
......@@ -4633,7 +4633,7 @@ n
n
y
y
n
y
n
n
n
......@@ -5257,7 +5257,7 @@ n
n
y
n
n
y
y
n
n
......@@ -6764,7 +6764,7 @@ n
n
n
y
n
y
n
n
n
......
......@@ -2,19 +2,24 @@
import argparse
import csv
import logging
import numpy as np
import pandas as pd
from sklearn.ensemble import (
RandomForestClassifier,
AdaBoostClassifier,
ExtraTreesClassifier,
HistGradientBoostingClassifier,
StackingClassifier,
)
from sklearn.linear_model import SGDClassifier
from typing import Any
from typing import Any, Optional
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Global variable for the trained model
model: Any = None
seed = 42
model: Optional[Any] = None
def main() -> None:
"""
......@@ -25,23 +30,33 @@ def main() -> None:
parser.add_argument("--output", default="aki.csv", help="Path to the output CSV file for predictions.")
flags = parser.parse_args()
# Load and prepare training data
train_data = pd.read_csv("data/training.csv")
train_labels = (train_data["aki"] == "y").to_numpy()
train_features = feature_extraction(train_data)
try:
# Load and prepare training data
logging.info("Loading training data.")
train_data = pd.read_csv("data/training.csv")
train_labels = (train_data["aki"] == "y").to_numpy()
train_features = feature_extraction(train_data)
# Train the model
logging.info("Training the model.")
train(train_features, train_labels)
# Train the model
train(train_features, train_labels)
# Load and prepare test data
logging.info("Loading test data.")
test_data = pd.read_csv(flags.input)
test_features = feature_extraction(test_data)
# Load and prepare test data
test_data = pd.read_csv(flags.input)
test_features = feature_extraction(test_data)
# Perform inference
logging.info("Performing inference.")
predictions = infer(test_features)
# Perform inference
predictions = infer(test_features)
# Save predictions to the output file
logging.info("Saving predictions.")
save_predictions(predictions, flags.output)
logging.info("Process completed successfully.")
# Save predictions to the output file
save_predictions(predictions, flags.output)
except Exception as e:
logging.error(f"An error occurred: {e}", exc_info=True)
def feature_extraction(data: pd.DataFrame) -> np.ndarray:
"""
......@@ -53,24 +68,27 @@ def feature_extraction(data: pd.DataFrame) -> np.ndarray:
Returns:
np.ndarray: Extracted features with shape (n_samples, n_extracted_features).
"""
# Drop the "aki" column if it exists
data = data.drop(columns="aki", errors="ignore")
crea_data = data.iloc[:, 3::2]
# Extract features
features = [
data["age"].to_numpy(), # Age feature
(data["sex"] == "f").to_numpy().astype(int), # Binary encoding for sex
np.nanmean(crea_data, axis=1), # Mean of creatinine values
np.nanstd(crea_data, axis=1), # Standard deviation of creatinine values
np.nanmax(crea_data, axis=1), # Maximum creatinine value
np.nanmin(crea_data, axis=1), # Minimum creatinine value
np.nanmedian(crea_data, axis=1), # Median creatinine value
]
# Stack features into a single 2D numpy array
return np.stack(features, axis=1)
try:
# Drop the "aki" column if it exists
data = data.drop(columns="aki", errors="ignore")
crea_data = data.iloc[:, 3::2]
# Extract features
features = [
data["age"].to_numpy(), # Age feature
(data["sex"] == "f").to_numpy().astype(int), # Binary encoding for sex
np.nanmean(crea_data, axis=1), # Mean of creatinine values
np.nanstd(crea_data, axis=1), # Standard deviation of creatinine values
np.nanmax(crea_data, axis=1), # Maximum creatinine value
np.nanmin(crea_data, axis=1), # Minimum creatinine value
np.nanmedian(crea_data, axis=1), # Median creatinine value
]
# Stack features into a single 2D numpy array
return np.stack(features, axis=1)
except Exception as e:
logging.error("Feature extraction failed.", exc_info=True)
raise
def train(features: np.ndarray, labels: np.ndarray) -> None:
"""
......@@ -82,23 +100,22 @@ def train(features: np.ndarray, labels: np.ndarray) -> None:
"""
global model
# model = RandomForestClassifier() # 0.9740932642487047
# model = AdaBoostClassifier(n_estimators=100, random_state=42) # 0.9789848043970255
# model = ExtraTreesClassifier(n_estimators=100, random_state=42) # 0.9832366215344939
# model = HistGradientBoostingClassifier(random_state=42) # 0.9796839729119639
# Initialize a StackingClassifier with multiple base models and a final estimator
model = StackingClassifier(
estimators=[
('rf', RandomForestClassifier()),
('et', ExtraTreesClassifier()),
('hgbc', HistGradientBoostingClassifier())
],
final_estimator=SGDClassifier()
)
# Train the model
model.fit(features, labels)
try:
# Initialize a StackingClassifier with multiple base models and a final estimator
model = StackingClassifier(
estimators=[
('rf', AdaBoostClassifier(random_state=seed)),
('et', ExtraTreesClassifier(random_state=seed)),
('hgbc', HistGradientBoostingClassifier(random_state=seed)),
],
final_estimator=SGDClassifier(random_state=seed),
)
# Train the model
model.fit(features, labels)
except Exception as e:
logging.error("Model training failed.", exc_info=True)
raise
def infer(features: np.ndarray) -> np.ndarray:
"""
......@@ -121,34 +138,42 @@ def save_predictions(predictions: np.ndarray, output_path: str) -> None:
predictions (np.ndarray): Array of boolean predictions (True for "y", False for "n").
output_path (str): Path to the output CSV file.
"""
with open(output_path, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["aki"])
writer.writerows(["y" if p else "n"] for p in predictions)
try:
with open(output_path, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["aki"])
writer.writerows(["y" if p else "n"] for p in predictions)
except Exception as e:
logging.error("Saving predictions failed.", exc_info=True)
raise
def test() -> None:
    """
    Evaluate the model's predictions against the ground truth in the test dataset.

    Reads the "aki" column of "data/aki.csv" (predictions) and of
    "data/test.csv" (ground truth), treating "y" as the positive class, and
    logs the confusion-matrix counts, the F3 score and the accuracy.

    The call to this function must be commented out when submitting for
    evaluation.

    Raises:
        ValueError: If the prediction and ground-truth files contain a
            different number of rows.
        Exception: Any other error raised while reading the files or
            computing the metrics is logged and re-raised unchanged.
    """
    try:
        pred = pd.read_csv("data/aki.csv")["aki"].to_numpy() == 'y'
        act = pd.read_csv("data/test.csv")["aki"].to_numpy() == 'y'

        # Refuse mismatched inputs explicitly; otherwise a length-1 array
        # would silently broadcast and produce meaningless counts.
        if pred.shape != act.shape:
            raise ValueError(
                f"Prediction count ({len(pred)}) does not match label count ({len(act)})."
            )

        # Compute confusion matrix components
        tp = np.sum(np.logical_and(pred, act))
        fp = np.sum(np.logical_and(pred, np.logical_not(act)))
        fn = np.sum(np.logical_and(np.logical_not(pred), act))
        tn = np.sum(np.logical_and(np.logical_not(pred), np.logical_not(act)))

        # Calculate precision, recall, F3 score, and accuracy
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0

        # F-beta with beta = 3: (1 + 3^2) * P * R / (3^2 * P + R).
        # With no true positives both P and R are 0, so the denominator is 0;
        # report 0 instead of dividing by zero (NaN / ZeroDivisionError).
        denominator = 9 * precision + recall
        f3_score = 10 * precision * recall / denominator if denominator > 0 else 0
        accuracy = (tp + tn) / len(act) if len(act) > 0 else 0

        # Log metrics
        logging.info(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
        logging.info(f"F3 Score: {f3_score:.6f}, Accuracy: {accuracy:.6f}")
    except Exception:
        logging.error("Testing failed.", exc_info=True)
        raise
if __name__ == "__main__":
main()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment