Add README, justify model selection, add type checking and exception handling.

69763092 · Albus L. · cc118185 · 69763092 · 69763092 · 69763092
Commit 69763092 authored 1 month ago by Albus L.
--- a/README.md
+++ b/README.md
-SWEMLS_CW1
+# SWEMLS_CW1: Training an AKI Prediction Model
+
+## Overview
+This project implements a machine learning pipeline to predict Acute Kidney Injury (AKI) from patient data using scikit-learn. The pipeline includes **feature extraction**, **model training**, and **inference**.
+
+## Features
+1. Feature extraction from tabular data.
+2. Model training using a StackingClassifier with various ensemble models.
+3. Inference and prediction export for unseen test datasets.
+4. Evaluation metrics (F3 score and accuracy) to assess model performance.
+
+## Model Selection
+The following models were considered, with these results:
+- **RandomForestClassifier**: 0.9740
+- **AdaBoostClassifier**: 0.9789
+- **ExtraTreesClassifier**: 0.9832
+- **HistGradientBoostingClassifier**: 0.9796
+
+The final model, a **StackingClassifier**, combines the strengths of:
+1. AdaBoostClassifier
+2. ExtraTreesClassifier
+3. HistGradientBoostingClassifier
+
+The final estimator, `SGDClassifier`, is chosen for its simplicity and efficiency. This ensemble approach enhances prediction robustness by leveraging diverse learning strategies.
+
+## Running the Tests
+
 ```
 docker build -t coursework1 .
 docker run -v ${PWD}:/data coursework1
-```
\ No newline at end of file
+```
--- a/aki.csv
+++ b/aki.csv
@@ -1193,7 +1193,7 @@ n
 n
 n
 n
-n
+y
 n
 n
 n
@@ -2290,7 +2290,7 @@ n
 n
 n
 n
-n
+y
 n
 n
 n
@@ -2335,7 +2335,7 @@ n
 n
 n
 y
-n
+y
 n
 n
 n
@@ -3175,7 +3175,7 @@ n
 y
 n
 n
-n
+y
 y
 y
 n
@@ -3431,7 +3431,7 @@ n
 y
 y
 n
-n
+y
 n
 n
 n
@@ -3679,7 +3679,7 @@ y
 n
 n
 n
-n
+y
 n
 n
 n
@@ -3842,7 +3842,7 @@ n
 n
 n
 n
-n
+y
 n
 n
 n
@@ -3933,7 +3933,7 @@ n
 y
 y
 n
-n
+y
 n
 y
 n
@@ -4633,7 +4633,7 @@ n
 n
 y
 y
-n
+y
 n
 n
 n
@@ -5257,7 +5257,7 @@ n
 n
 y
 n
-n
+y
 y
 n
 n
@@ -6764,7 +6764,7 @@ n
 n
 n
 y
-n
+y
 n
 n
 n

--- a/model.py
+++ b/model.py
@@ -2,19 +2,24 @@

 import argparse
 import csv
+import logging
 import numpy as np
 import pandas as pd
 from sklearn.ensemble import (
-    RandomForestClassifier,
+    AdaBoostClassifier,
    ExtraTreesClassifier,
    HistGradientBoostingClassifier,
    StackingClassifier,
 )
 from sklearn.linear_model import SGDClassifier
-from typing import Any
+from typing import Any, Optional
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

 # Global variable for the trained model
-model: Any = None
+seed = 42
+model: Optional[Any] = None

 def main() -> None:
    """
@@ -25,23 +30,33 @@ def main() -> None:
    parser.add_argument("--output", default="aki.csv", help="Path to the output CSV file for predictions.")
    flags = parser.parse_args()

-    # Load and prepare training data
-    train_data = pd.read_csv("data/training.csv")
-    train_labels = (train_data["aki"] == "y").to_numpy()
-    train_features = feature_extraction(train_data)
+    try:
+        # Load and prepare training data
+        logging.info("Loading training data.")
+        train_data = pd.read_csv("data/training.csv")
+        train_labels = (train_data["aki"] == "y").to_numpy()
+        train_features = feature_extraction(train_data)
+
+        # Train the model
+        logging.info("Training the model.")
+        train(train_features, train_labels)

-    # Train the model
-    train(train_features, train_labels)
+        # Load and prepare test data
+        logging.info("Loading test data.")
+        test_data = pd.read_csv(flags.input)
+        test_features = feature_extraction(test_data)

-    # Load and prepare test data
-    test_data = pd.read_csv(flags.input)
-    test_features = feature_extraction(test_data)
+        # Perform inference
+        logging.info("Performing inference.")
+        predictions = infer(test_features)

-    # Perform inference
-    predictions = infer(test_features)
+        # Save predictions to the output file
+        logging.info("Saving predictions.")
+        save_predictions(predictions, flags.output)
+        logging.info("Process completed successfully.")

-    # Save predictions to the output file
-    save_predictions(predictions, flags.output)
+    except Exception as e:
+        logging.error(f"An error occurred: {e}", exc_info=True)

 def feature_extraction(data: pd.DataFrame) -> np.ndarray:
    """
@@ -53,24 +68,27 @@ def feature_extraction(data: pd.DataFrame) -> np.ndarray:
    Returns:
        np.ndarray: Extracted features with shape (n_samples, n_extracted_features).
    """
-    
-    # Drop the "aki" column if it exists
-    data = data.drop(columns="aki", errors="ignore")
-    crea_data = data.iloc[:, 3::2]
-
-    # Extract features
-    features = [
-        data["age"].to_numpy(),  # Age feature
-        (data["sex"] == "f").to_numpy().astype(int),  # Binary encoding for sex
-        np.nanmean(crea_data, axis=1),  # Mean of creatinine values
-        np.nanstd(crea_data, axis=1),  # Standard deviation of creatinine values
-        np.nanmax(crea_data, axis=1),  # Maximum creatinine value
-        np.nanmin(crea_data, axis=1),  # Minimum creatinine value
-        np.nanmedian(crea_data, axis=1),  # Median creatinine value
-    ]
-
-    # Stack features into a single 2D numpy array
-    return np.stack(features, axis=1)
+    try:
+        # Drop the "aki" column if it exists
+        data = data.drop(columns="aki", errors="ignore")
+        crea_data = data.iloc[:, 3::2]
+
+        # Extract features
+        features = [
+            data["age"].to_numpy(),  # Age feature
+            (data["sex"] == "f").to_numpy().astype(int),  # Binary encoding for sex
+            np.nanmean(crea_data, axis=1),  # Mean of creatinine values
+            np.nanstd(crea_data, axis=1),  # Standard deviation of creatinine values
+            np.nanmax(crea_data, axis=1),  # Maximum creatinine value
+            np.nanmin(crea_data, axis=1),  # Minimum creatinine value
+            np.nanmedian(crea_data, axis=1),  # Median creatinine value
+        ]
+
+        # Stack features into a single 2D numpy array
+        return np.stack(features, axis=1)
+    except Exception as e:
+        logging.error("Feature extraction failed.", exc_info=True)
+        raise

 def train(features: np.ndarray, labels: np.ndarray) -> None:
    """
@@ -82,23 +100,22 @@ def train(features: np.ndarray, labels: np.ndarray) -> None:
    """
    global model

-    # model = RandomForestClassifier() # 0.9740932642487047
-    # model = AdaBoostClassifier(n_estimators=100, random_state=42) # 0.9789848043970255
-    # model = ExtraTreesClassifier(n_estimators=100, random_state=42) # 0.9832366215344939
-    # model = HistGradientBoostingClassifier(random_state=42) # 0.9796839729119639
-
-    # Initialize a StackingClassifier with multiple base models and a final estimator
-    model = StackingClassifier(
-        estimators=[
-            ('rf', RandomForestClassifier()),
-            ('et', ExtraTreesClassifier()),
-            ('hgbc', HistGradientBoostingClassifier())
-        ],
-        final_estimator=SGDClassifier()
-    )
-
-    # Train the model
-    model.fit(features, labels)
+    try:
+        # Initialize a StackingClassifier with multiple base models and a final estimator
+        model = StackingClassifier(
+            estimators=[
+                ('rf', AdaBoostClassifier(random_state=seed)),
+                ('et', ExtraTreesClassifier(random_state=seed)),
+                ('hgbc', HistGradientBoostingClassifier(random_state=seed)),
+            ],
+            final_estimator=SGDClassifier(random_state=seed),
+        )
+
+        # Train the model
+        model.fit(features, labels)
+    except Exception as e:
+        logging.error("Model training failed.", exc_info=True)
+        raise

 def infer(features: np.ndarray) -> np.ndarray:
    """
@@ -121,34 +138,42 @@ def save_predictions(predictions: np.ndarray, output_path: str) -> None:
        predictions (np.ndarray): Array of boolean predictions (True for "y", False for "n").
        output_path (str): Path to the output CSV file.
    """
-    with open(output_path, "w", newline="") as csvfile:
-        writer = csv.writer(csvfile)
-        writer.writerow(["aki"])
-        writer.writerows(["y" if p else "n"] for p in predictions)
+    try:
+        with open(output_path, "w", newline="") as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(["aki"])
+            writer.writerows(["y" if p else "n"] for p in predictions)
+    except Exception as e:
+        logging.error("Saving predictions failed.", exc_info=True)
+        raise

 def test() -> None:
    """
    Evaluate the model's predictions against the ground truth in the test dataset.
    Need Commented out when submitting for evaluation.
    """
-    pred = pd.read_csv("data/aki.csv")["aki"].to_numpy() == 'y'
-    act = pd.read_csv("data/test.csv")["aki"].to_numpy() == 'y'
-
-    # Compute confusion matrix components
-    tp = np.sum(np.logical_and(pred, act))
-    fp = np.sum(np.logical_and(pred, np.logical_not(act)))
-    fn = np.sum(np.logical_and(np.logical_not(pred), act))
-    tn = np.sum(np.logical_and(np.logical_not(pred), np.logical_not(act)))
-
-    # Calculate precision, recall, F1 score, and accuracy
-    precision = tp / (tp + fp) if tp + fp > 0 else 0
-    recall = tp / (tp + fn) if tp + fn > 0 else 0
-    f3_score = 2 * precision * recall / (precision + recall)
-    accuracy = (tp + tn) / len(act) if len(act) > 0 else 0
-
-    # Print metrics
-    print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
-    print(f"F3 Score: {f3_score:.6f}")
+    try:
+        pred = pd.read_csv("data/aki.csv")["aki"].to_numpy() == 'y'
+        act = pd.read_csv("data/test.csv")["aki"].to_numpy() == 'y'
+
+        # Compute confusion matrix components
+        tp = np.sum(np.logical_and(pred, act))
+        fp = np.sum(np.logical_and(pred, np.logical_not(act)))
+        fn = np.sum(np.logical_and(np.logical_not(pred), act))
+        tn = np.sum(np.logical_and(np.logical_not(pred), np.logical_not(act)))
+
+        # Calculate precision, recall, F3 score (F-beta with beta = 3), and accuracy; each falls back to 0 when undefined
+        precision = tp / (tp + fp) if tp + fp > 0 else 0
+        recall = tp / (tp + fn) if tp + fn > 0 else 0
+        f3_score = 10 * precision * recall / (9 * precision + recall) if precision + recall > 0 else 0
+        accuracy = (tp + tn) / len(act) if len(act) > 0 else 0
+
+        # Log metrics
+        logging.info(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
+        logging.info(f"F3 Score: {f3_score:.6f}, Accuracy: {accuracy:.6f}")
+    except Exception as e:
+        logging.error("Testing failed.", exc_info=True)
+        raise

 if __name__ == "__main__":
    main()