Add comments to the code

c4b6adf7 · OnurZa · e1102ad8 · c4b6adf7
Commit c4b6adf7 authored 1 month ago by OnurZa
--- a/model.py
+++ b/model.py
@@ -9,31 +9,44 @@ import xgboost as xbg


 def preprocess_data(data):
-    # Always get the last column for each row
+    """
+    Preprocess the input data.
+    Extract the last creatinine result
+    One-hot encode the "sex" column
+    Calculate the average creatinine result excluding the last one
+    Calculate the minimum creatinine result
+    """
+
+    # Identify all the creatinine result columns
    createnine_result_columns = [col for col in data.columns if col.startswith("creatinine_result_")]

+    # Extract the last creatinine result
    data["last_createnine_result"] = data[createnine_result_columns].apply(
        lambda x: x.dropna().iloc[-1] if not x.dropna().empty else None, axis=1)


-    # One-hot encode the sex column
+    # One-hot encode the "sex" column
    data["sex_encoded"] = data['sex'].map({'f': 0, 'm': 1})

+
+    # Calculate the average creatinine result excluding the last one
    def average_excluding_last(row):
        values = row.dropna()
        if len(values) > 1:
            return values[:-1].astype(float).mean()
        else:
-                return 0
+            return 0
    
    data["creatinine_avg_excluding_last"] = data[createnine_result_columns].apply(average_excluding_last, axis=1)
    
+    # Calculate the minimum creatinine result
    def calculate_min_value(row):
         values = row.dropna().astype(float)
         return values.min() if not values.empty else 0
    
    data["creatinine_min"] = data[createnine_result_columns].apply(calculate_min_value, axis=1) 
    
+    # Define the columns to select based on whether "aki" is present in the data
    if "aki" not in data.columns:
        columns_to_select = ["age", "last_createnine_result", "sex_encoded",
                    "creatinine_avg_excluding_last", "creatinine_min"]
@@ -41,9 +54,7 @@ def preprocess_data(data):
        columns_to_select = ["age", "last_createnine_result", "sex_encoded",
                    "creatinine_avg_excluding_last", "creatinine_min", "aki"]
        
-    
-
-
+    # Select and return the columns
    selected_columns = data[columns_to_select]
    return selected_columns

@@ -55,24 +66,28 @@ def main():
    parser.add_argument("--output", default="aki.csv")
    flags = parser.parse_args()

+    # Read the train and test data
    training_data = pd.read_csv("data/training.csv")
-
    test_data = pd.read_csv(flags.input)
+
+    # Open the output file and write the CSV header row
    w = csv.writer(open(flags.output, "w"))
    w.writerow(("aki",))
+
+    # Preprocess training and testing data
    training_data = preprocess_data(training_data)
    testing_data = preprocess_data(test_data)

-
-    
+    # Prepare the training features and labels
    X_train = training_data.drop(columns=["aki"])
    target = "aki"
    y_train = training_data[target].map({'y': 1, 'n': 0})

+    # Use XGBoost to train the model
    model = xbg.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    model.fit(X_train, y_train)

-
+    # Handle test data based on whether "aki" is present
    if "aki" in testing_data.columns:
        testing_data = testing_data.dropna(subset=["aki"])
        y_test = testing_data["aki"].map({'y': 1, 'n': 0}).values
@@ -81,6 +96,7 @@ def main():
        y_test = None
        X_test = testing_data

+    # Predict AKI outcomes
    y_pred = model.predict(X_test)
    for i in y_pred:
        if i == 0:
@@ -93,7 +109,6 @@ def main():
    f3_score = fbeta_score(y_test, y_pred, beta=3)
    print("F3 score: ", f3_score)

-        #w.writerow((random.choice(["y", "n"]),))

 if __name__ == "__main__":
    main()
\ No newline at end of file