Skip to content
Snippets Groups Projects
Commit 2dd8adfa authored by Asia Belfiore's avatar Asia Belfiore
Browse files

Added test data handling for single patients.

parent 5c58a495
No related branches found
No related tags found
No related merge requests found
No preview for this file type
File added
No preview for this file type
This diff is collapsed.
...@@ -2,20 +2,37 @@ ...@@ -2,20 +2,37 @@
import argparse import argparse
import csv import csv
import random from sklearn.linear_model import SGDClassifier as SGD
import numpy as np
from utils import *
def main(): def main():
parser = argparse.ArgumentParser() try:
parser.add_argument("--input", default="test.csv") parser = argparse.ArgumentParser()
parser.add_argument("--output", default="aki.csv") parser.add_argument("--input", default="test.csv")
flags = parser.parse_args() parser.add_argument("--output", default="aki.csv")
r = csv.reader(open(flags.input)) flags = parser.parse_args()
w = csv.writer(open(flags.output, "w")) r = csv.reader(open(flags.input))
w.writerow(("aki",)) w = csv.writer(open(flags.output, "w"))
next(r) # skip headers w.writerow(("aki",))
for _ in r: next(r) # skip headers
# TODO: Implement a better model
w.writerow((random.choice(["y", "n"]),)) model = SGD(loss="hinge", penalty="l2", max_iter=100)
train_data = prepare_train_data('data/training.csv')
X_train = train_data.loc[:,'sex':'D'] # features
Y_train = train_data.loc[:,'aki'] # target labels
model.fit(X_train.values, Y_train.values) # train SGD model
for row in r:
# extract features from data
patient = prepare_test_data(row)
y_pred = model.predict(np.array(patient).reshape(1, -1))
# convert binary prediction to 'y'/'n' labels
aki_pred = to_label(y_pred[0], 'y', 'n')
w.writerow((aki_pred,))
except Exception as e:
print(f"An error occurred while running the prediction: {e}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()
\ No newline at end of file
numpy==1.26.4
pandas==2.2.2
scikit-learn==1.4.2
\ No newline at end of file
import pandas as pd import pandas as pd
from sklearn.metrics import fbeta_score
def to_binary(label, truth): def to_binary(label, truth):
...@@ -39,7 +38,7 @@ def get_header(row_length, dataset_type): ...@@ -39,7 +38,7 @@ def get_header(row_length, dataset_type):
raise Exception(f"Invalid dataset type: {dataset_type}") raise Exception(f"Invalid dataset type: {dataset_type}")
test_num = (row_length - i) // 2 test_num = (row_length - i) // 2
j = 0 j = 0
while (j <= test_num) and (i < row_length): while (j < test_num) and (i < row_length):
header[i] = f'creatinine_date_{j}' header[i] = f'creatinine_date_{j}'
header[i+1] = f'creatinine_result_{j}' header[i+1] = f'creatinine_result_{j}'
j += 1 j += 1
...@@ -136,22 +135,22 @@ def extract_patient_features(patient, creatinine_columns): ...@@ -136,22 +135,22 @@ def extract_patient_features(patient, creatinine_columns):
last_col = patient[::-1].notnull().idxmax() last_col = patient[::-1].notnull().idxmax()
creatinine_test_dates = patient.loc[creatinine_columns[0]:last_col:2] creatinine_test_dates = patient.loc[creatinine_columns[0]:last_col:2]
creatinine_results = patient.loc[creatinine_columns[1]:last_col:2] creatinine_results = patient.loc[creatinine_columns[1]:last_col:2].astype(float)
sex = to_binary(patient['sex'], 'f') sex = to_binary(patient['sex'], 'f')
age = patient['age'] age = patient['age']
c1 = creatinine_results.iloc[-1] # most recent test result c1 = creatinine_results.iloc[-1] # most recent test result
rv1 = creatinine_results.min() # lowest value rv1 = creatinine_results.min() # lowest value
rv2 = creatinine_results.median() # median value rv2 = creatinine_results.astype(float).median(numeric_only=True) # median value
rv_ratio = calculate_rv_ratio(c1, rv1, rv2, creatinine_test_dates) rv_ratio = calculate_rv_ratio(c1, rv1, rv2, creatinine_test_dates)
(elapsed_time, creatinine_change) = get_change_in_last_two_days(creatinine_test_dates, creatinine_results) (elapsed_time, creatinine_change) = get_change_in_last_two_days(creatinine_test_dates, creatinine_results)
# print(f"Creatinine change in {elapsed_time} = {creatinine_change}") # print(f"Creatinine change in {elapsed_time} = {creatinine_change}")
return [sex, age, c1, rv1, rv2, rv_ratio, creatinine_change] return sex, age, c1, rv1, rv2, rv_ratio, creatinine_change
def process_patient_data(patient_data, creatinine_columns, data_type): def process_patient_data(patient_data, creatinine_columns, data_type):
...@@ -159,87 +158,75 @@ def process_patient_data(patient_data, creatinine_columns, data_type): ...@@ -159,87 +158,75 @@ def process_patient_data(patient_data, creatinine_columns, data_type):
Return the processed patient data Return the processed patient data
""" """
try: try:
patient = extract_patient_features(patient_data, creatinine_columns) sex, age, c1, rv1, rv2, rv_ratio, creatinine_change = extract_patient_features(patient_data, creatinine_columns)
if data_type == 'train': if data_type == 'train':
aki_diagnosis = to_binary(patient_data['aki'], 'y') aki_diagnosis = to_binary(patient_data['aki'], 'y')
return pd.Series((patient, aki_diagnosis)) return pd.Series((sex, age, c1, rv1, rv2, rv_ratio, creatinine_change, aki_diagnosis))
else: else:
return pd.Series([patient]) return pd.Series((sex, age, c1, rv1, rv2, rv_ratio, creatinine_change))
except Exception as e: except Exception as e:
print(f"An error occurred while processing patient data: {e}") print(f"An error occurred while processing patient data: {e}")
return ([], -1) if data_type == 'train' else [] return ([], -1) if data_type == 'train' else []
def prepare_dataset(data_path, data_type): def prepare_train_data(data):
""" """
Return the processed dataset for each patient Return the processed dataset for each patient
""" """
header_len = get_longest_row(data_path) header_len = get_longest_row(data)
header = get_header(header_len, data_type) header = get_header(header_len, 'train')
creatinine_columns = [col for col in header if 'creatinine' in col] creatinine_columns = [col for col in header if 'creatinine' in col]
patient_data = pd.read_csv(data_path, sep=',', names=header, skiprows=1) patient_data = pd.read_csv(data, sep=',', names=header, skiprows=1)
# change every test date column into datetime type # change every test date column into datetime type
for col in creatinine_columns[::2]: for col in creatinine_columns[::2]:
patient_data[col] = pd.to_datetime(patient_data[col]) patient_data[col] = pd.to_datetime(patient_data[col])
formatted_dataset = patient_data.apply(lambda patient_record: process_patient_data(patient_record, creatinine_columns, data_type), axis=1) formatted_dataset = patient_data.apply(lambda patient_record: process_patient_data(patient_record, creatinine_columns, data_type = 'train'), axis=1)
if data_type == 'train': formatted_dataset = pd.DataFrame(formatted_dataset)
formatted_dataset = pd.DataFrame(formatted_dataset) formatted_dataset.rename(columns={0: 'sex',
formatted_dataset.rename(columns={0: 'patient_features', 1: 'age',
1: 'aki_score'}, 2: 'c1',
inplace=True) 3: 'rv1',
else: 4: 'rv2',
formatted_dataset.rename(columns={0: 'patient_features'}, 5: 'rv_ratio',
inplace=True) 6: 'D',
7: 'aki'},
inplace=True)
return formatted_dataset return formatted_dataset
def nhs_aki_algo(patient): def prepare_test_data(data):
""" """
""" Return the processed dataset for each patient
[sex, age, c1, rv1, rv2, rv_ratio, D] = patient """
# (low_ri, high_ri) = get_reference_interval(sex, age) data = [d for d in data if ((d != '') and (d != 'y') and (d != 'n'))]
if rv_ratio >= 1.5:
return 1 header_len = len(data)
elif D > 26: header = get_header(header_len, 'test')
return 1
else: creatinine_columns = [col for col in header if 'creatinine' in col]
return 0
patient_data = {col_i:data_i for col_i, data_i in zip(header, data)}
def get_reference_interval(sex, age): # change every test date column into datetime type
""" for col in creatinine_columns[::2]:
Return the Population Reference Interval (RI) based on patient_data[col] = pd.to_datetime(patient_data[col])
age and sex of patient.
source: resources/annual_conference_2016_-_recognition_of_aki.pdf patient = pd.Series(patient_data)
https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.nwts.nhs.uk/_file/gCDJ8vu46p_275810.pdf&ved=2ahUKEwifj_P0mfuKAxWqXUEAHQ0oKQEQFnoECBcQAQ&usg=AOvVaw01O_TsELBULOw3GvBNEv3p
""" formatted_patient = process_patient_data(patient, creatinine_columns, data_type = 'test')
if age > 16:
if sex == 'm': return (59, 104) formatted_patient.rename({ 0: 'sex',
else: return (45, 84) 1: 'age',
elif age == 16: 2: 'c1',
if sex == 'm': return (54, 99) 3: 'rv1',
else: return (48, 81) 4: 'rv2',
elif age == 15: 5: 'rv_ratio',
if sex == 'm': return (47, 98) 6: 'D',},
else: return (44, 79) inplace=True)
elif age == 14: return formatted_patient
if sex == 'm': return (40, 83)
else: return (43, 75)
elif age == 13:
if sex == 'm': return (38, 76)
else: return (38, 74)
elif age == 12: return (36, 67)
elif age == 11: return (36, 64)
elif 9 <= age < 11: return (28, 57)
elif 7 <= age < 9: return (30, 48)
elif 5 <= age < 7: return (25, 42)
elif 3 <= age < 5: return (23, 37)
elif 1 <= age < 3: return (15, 31)
elif age < 1: return (14, 81)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment