Skip to content
Snippets Groups Projects
Commit 45a09d45 authored by kmilicic
Browse files

testing, logging, function organisation

parent 086ec459
No related branches found
No related tags found
No related merge requests found
......@@ -3,6 +3,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -yq install python3
RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"
COPY requirements.txt /model/
RUN pip3 install -r /model/requirements.txt
COPY model.py best_model.pth /model/
CMD /model/model.py --input=/data/test.csv --output=/data/aki.csv
WORKDIR /model
RUN pip3 install -r requirements.txt
COPY model.py best_model.pth ./
CMD ./model.py --input=/data/test.csv --output=/data/aki.csv
aki.csv 0 → 100644
This diff is collapsed.
No preview for this file type
#!/usr/bin/env python3
import argparse
import logging
from pathlib import Path
import pandas as pd
......@@ -11,76 +12,35 @@ from sklearn.metrics import fbeta_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
MODEL_PATH = Path("/model/best_model.pth")
# Configure logging once at import time: records at INFO level and above,
# each prefixed with a timestamp and its level name, sent to a StreamHandler.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[logging.StreamHandler()],
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
def preprocess_features(df: pd.DataFrame) -> pd.DataFrame:
df["sex"] = (df["sex"] == "M").astype(int)
date_cols = [col for col in df.columns if "date" in col]
for col in date_cols:
df[col] = pd.to_datetime(df[col]).dt.date
date_features = pd.DataFrame(index=df.index)
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing dates"):
current_date = pd.to_datetime(row[date_cols]).dropna().dt.date.max()
recent_dates = []
older_dates = []
# Get current value
c1 = None
for col in date_cols:
if pd.notna(df.at[idx, col]):
days_diff = (current_date - df.at[idx, col]).days
value = df.at[idx, col.replace("date", "result")]
if days_diff == 0:
c1 = value
if 0 <= days_diff <= 7:
recent_dates.append(value)
elif 8 <= days_diff <= 365:
older_dates.append(value)
# Calculate ratios
rv1 = min(recent_dates) if recent_dates else None
rv2 = pd.Series(older_dates).median() if older_dates else None
date_features.at[idx, "ratio1"] = (
c1 / rv1 if (c1 is not None and rv1 is not None) else None
)
date_features.at[idx, "ratio2"] = (
c1 / rv2 if (c1 is not None and rv2 is not None) else None
)
date_features.at[idx, "has_recent"] = 1 if recent_dates else 0
df["ratio1"] = date_features["ratio1"]
df["ratio2"] = date_features["ratio2"]
df["has_recent"] = date_features["has_recent"]
selected_columns = ["age", "sex", "ratio1", "ratio2", "has_recent"]
df = df[selected_columns]
# Fill missing values with median
df = df.fillna(df.median())
# Scale features
scaler = StandardScaler()
df[["age", "ratio1", "ratio2"]] = scaler.fit_transform(
df[["age", "ratio1", "ratio2"]]
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--input", default="test.csv")
parser.add_argument("--output", default="aki.csv")
parser.add_argument("--model_path", default="best_model.pth")
parser.add_argument("--train", action="store_true", help="Train the model")
args = parser.parse_args()
return df
model = train_model(args.model_path) if args.train else load_model(args.model_path)
test_df = preprocess_features(pd.read_csv(args.input))
predictions_df = predict(model, test_df)
def preprocess_targets(df: pd.DataFrame) -> pd.Series:
return (df["aki"] == "y").astype(int)
logger.info(f"Saving model predictions to {args.output}")
predictions_df.to_csv(args.output, index=False)
class Net(nn.Module):
def __init__(self, input_size: int) -> None:
def __init__(self) -> None:
super(Net, self).__init__()
self.layer1: nn.Linear = nn.Linear(input_size, 16)
self.layer1: nn.Linear = nn.Linear(5, 16)
self.layer2: nn.Linear = nn.Linear(16, 8)
self.layer3: nn.Linear = nn.Linear(8, 1)
self.relu: nn.ReLU = nn.ReLU()
......@@ -94,7 +54,8 @@ class Net(nn.Module):
return x.squeeze()
def train_model():
def train_model(model_path: Path) -> Net:
logger.info("Training model")
# Load preprocessed data
train_df = pd.read_csv("training.csv")
test_df = pd.read_csv("test.csv")
......@@ -112,12 +73,13 @@ def train_model():
y_test_tensor = torch.FloatTensor(y_test.values)
# Initialize model, loss and optimizer
model = Net(X_train.shape[1])
model = Net()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())
# Training loop
n_epochs = 500
n_epochs = 100
# n_epochs = 500
batch_size = 32
best_f3 = 0
best_model_state = None
......@@ -167,21 +129,100 @@ def train_model():
# Load best model state
assert best_model_state is not None
model.load_state_dict(best_model_state)
torch.save(best_model_state, MODEL_PATH)
torch.save(best_model_state, model_path)
print(f"Best F3 score: {best_f3:.3f}")
return model
def predict(X_test: pd.DataFrame) -> pd.DataFrame:
X_test = preprocess_features(X_test)
X_test_tensor = torch.FloatTensor(X_test.values)
def preprocess_features(df: pd.DataFrame) -> pd.DataFrame:
validate_data(df)
df["sex"] = (df["sex"] == "M").astype(int)
date_cols = [col for col in df.columns if "date" in col]
# Load the trained model
model = Net(input_size=X_test_tensor.shape[1])
model.load_state_dict(torch.load(MODEL_PATH, weights_only=True))
for col in date_cols:
df[col] = pd.to_datetime(df[col]).dt.date
date_features = pd.DataFrame(index=df.index)
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing dates"):
current_date = pd.to_datetime(row[date_cols]).dropna().dt.date.max()
recent_dates = []
older_dates = []
# Get current value
c1 = None
for col in date_cols:
if pd.notna(df.at[idx, col]):
days_diff = (current_date - df.at[idx, col]).days
value = df.at[idx, col.replace("date", "result")]
if days_diff == 0:
c1 = value
if 0 <= days_diff <= 7:
recent_dates.append(value)
elif 8 <= days_diff <= 365:
older_dates.append(value)
# Calculate ratios
rv1 = min(recent_dates) if recent_dates else None
rv2 = pd.Series(older_dates).median() if older_dates else None
date_features.at[idx, "ratio1"] = (
c1 / rv1 if (c1 is not None and rv1 is not None) else None
)
date_features.at[idx, "ratio2"] = (
c1 / rv2 if (c1 is not None and rv2 is not None) else None
)
date_features.at[idx, "has_recent"] = 1 if recent_dates else 0
df["ratio1"] = date_features["ratio1"]
df["ratio2"] = date_features["ratio2"]
df["has_recent"] = date_features["has_recent"]
selected_columns = ["age", "sex", "ratio1", "ratio2", "has_recent"]
df = df[selected_columns]
# Fill missing values with median
df = df.fillna(df.median())
# Scale features
scaler = StandardScaler()
df[["age", "ratio1", "ratio2"]] = scaler.fit_transform(
df[["age", "ratio1", "ratio2"]]
)
return df
def validate_data(df: pd.DataFrame) -> None:
    """Sanity-check a raw input frame before feature extraction.

    Args:
        df: Frame that must contain the ``age``, ``sex`` and ``aki`` columns.

    Raises:
        ValueError: If any of the required columns is absent, or if the
            smallest value in ``age`` is negative.
    """
    expected = {"age", "sex", "aki"}
    present = set(df.columns)
    if not expected <= present:
        absent = expected - present
        raise ValueError(f"Missing required columns: {absent}")

    # Checking the minimum is enough: if it is non-negative, every age is.
    youngest = df["age"].min()
    if youngest < 0:
        raise ValueError("Age cannot be negative")
def preprocess_targets(df: pd.DataFrame) -> pd.Series:
    """Encode the ``aki`` label column as integers.

    Args:
        df: Frame containing an ``aki`` column.

    Returns:
        A Series holding 1 where ``aki`` equals ``"y"`` and 0 everywhere else.
    """
    is_positive = df["aki"].eq("y")
    return is_positive.astype(int)
def load_model(model_path: Path) -> Net:
    """Build a ``Net`` and restore its weights from a saved state dict.

    Args:
        model_path: Location of the file passed to ``torch.load``.

    Returns:
        The network with the stored weights loaded, switched to eval mode.
    """
    logger.info("Loading model")
    network = Net()
    stored_weights = torch.load(model_path, weights_only=True)
    network.load_state_dict(stored_weights)
    # Eval mode so layers such as dropout behave deterministically at inference.
    network.eval()
    return network
def predict(model: Net, X_test: pd.DataFrame) -> pd.DataFrame:
X_test_tensor = torch.FloatTensor(X_test.values)
# Get predictions
with torch.no_grad():
outputs = model(X_test_tensor)
......@@ -189,20 +230,5 @@ def predict(X_test: pd.DataFrame) -> pd.DataFrame:
return pd.DataFrame(predictions, columns=["aki"])
def main():
    """Command-line entry point: optionally train, then predict to a CSV.

    Recognised options are ``--input`` (CSV to score, default ``test.csv``),
    ``--output`` (where predictions are written, default ``aki.csv``) and the
    ``--train`` flag, which runs ``train_model()`` before predicting.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--input", default="test.csv")
    cli.add_argument("--output", default="aki.csv")
    cli.add_argument("--train", action="store_true", help="Train the model")
    options = cli.parse_args()

    if options.train:
        train_model()

    raw_rows = pd.read_csv(options.input)
    predict(raw_rows).to_csv(options.output, index=False)
if __name__ == "__main__":
main()
[project]
dependencies = [
"pandas>=2.2.3",
"pytest>=8.3.4",
"scikit-learn>=1.6.1",
"torch>=2.5.1",
"tqdm>=4.67.1",
......
File moved
from pathlib import Path
import pandas as pd
import pytest
import torch
from sklearn.metrics import fbeta_score
import model
def test_preprocess_features():
    """The feature frame keeps a ``sex`` column and stores it as integers."""
    frame = pd.DataFrame(
        {
            "age": [30, 40],
            "sex": ["M", "F"],
            "aki": ["n", "y"],
        }
    )
    features = model.preprocess_features(frame)
    assert "sex" in features.columns
    assert features["sex"].dtype == int
def test_validate_data_missing_columns():
    """A frame without the ``aki`` column is rejected with ``ValueError``."""
    without_label = pd.DataFrame(
        {
            "age": [30, 40],
            "sex": ["M", "F"],
        }
    )
    with pytest.raises(ValueError):
        model.validate_data(without_label)
def test_validate_data_negative_age():
    """A negative value in ``age`` is rejected with ``ValueError``."""
    with_bad_age = pd.DataFrame(
        {
            "age": [-1, 40],
            "sex": ["M", "F"],
            "aki": ["n", "y"],
        }
    )
    with pytest.raises(ValueError):
        model.validate_data(with_bad_age)
def test_preprocess_targets():
    """``"y"`` labels become 1 and every other label becomes 0."""
    labels = pd.DataFrame({"aki": ["y", "n", "y"]})
    encoded = model.preprocess_targets(labels)
    assert encoded.tolist() == [1, 0, 1]
def test_net_forward_pass():
    """A forward pass yields one value in [0, 1] for each input sample."""
    net = model.Net()
    # Batch of 3 samples with 5 features each.
    batch = torch.randn(3, 5)
    output = net(batch)

    # Should return 1 prediction per sample.
    assert output.shape == torch.Size([3])
    # Outputs should be between 0 and 1.
    in_unit_interval = (output >= 0) & (output <= 1)
    assert bool(in_unit_interval.all())
def test_model_performance():
    """The saved weights reach more than 0.95 accuracy and F3 on ``test.csv``.

    Skipped when ``best_model.pth`` is not present in the working directory.
    """
    model_path = Path("best_model.pth")
    if not model_path.exists():
        pytest.skip("Model weights not found, skipping performance test")

    test_df = pd.read_csv("test.csv")
    features = model.preprocess_features(test_df)
    targets = model.preprocess_targets(test_df)
    net = model.load_model(model_path)

    with torch.no_grad():
        scores = net(torch.FloatTensor(features.values))
        predictions = (scores > 0.5).float()

    expected = torch.FloatTensor(targets.values)
    accuracy = (predictions == expected).float().mean()
    f3_score = fbeta_score(targets.values.astype(int), predictions.numpy(), beta=3)

    assert accuracy > 0.95, f"Model accuracy {accuracy:.3f} below 95% threshold"
    assert f3_score > 0.95, f"Model F3 score {f3_score:.3f} below 95% threshold"
import pandas as pd
import torch
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import fbeta_score
def _ratio_or_nan(current, baseline):
    """Return ``current / baseline``, or NaN when that ratio is undefined."""
    # A zero baseline would otherwise produce an infinite ratio, which median
    # imputation cannot repair; treat it like a missing measurement instead.
    if current is None or baseline is None or baseline == 0:
        return float("nan")
    return current / baseline


def preprocess_features(df: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw patient table into the five numeric model features.

    The returned frame has the columns ``age``, ``sex``, ``ratio1``,
    ``ratio2`` and ``has_recent``:

    * ``sex`` is 1 where the input equals ``"M"`` and 0 otherwise.
    * For each row, the latest non-missing value among the columns whose name
      contains ``"date"`` is the "current" date. Each date column is paired
      with the column obtained by replacing ``"date"`` with ``"result"`` in
      its name; the result measured on the current date is the current value.
    * ``ratio1`` is the current value divided by the smallest result measured
      0-7 days before the current date (the current result is in that window).
    * ``ratio2`` is the current value divided by the median of the results
      measured 8-365 days before the current date.
    * ``has_recent`` is 1 when at least one result falls in the 0-7 day window.

    Missing values are then replaced by the column medians, and ``age``,
    ``ratio1`` and ``ratio2`` are standardised with a ``StandardScaler``.

    NOTE(review): the medians and the scaler are fitted on whatever frame is
    passed in, so separate frames are imputed and scaled independently of each
    other - confirm that this is intended.

    Args:
        df: Raw table with ``age`` and ``sex`` columns plus the date/result
            column pairs. It is not modified.

    Returns:
        A new frame holding the five feature columns, indexed like ``df``.
    """
    # Work on a copy. Encoding in place changed the caller's frame, so a
    # second call on the same frame compared integers with "M" and encoded
    # every row as 0.
    df = df.copy()
    df["sex"] = (df["sex"] == "M").astype(int)
    date_cols = [col for col in df.columns if "date" in col]

    for col in date_cols:
        df[col] = pd.to_datetime(df[col]).dt.date

    ratio1 = []
    ratio2 = []
    has_recent = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing dates"):
        current_date = pd.to_datetime(row[date_cols]).dropna().dt.date.max()

        recent_values = []
        older_values = []
        current_value = None

        for col in date_cols:
            measured_on = df.at[idx, col]
            if pd.isna(measured_on):
                continue
            days_diff = (current_date - measured_on).days
            value = df.at[idx, col.replace("date", "result")]
            if days_diff == 0:
                current_value = value
            if 0 <= days_diff <= 7:
                recent_values.append(value)
            elif 8 <= days_diff <= 365:
                older_values.append(value)

        recent_baseline = min(recent_values) if recent_values else None
        older_baseline = pd.Series(older_values).median() if older_values else None

        ratio1.append(_ratio_or_nan(current_value, recent_baseline))
        ratio2.append(_ratio_or_nan(current_value, older_baseline))
        has_recent.append(1 if recent_values else 0)

    # Explicit float columns: missing ratios are NaN rather than None, so the
    # median imputation below always works on numeric data.
    df["ratio1"] = pd.Series(ratio1, index=df.index, dtype=float)
    df["ratio2"] = pd.Series(ratio2, index=df.index, dtype=float)
    df["has_recent"] = pd.Series(has_recent, index=df.index, dtype=float)

    selected_columns = ["age", "sex", "ratio1", "ratio2", "has_recent"]
    features = df[selected_columns]

    # Fill missing values with median
    features = features.fillna(features.median())

    # Scale features
    scaled_columns = ["age", "ratio1", "ratio2"]
    scaler = StandardScaler()
    features[scaled_columns] = scaler.fit_transform(features[scaled_columns])

    return features
def preprocess_targets(df: pd.DataFrame) -> pd.Series:
    """Encode the ``aki`` label column as integers.

    Args:
        df: Frame containing an ``aki`` column.

    Returns:
        A Series holding 1 where ``aki`` equals ``"y"`` and 0 everywhere else.
    """
    is_positive = df["aki"].eq("y")
    return is_positive.astype(int)
class Net(nn.Module):
    """Small feed-forward binary classifier.

    Layout: ``Linear(input_size, 16)`` -> ReLU -> ``Dropout(0.2)`` ->
    ``Linear(16, 8)`` -> ReLU -> ``Linear(8, 1)`` -> Sigmoid.

    Args:
        input_size: Number of features in each input sample.
    """

    def __init__(self, input_size: int) -> None:
        super().__init__()
        self.layer1: nn.Linear = nn.Linear(input_size, 16)
        self.layer2: nn.Linear = nn.Linear(16, 8)
        self.layer3: nn.Linear = nn.Linear(8, 1)
        self.relu: nn.ReLU = nn.ReLU()
        self.dropout: nn.Dropout = nn.Dropout(0.2)
        self.sigmoid: nn.Sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one sigmoid output per sample.

        Args:
            x: Input whose last dimension has ``input_size`` entries.

        Returns:
            The sigmoid outputs with the trailing size-1 dimension removed,
            e.g. shape ``(batch,)`` for an input of shape
            ``(batch, input_size)``.
        """
        x = self.dropout(self.relu(self.layer1(x)))
        x = self.relu(self.layer2(x))
        x = self.sigmoid(self.layer3(x))
        # Remove only the trailing feature axis. A bare squeeze() also removed
        # the batch axis for a single-sample batch, leaving a 0-d tensor that
        # no longer lines up with a target of shape (1,).
        return x.squeeze(-1)
def train_model(
    model_path: str = "best_model.pth",
    n_epochs: int = 500,
    batch_size: int = 32,
) -> nn.Module:
    """Train a ``Net`` and save the weights of its best-scoring epoch.

    Features and labels are read from ``training.csv`` and ``test.csv`` in the
    working directory. After every epoch accuracy and F3 are printed for both
    sets, and the weights of the epoch with the highest F3 on the test set are
    kept, written to ``model_path`` and loaded back into the returned model.

    NOTE(review): the epoch is selected on the same ``test.csv`` data that the
    reported test metrics come from - confirm a separate validation split is
    not wanted.

    Args:
        model_path: Where the best state dict is saved.
        n_epochs: Number of passes over the training data.
        batch_size: Number of samples per optimisation step.

    Returns:
        The trained model carrying the best weights found.

    Raises:
        RuntimeError: If no epoch was run, so there is nothing to save.
    """
    # NOTE(review): only the first 100 rows of each file are used; the
    # full-file reads were commented out in the original, presumably to speed
    # up experiments - confirm before relying on the resulting weights.
    train_df = pd.read_csv("training.csv").head(100)
    test_df = pd.read_csv("test.csv").head(100)

    # Prepare features and target
    X_train = preprocess_features(train_df)
    y_train = preprocess_targets(train_df)
    X_test = preprocess_features(test_df)
    y_test = preprocess_targets(test_df)

    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train.values)
    y_train_tensor = torch.FloatTensor(y_train.values)
    X_test_tensor = torch.FloatTensor(X_test.values)
    y_test_tensor = torch.FloatTensor(y_test.values)

    # Initialize model, loss and optimizer
    model = Net(X_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters())

    # Start below any attainable F-score so the first epoch is always
    # recorded, even when the model never predicts a positive (F3 == 0).
    best_f3 = -1.0
    best_model_state = None

    for epoch in range(n_epochs):
        model.train()
        for start in range(0, len(X_train_tensor), batch_size):
            batch_X = X_train_tensor[start : start + batch_size]
            batch_y = y_train_tensor[start : start + batch_size]

            optimizer.zero_grad()
            outputs = model(batch_X)
            # reshape(-1) keeps the predictions 1-D even for a final batch of
            # one sample, so they always match the shape of batch_y.
            loss = criterion(outputs.reshape(-1), batch_y)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            # Training metrics
            train_outputs = model(X_train_tensor)
            train_predictions = (train_outputs > 0.5).float()
            train_accuracy = (train_predictions == y_train_tensor).float().mean()
            train_f3 = fbeta_score(
                y_train_tensor.numpy(), train_predictions.numpy(), beta=3
            )

            # Test metrics
            test_outputs = model(X_test_tensor)
            test_predictions = (test_outputs > 0.5).float()
            test_accuracy = (test_predictions == y_test_tensor).float().mean()
            test_f3 = fbeta_score(
                y_test_tensor.numpy(), test_predictions.numpy(), beta=3
            )

        print(
            f"Epoch {epoch + 1}/{n_epochs}, "
            f"Train Accuracy: {train_accuracy:.3f}, Train F3: {train_f3:.3f}, "
            f"Test Accuracy: {test_accuracy:.3f}, Test F3: {test_f3:.3f}"
        )

        # Save best model
        if test_f3 > best_f3:
            best_f3 = test_f3
            # Clone the tensors: state_dict() hands back references to the
            # live parameters, which later optimisation steps would overwrite.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    # Load best model state
    if best_model_state is None:
        raise RuntimeError("No epochs were run, so there is no model state to save")
    model.load_state_dict(best_model_state)
    torch.save(best_model_state, model_path)
    print(f"Best F3 score: {best_f3:.3f}")

    return model
if __name__ == "__main__":
train_model()
......@@ -28,6 +28,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/de/86/5486b0188d08aa643e127774a99bac51ffa6cf343e3deb0583956dca5b22/fsspec-2024.12.0-py3-none-any.whl", hash = "sha256:b520aed47ad9804237ff878b504267a3b0b441e97508bd6d2d8774e3db85cee2", size = 183862 },
]
[[package]]
name = "iniconfig"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 },
]
[[package]]
name = "jinja2"
version = "3.1.5"
......@@ -297,6 +306,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144 },
]
[[package]]
name = "packaging"
version = "24.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 },
]
[[package]]
name = "pandas"
version = "2.2.3"
......@@ -331,6 +349,30 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 },
]
[[package]]
name = "pluggy"
version = "1.5.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
]
[[package]]
name = "pytest"
version = "8.3.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "colorama", marker = "sys_platform == 'win32'" },
{ name = "iniconfig" },
{ name = "packaging" },
{ name = "pluggy" },
]
sdist = { url = "https://files.pythonhosted.org/packages/05/35/30e0d83068951d90a01852cb1cef56e5d8a09d20c7f511634cc2f7e0372a/pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761", size = 1445919 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/11/92/76a1c94d3afee238333bc0a42b82935dd8f9cf8ce9e336ff87ee14d9e1cf/pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6", size = 343083 },
]
[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
......@@ -463,6 +505,7 @@ version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "pandas" },
{ name = "pytest" },
{ name = "scikit-learn" },
{ name = "torch" },
{ name = "tqdm" },
......@@ -477,6 +520,7 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "pandas", specifier = ">=2.2.3" },
{ name = "pytest", specifier = ">=8.3.4" },
{ name = "scikit-learn", specifier = ">=1.6.1" },
{ name = "torch", specifier = ">=2.5.1" },
{ name = "tqdm", specifier = ">=4.67.1" },
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment