From ea4d629334bbb9a6ee2a98144eebc3d8b11ee6ea Mon Sep 17 00:00:00 2001
From: Cesar Martin Aguirre Aragon <rpcesarmartin@gmail.com>
Date: Fri, 15 May 2026 12:49:40 -0500
Subject: [PATCH 1/3] updated init and train scripts

---
 modeling/__init__.py | 1 +
 modeling/train.py    | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/modeling/__init__.py b/modeling/__init__.py
index 901d34c..ba4dbcb 100644
--- a/modeling/__init__.py
+++ b/modeling/__init__.py
@@ -6,3 +6,4 @@
     WORTH_SENSITIVITY_FLOOR,
     WORTH_WEIGHT_MULTIPLIER,
 )
+from .evaluate import find_optimal_threshold
diff --git a/modeling/train.py b/modeling/train.py
index 3cf3073..853556f 100644
--- a/modeling/train.py
+++ b/modeling/train.py
@@ -65,7 +65,7 @@ def train_baseline(
     model.compile(
         optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
         loss="binary_crossentropy",
-        metrics=["accuracy"],
+        metrics=["accuracy", tf.keras.metrics.AUC(name="auc")],
     )
 
     class_weights = compute_class_weights(list(train_df[label_col]))
@@ -155,12 +155,14 @@ def _build_callbacks(checkpoint_dir: str) -> list:
     return [
         tf.keras.callbacks.ModelCheckpoint(
             filepath=os.path.join(checkpoint_dir, "best.keras"),
-            monitor="val_loss",
+            monitor="val_auc",
+            mode="max",
             save_best_only=True,
             verbose=1,
         ),
         tf.keras.callbacks.EarlyStopping(
-            monitor="val_loss",
+            monitor="val_auc",
+            mode="max",
             patience=7,
             restore_best_weights=True,
             verbose=1,

From 99e3c60fa93a804fd40c58aa32618aec1d8b983d Mon Sep 17 00:00:00 2001
From: Cesar Martin Aguirre Aragon <rpcesarmartin@gmail.com>
Date: Fri, 15 May 2026 12:50:48 -0500
Subject: [PATCH 2/3] Updated evaluate and baseline classifier py files

---
 modeling/baseline_classifier.py |  2 +-
 modeling/evaluate.py            | 63 +++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/modeling/baseline_classifier.py b/modeling/baseline_classifier.py
index 8596e20..734cc47 100644
--- a/modeling/baseline_classifier.py
+++ b/modeling/baseline_classifier.py
@@ -130,4 +130,4 @@ def compute_class_weights(labels: list[int]) -> dict[int, float]:
     # Extra penalty for missing a WORTH case (false reassurance risk).
     weights[POSITIVE_CLASS_INDEX] *= WORTH_WEIGHT_MULTIPLIER
 
-    return {i: float(w) for i, w in enumerate(weights)}
+    return {i: float(w) for i, w in enumerate(weights)}
\ No newline at end of file
diff --git a/modeling/evaluate.py b/modeling/evaluate.py
index 3b23a43..0a9a813 100644
--- a/modeling/evaluate.py
+++ b/modeling/evaluate.py
@@ -18,6 +18,7 @@
     classification_report,
     confusion_matrix,
     recall_score,
+    roc_curve,
 )
 
 from config.constants import INPUT_SIZE
@@ -29,6 +30,68 @@
 from modeling.train import _build_dataset
 
 
+def find_optimal_threshold(
+    model: tf.keras.Model,
+    val_df: pd.DataFrame,
+    image_dir: str,
+    image_col: str = "image_path",
+    label_col: str = "label",
+    input_size: tuple = INPUT_SIZE,
+    batch_size: int = 32,
+) -> float:
+    """Find the decision threshold that maximises specificity while meeting the sensitivity floor.
+
+    Sweeps the ROC curve on the validation set and picks the operating point where
+    WORTH_SECOND_LOOK sensitivity >= WORTH_SENSITIVITY_FLOOR and FPR (false-alarm rate) is
+    minimised, breaking FPR ties by higher sensitivity; warns and returns 0.5 if none qualifies.
+
+    Args:
+        model: Trained Keras model (or path string to a saved .keras file).
+        val_df: Validation split DataFrame — must NOT be the test set.
+        image_dir: Root directory containing image files.
+        image_col: Column with image filenames.
+        label_col: Column with binary labels (int 0 or 1).
+        input_size: Must match the size used during training.
+        batch_size: Inference batch size.
+
+    Returns:
+        The selected ROC threshold as a float, or 0.5 when no threshold meets the floor.
+    """
+    if isinstance(model, str):
+        model = tf.keras.models.load_model(model)
+
+    val_ds = _build_dataset(
+        val_df, image_dir, image_col, label_col, input_size, batch_size, shuffle=False
+    )
+    true_labels = np.asarray([int(y) for y in val_df[label_col]])
+    probabilities = model.predict(val_ds, verbose=0).ravel()
+
+    # roc_curve returns fpr, tpr (=sensitivity), and the corresponding thresholds.
+    fpr, tpr, thresholds = roc_curve(true_labels, probabilities, pos_label=POSITIVE_CLASS_INDEX)
+
+    # Keep only operating points that satisfy the sensitivity floor.
+    valid_mask = tpr >= WORTH_SENSITIVITY_FLOOR
+    if not valid_mask.any():
+        print(
+            f"WARNING: No threshold achieves sensitivity >= {WORTH_SENSITIVITY_FLOOR}. "
+            "Falling back to 0.5. Consider retraining with stronger class weighting."
+        )
+        return 0.5
+
+    # Among valid points, take the lowest FPR (fewest false alarms). Several points can share
+    # that FPR; the secondary key (-tpr) then picks the most sensitive one at no extra cost.
+    valid_idx = np.flatnonzero(valid_mask)
+    best_idx = valid_idx[np.lexsort((-tpr[valid_idx], fpr[valid_idx]))[0]]
+    optimal = float(thresholds[best_idx])
+
+    print(
+        f"\nOptimal threshold: {optimal:.3f} "
+        f"(val sensitivity: {float(tpr[best_idx]):.3f}, "
+        f"val specificity: {float(1.0 - fpr[best_idx]):.3f})"
+    )
+    return optimal
+
+
 def evaluate_baseline(
     model: tf.keras.Model,
     test_df: pd.DataFrame,

From b2539853e684766c33c3afdd007070dfbdef5e09 Mon Sep 17 00:00:00 2001
From: Cesar Martin Aguirre Aragon <rpcesarmartin@gmail.com>
Date: Fri, 15 May 2026 13:19:37 -0500
Subject: [PATCH 3/3] Change description

---
 modeling/CHANGES.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 modeling/CHANGES.md

diff --git a/modeling/CHANGES.md b/modeling/CHANGES.md
new file mode 100644
index 0000000..ac098d6
--- /dev/null
+++ b/modeling/CHANGES.md
@@ -0,0 +1,26 @@
+# Modeling Changes
+
+## Branch: feature/modeling-improvements
+
+---
+
+### 1. AUC-based checkpointing (`train.py`)
+
+**What changed:** `ModelCheckpoint` and `EarlyStopping` now monitor `val_auc` (maximise) instead of `val_loss` (minimise). Added `AUC` as a compiled metric. `ReduceLROnPlateau` stays on `val_loss` — learning-rate scheduling benefits from its smoother, more continuous signal.
+
+**Why:** `val_loss` (binary cross-entropy) rewards confidence, not ranking quality. A model can drive loss down by being very certain about easy negatives while still fumbling borderline WORTH cases. AUC directly measures whether the model ranks WORTH above NOT_WORTH across all thresholds — a much stronger signal when sensitivity is the priority. Checkpointing on best AUC gives us the most flexibility when choosing a decision threshold that meets the safety floor.
+
+---
+
+### 2. Validation-set threshold optimisation (`evaluate.py`)
+
+**What changed:** Added `find_optimal_threshold(model, val_df, image_dir, ...)` to `evaluate.py`, exported from `__init__.py`. The 0.5 default in `evaluate_baseline` is intentionally untouched — callers are expected to run `find_optimal_threshold` on the validation set first and pass the result in explicitly.
+
+**Why:** The sigmoid output is a ranking score, not a calibrated probability. Defaulting to 0.5 ignores class imbalance and the asymmetric cost of missing a WORTH case. Instead, we sweep the ROC curve on the validation set and pick the operating point whose sensitivity meets `WORTH_SENSITIVITY_FLOOR` (0.80) with the lowest false-positive rate, i.e. the highest specificity — holding sensitivity at or above the safety floor while keeping patient callbacks to a minimum. The test set is never touched during this step.
+
+**Usage:**
+
+```python
+threshold = find_optimal_threshold(model, val_df, image_dir="data/images/")
+results = evaluate_baseline(model, test_df, image_dir="data/images/", threshold=threshold)
+```