From ea4d629334bbb9a6ee2a98144eebc3d8b11ee6ea Mon Sep 17 00:00:00 2001 From: Cesar Martin Aguirre Aragon Date: Fri, 15 May 2026 12:49:40 -0500 Subject: [PATCH 1/3] updated init and train scritps --- modeling/__init__.py | 1 + modeling/train.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/modeling/__init__.py b/modeling/__init__.py index 901d34c..ba4dbcb 100644 --- a/modeling/__init__.py +++ b/modeling/__init__.py @@ -6,3 +6,4 @@ WORTH_SENSITIVITY_FLOOR, WORTH_WEIGHT_MULTIPLIER, ) +from .evaluate import find_optimal_threshold diff --git a/modeling/train.py b/modeling/train.py index 3cf3073..853556f 100644 --- a/modeling/train.py +++ b/modeling/train.py @@ -65,7 +65,7 @@ def train_baseline( model.compile( optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss="binary_crossentropy", - metrics=["accuracy"], + metrics=["accuracy", tf.keras.metrics.AUC(name="auc")], ) class_weights = compute_class_weights(list(train_df[label_col])) @@ -155,12 +155,14 @@ def _build_callbacks(checkpoint_dir: str) -> list: return [ tf.keras.callbacks.ModelCheckpoint( filepath=os.path.join(checkpoint_dir, "best.keras"), - monitor="val_loss", + monitor="val_auc", + mode="max", save_best_only=True, verbose=1, ), tf.keras.callbacks.EarlyStopping( - monitor="val_loss", + monitor="val_auc", + mode="max", patience=7, restore_best_weights=True, verbose=1, From 99e3c60fa93a804fd40c58aa32618aec1d8b983d Mon Sep 17 00:00:00 2001 From: Cesar Martin Aguirre Aragon Date: Fri, 15 May 2026 12:50:48 -0500 Subject: [PATCH 2/3] Updated evaluate and baseline classifier py files --- modeling/baseline_classifier.py | 2 +- modeling/evaluate.py | 63 +++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/modeling/baseline_classifier.py b/modeling/baseline_classifier.py index 8596e20..734cc47 100644 --- a/modeling/baseline_classifier.py +++ b/modeling/baseline_classifier.py @@ -130,4 +130,4 @@ def compute_class_weights(labels: list[int]) -> 
dict[int, float]: # Extra penalty for missing a WORTH case (false reassurance risk). weights[POSITIVE_CLASS_INDEX] *= WORTH_WEIGHT_MULTIPLIER - return {i: float(w) for i, w in enumerate(weights)} + return {i: float(w) for i, w in enumerate(weights)} \ No newline at end of file diff --git a/modeling/evaluate.py b/modeling/evaluate.py index 3b23a43..0a9a813 100644 --- a/modeling/evaluate.py +++ b/modeling/evaluate.py @@ -18,6 +18,7 @@ classification_report, confusion_matrix, recall_score, + roc_curve, ) from config.constants import INPUT_SIZE @@ -29,6 +30,68 @@ from modeling.train import _build_dataset +def find_optimal_threshold( + model: tf.keras.Model, + val_df: pd.DataFrame, + image_dir: str, + image_col: str = "image_path", + label_col: str = "label", + input_size: tuple = INPUT_SIZE, + batch_size: int = 32, +) -> float: + """Find the decision threshold that maximises specificity while meeting the sensitivity floor. + + Sweeps the ROC curve on the validation set and picks the operating point where + WORTH_SECOND_LOOK sensitivity >= WORTH_SENSITIVITY_FLOOR and FPR (false-alarm rate) + is minimised. Falls back to 0.5 with a warning if no threshold meets the floor. + + Args: + model: Trained Keras model (or path string to a saved .keras file). + val_df: Validation split DataFrame — must NOT be the test set. + image_dir: Root directory containing image files. + image_col: Column with image filenames. + label_col: Column with binary labels (int 0 or 1). + input_size: Must match the size used during training. + batch_size: Inference batch size. + + Returns: + Optimal threshold float in (0, 1). 
+ """ + if isinstance(model, str): + model = tf.keras.models.load_model(model) + + val_ds = _build_dataset( + val_df, image_dir, image_col, label_col, input_size, batch_size, shuffle=False + ) + true_labels = np.asarray([int(y) for y in val_df[label_col]]) + probabilities = model.predict(val_ds, verbose=0).ravel() + + # roc_curve returns fpr, tpr (=sensitivity), and the corresponding thresholds. + fpr, tpr, thresholds = roc_curve(true_labels, probabilities, pos_label=POSITIVE_CLASS_INDEX) + + # Keep only operating points that satisfy the sensitivity floor. + valid_mask = tpr >= WORTH_SENSITIVITY_FLOOR + if not valid_mask.any(): + print( + f"WARNING: No threshold achieves sensitivity >= {WORTH_SENSITIVITY_FLOOR}. " + "Falling back to 0.5. Consider retraining with stronger class weighting." + ) + return 0.5 + + # Among valid points, choose the one with the lowest FPR (fewest false alarms). + best_idx = np.argmin(fpr[valid_mask]) + optimal = float(thresholds[valid_mask][best_idx]) + + achieved_sensitivity = float(tpr[valid_mask][best_idx]) + achieved_specificity = float(1.0 - fpr[valid_mask][best_idx]) + print( + f"\nOptimal threshold: {optimal:.3f} " + f"(val sensitivity: {achieved_sensitivity:.3f}, " + f"val specificity: {achieved_specificity:.3f})" + ) + return optimal + + def evaluate_baseline( model: tf.keras.Model, test_df: pd.DataFrame, From b2539853e684766c33c3afdd007070dfbdef5e09 Mon Sep 17 00:00:00 2001 From: Cesar Martin Aguirre Aragon Date: Fri, 15 May 2026 13:19:37 -0500 Subject: [PATCH 3/3] Change description --- modeling/CHANGES.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 modeling/CHANGES.md diff --git a/modeling/CHANGES.md b/modeling/CHANGES.md new file mode 100644 index 0000000..ac098d6 --- /dev/null +++ b/modeling/CHANGES.md @@ -0,0 +1,26 @@ +# Modeling Changes + +## Branch: feature/modeling-improvements + +--- + +### 1. 
AUC-based checkpointing (`train.py`) + +**What changed:** `ModelCheckpoint` and `EarlyStopping` now monitor `val_auc` (maximise) instead of `val_loss` (minimise). Added `AUC` as a compiled metric. `ReduceLROnPlateau` stays on `val_loss` — this benefits from the smoother, more continuous signal. + +**Why:** `val_loss` (binary cross-entropy) rewards confidence, not ranking quality. A model can drive loss down by being very certain about easy negatives while still fumbling borderline WORTH cases. AUC directly measures whether the model ranks WORTH above NOT_WORTH across all thresholds — a much stronger signal when sensitivity is the priority. Checkpointing on best AUC gives us the most flexibility when choosing a decision threshold that meets the safety floor. + +--- + +### 2. Validation-set threshold optimisation (`evaluate.py`) + +**What changed:** Added `find_optimal_threshold(model, val_df, image_dir, ...)` to `evaluate.py`, exported from `__init__.py`. The 0.5 default in `evaluate_baseline` is intentionally untouched — callers are expected to run `find_optimal_threshold` on the validation set first and pass the result in explicitly. + +**Why:** The sigmoid output is a ranking score, not a calibrated probability. Defaulting to 0.5 ignores class imbalance and the asymmetric cost of missing a WORTH case. Instead, we sweep the ROC curve on the validation set to find the highest threshold at which sensitivity still meets `WORTH_SENSITIVITY_FLOOR` (0.80), which is the operating point with the highest specificity among those that satisfy the floor — catching at least the required share of WORTH cases while keeping patient callbacks to a minimum. The test set is never touched during this step. + +**Usage:** + +```python +threshold = find_optimal_threshold(model, val_df, image_dir="data/images/") +results = evaluate_baseline(model, test_df, image_dir="data/images/", threshold=threshold) +```