From 054c6a37ed2dd8ae37a3ea4c359449d93f89d07e Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Tue, 7 Apr 2026 16:58:22 +0200
Subject: [PATCH 01/35] 1st iter

---
 pvc/model-test-refs-pvc.yaml                  |  12 +
 pyproject.toml                                |   2 +-
 tests/README-model-snapshots.md               |  85 +++++
 tests/__init__.py                             |   0
 tests/benchmark/benchmark_batch_size.py       | 333 ++++++++++++++++++
 tests/benchmark/load_test.py                  | 286 +++++++++++++++
 tests/model_snapshots/__init__.py             |   0
 tests/model_snapshots/_shared.py              | 144 ++++++++
 tests/model_snapshots/generate_references.py  | 103 ++++++
 .../test_binary_classifier_model_snapshot.py  |  36 ++
 ...st_semantic_segmentation_model_snapshot.py |  35 ++
 11 files changed, 1035 insertions(+), 1 deletion(-)
 create mode 100644 pvc/model-test-refs-pvc.yaml
 create mode 100644 tests/README-model-snapshots.md
 create mode 100644 tests/__init__.py
 create mode 100644 tests/benchmark/benchmark_batch_size.py
 create mode 100644 tests/benchmark/load_test.py
 create mode 100644 tests/model_snapshots/__init__.py
 create mode 100644 tests/model_snapshots/_shared.py
 create mode 100644 tests/model_snapshots/generate_references.py
 create mode 100644 tests/model_snapshots/test_binary_classifier_model_snapshot.py
 create mode 100644 tests/model_snapshots/test_semantic_segmentation_model_snapshot.py

diff --git a/pvc/model-test-refs-pvc.yaml b/pvc/model-test-refs-pvc.yaml
new file mode 100644
index 0000000..4b3390a
--- /dev/null
+++ b/pvc/model-test-refs-pvc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: model-test-refs-pvc
+  labels:
+    app: model-service-tests
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 5Gi
diff --git a/pyproject.toml b/pyproject.toml
index fbb3a9f..df49f5d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,4 +18,4 @@ dependencies = [
 ]
 
 [dependency-groups]
-dev = ["mypy>=1.18.2", "ruff>=0.14.6"]
+dev = ["mypy>=1.18.2", "ruff>=0.14.6", "pytest>=8.4.2"]
diff --git a/tests/README-model-snapshots.md b/tests/README-model-snapshots.md
new file mode 100644
index 0000000..3d2fb7b
--- /dev/null
+++ b/tests/README-model-snapshots.md
@@ -0,0 +1,85 @@
+# Model snapshot tests
+
+This repository contains end-to-end snapshot tests in `tests/model_snapshots/`.
+
+Per-model test files:
+
+- `tests/model_snapshots/test_binary_classifier_model_snapshot.py`
+- `tests/model_snapshots/test_semantic_segmentation_model_snapshot.py`
+
+Shared files:
+
+- `tests/model_snapshots/_shared.py`
+- `tests/model_snapshots/run_all_model_snapshot_tests.py`
+
+These tests are meant as post-deploy use-case checks (not only liveness checks):
+
+- they execute a real request path through Ray Serve deployment
+- they verify request processing success (timeouts/errors fail the test)
+- they verify result correctness for each deployment (`binary_classifier`, `semantic_segmentation`)
+- they touch real slide paths, helping catch mount/filesystem issues
+
+Each test calls its deployment-specific endpoint:
+
+- binary classifier: SDK call `client.models.classify_image("prostate-classifier-1", tile)`
+- semantic segmentation: SDK call `client.models.segment_image("episeg-1", tile)`
+
+Input tile is read directly from a real WSI using `ratiopath.openslide.OpenSlide`.
+
+## Adding a new model test
+
+Adding a new model to the tests is now very simple:
+
+1. Create a new file in `tests/model_snapshots/`, e.g. `test_novy_model_snapshot.py`.
+2. Import and call the appropriate case function from `_shared.py`, passing the configuration directly as parameters:
+
+```python
+from pathlib import Path
+from tests.model_snapshots._shared import run_binary_classifier_case
+
+def test_novy_model_snapshot() -> None:
+    # Define the parameters directly in the test file
+    run_binary_classifier_case(
+        model_id="my-new-endpoint-id",
+        slide_path="/mnt/bioptic_tree/.../slide.mrxs",
+        expected_score=0.987,
+        tile_size=512,
+        level=0,
+    )
+```
+
+The test then automatically becomes part of the `pytest tests/model_snapshots` suite.
+
+## Global environment variables
+
+Common (for the whole cluster and all tests):
+
+- `MODEL_SERVICE_MODELS_BASE_URL` (default: `http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000`)
+
+The expected scores/arrays and dataset paths for the existing models (`episeg-1` and `prostate-classifier-1`) are read from these variables in the existing test files if you want to keep the original CI behaviour (they can easily be hard-coded in the test file later):
+
+- `MODEL_TEST_BINARY_EXPECTED_SCORE`
+- `MODEL_TEST_SEMANTIC_EXPECTED_ARRAY_PATH`
+
+## Example (PowerShell)
+
+```powershell
+$env:MODEL_TEST_BINARY_EXPECTED_SCORE = "0.9732"
+$env:MODEL_TEST_SEMANTIC_EXPECTED_ARRAY_PATH = "/mnt/path/to/reference/semantic_expected.npy"
+
+# Models base URL is resolved directly from SDK fallback inside kubernetes:
+# http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000
+
+python tests/model_snapshots/run_all_model_snapshot_tests.py
+
+# Alternative:
+python -m pytest tests/model_snapshots -q
+```
+
+## SDK dependency
+
+Install SDK package so that `import rationai` works in tests, e.g.:
+
+```powershell
+python -m pip install git+https://github.com/RationAI/rationai-sdk-python.git
+```
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/benchmark/benchmark_batch_size.py b/tests/benchmark/benchmark_batch_size.py
new file mode 100644
index 0000000..7ca0f1f
--- /dev/null
+++ b/tests/benchmark/benchmark_batch_size.py
@@ -0,0 +1,333 @@
+# kubectl apply -n rationai-jobs-ns -f c:\Users\jiris\muni-dp\dp\model-service\ray-service.yaml
+# kubectl get pods -n rationai-jobs-ns | Select-String "episeg" (model name)
+# kubectl cp tests/benchmark/benchmark_batch_size.py rationai-jobs-ns/rayservice-model-optimized-7zwlk-head-fbzr5:/tmp/benchmark_batch_size.py
+# kubectl exec -n rationai-jobs-ns rayservice-model-optimized-7zwlk-head-fbzr5 -- bash -c "python3 -u /tmp/benchmark_batch_size.py --url http://localhost:8000/episeg-1/ --batch-size 128"
+
+# kubectl exec -n rationai-jobs-ns rayservice-model-optimized-7zwlk-head-fbzr5 -- bash -c "pip install httpx -q && python3 -u /tmp/benchmark_batch_size.py --url http://localhost:8000/episeg-1/ --batch-size 8 --concurrency-values 4,8,16,24,32,48,64 --tile-size 1024 --n 500 --warmup 100"
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import csv
+import sys
+import time
+from pathlib import Path
+
+import lz4.frame
+import numpy as np
+
+
+try:
+    import httpx
+except ImportError:
+    print("pip install httpx")
+    sys.exit(1)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+TILE_SIZE_DEFAULT = 224
+POOL_SIZE = 64
+OUTPUT_CSV = "results.csv"
+
+
+def make_pool(tile_size: int, n: int = POOL_SIZE) -> list[bytes]:
+    """Return `n` LZ4-compressed random RGB uint8 tiles (fixed seed 42)."""
+    rng = np.random.default_rng(seed=42)
+    shape = (tile_size, tile_size, 3)
+    # `high` is exclusive: 256 (not 255) is needed to cover the full uint8 range.
+    tiles = (rng.integers(0, 256, shape, dtype=np.uint8) for _ in range(n))
+    return [lz4.frame.compress(tile.tobytes()) for tile in tiles]
+
+
+async def run_batch(
+    url: str,
+    pool: list[bytes],
+    total: int,
+    concurrency: int,
+    timeout: float,
+) -> tuple[float, int, int]:
+    """Send `total` requests with `concurrency` workers; return (seconds, ok, fail)."""
+    remaining = total
+    ok = 0
+    fail = 0
+    pool_len = len(pool)
+    counter = 0
+    lock = asyncio.Lock()
+
+    limits = httpx.Limits(
+        max_connections=concurrency + 8,
+        max_keepalive_connections=concurrency + 8,
+    )
+
+    async def worker(client: httpx.AsyncClient) -> None:
+        nonlocal remaining, ok, fail, counter
+        while True:
+            async with lock:  # claim one request slot and the next pool index
+                if remaining <= 0:
+                    return
+                remaining -= 1
+                idx = counter % pool_len  # cycle through the payload pool
+                counter += 1
+            payload = pool[idx]
+            try:
+                r = await client.post(
+                    url,
+                    content=payload,
+                    headers={"Content-Type": "application/octet-stream"},
+                    timeout=timeout,
+                )
+                if r.status_code == 200:  # only HTTP 200 counts as success
+                    ok += 1
+                else:
+                    fail += 1
+                    print(f"  [WARN] HTTP {r.status_code}: {r.text[:120]}")
+            except Exception as e:  # any error raised by the request is a failure
+                fail += 1
+                print(f"  [ERR] {type(e).__name__}: {e!r}")
+
+    t0 = time.perf_counter()  # timing spans client creation through teardown
+    async with httpx.AsyncClient(limits=limits, follow_redirects=True) as client:
+        await asyncio.gather(*[worker(client) for _ in range(concurrency)])
+    return time.perf_counter() - t0, ok, fail
+
+
+def append_csv(path: str, row: dict) -> None:  # append one result row to the CSV
+    fieldnames = [
+        "url",
+        "batch_size",
+        "concurrency",
+        "n",
+        "elapsed_s",
+        "throughput_img_s",
+        "ok",
+        "fail",
+    ]
+    write_header = not Path(path).exists()  # header only when the file is new
+    with open(path, "a", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        if write_header:
+            writer.writeheader()
+        writer.writerow(row)
+
+
+def load_csv(path: str, url: str) -> list[dict]:
+    """Return rows of the CSV at `path` whose "url" equals `url` ([] if no file)."""
+    if not Path(path).exists():
+        return []
+    with open(path, newline="") as f:  # newline="" as the csv module expects
+        return [row for row in csv.DictReader(f) if row["url"] == url]
+
+
+def concurrency_sweep_values(batch_size: int) -> list[int]:
+    """Return the sorted, de-duplicated concurrency values to sweep.
+
+    For MIG-2g.20gb: covers batch_size/2 up to batch_size*4, with finer
+    steps around batch_size, where the throughput knee tends to be.
+    """
+
+    quarter = batch_size // 4
+    half = batch_size // 2
+    candidates = {
+        max(1, half),
+        batch_size,
+        batch_size + half,
+        batch_size * 2,
+        batch_size * 3,
+        batch_size * 4,
+    }
+    # Intermediate steps around batch_size; non-positive ones are dropped.
+    extras = (batch_size - quarter, batch_size + quarter)
+    candidates.update(step for step in extras if step > 0)
+    return sorted(candidates)
+
+
+def print_summary(rows: list[dict], batch_size: int | None = None) -> None:  # table + advice
+    if not rows:
+        return
+    if batch_size is not None:  # optionally restrict the table to one batch size
+        rows = [r for r in rows if int(r["batch_size"]) == batch_size]
+    if not rows:
+        return
+
+    best = max(rows, key=lambda r: float(r["throughput_img_s"]))  # highest throughput
+
+    header = f"{'batch_size':>12} {'concurrency':>12} {'throughput img/s':>18} {'ok':>8} {'fail':>8}"
+    print(header)
+    print("-" * len(header))
+    for row in sorted(
+        rows, key=lambda r: (int(r["batch_size"]), int(r["concurrency"]))
+    ):
+        marker = " ← BEST" if row is best else ""  # identity check: flags exactly one row
+        fail_val = int(row["fail"])
+        fail_str = f"[!]{fail_val}" if fail_val > 0 else str(fail_val)
+        print(
+            f"{row['batch_size']:>12} {row['concurrency']:>12}"
+            f" {row['throughput_img_s']:>18} {row['ok']:>8} {fail_str:>8}{marker}"
+        )
+    print()
+    print("Doporučené YAML hodnoty pro batch_size =", best["batch_size"])
+    tor = int(best["concurrency"])  # printed as target_ongoing_requests
+    mor = int(tor * 1.25) + 8  # printed as max_ongoing_requests
+    print(f"  max_batch_size:           {best['batch_size']}")
+    print(f"  target_ongoing_requests:  {tor}   # = nejlepší concurrency")
+    print(f"  max_ongoing_requests:     {mor}   # target * 1.25 + buffer")
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+async def main() -> None:  # CLI entry: parse args, warm up, sweep, append CSV, report
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        "--url",
+        default="http://localhost:8000/virchow2/",
+        help="Endpointová URL (default: http://localhost:8000/virchow2/)",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        required=True,
+        help="max_batch_size nastavený v user_config (shodný s YAML)",
+    )
+    parser.add_argument(
+        "--concurrency",
+        type=int,
+        default=None,
+        help="Pevná hodnota concurrency – přeskočí sweep a naměří jen tuto",
+    )
+    parser.add_argument(
+        "--concurrency-values",
+        type=str,
+        default=None,
+        help="Čárkami oddělený seznam concurrency hodnot k otestování, "
+        "např. '32,64,128,256'  (přepíše výchozí sweep)",
+    )
+    parser.add_argument(
+        "--n",
+        type=int,
+        default=1000,
+        help="Počet měřených requestů na jeden bod (default: 1000)",
+    )
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        default=100,
+        help="Warmup requesty před měřením (default: 100)",
+    )
+    parser.add_argument("--tile-size", type=int, default=TILE_SIZE_DEFAULT)
+    parser.add_argument("--timeout", type=float, default=300.0)
+    parser.add_argument(
+        "--output",
+        default=OUTPUT_CSV,
+        help=f"Výstupní CSV soubor (default: {OUTPUT_CSV})",
+    )
+    parser.add_argument(
+        "--skip-existing",
+        action="store_true",
+        help="Přeskočí (batch_size, concurrency) kombinace už změřené v CSV",
+    )
+    args = parser.parse_args()
+
+    url = args.url.rstrip("/") + "/"  # normalise to exactly one trailing slash
+    pool = make_pool(args.tile_size)
+
+    # Determine sweep values
+    if args.concurrency is not None:
+        sweep = [args.concurrency]
+    elif args.concurrency_values:
+        sweep = [int(v.strip()) for v in args.concurrency_values.split(",")]
+    else:
+        sweep = concurrency_sweep_values(args.batch_size)
+
+    # Already measured (for --skip-existing)
+    existing: set[int] = set()
+    if args.skip_existing:
+        for row in load_csv(args.output, url):
+            if int(row["batch_size"]) == args.batch_size:
+                existing.add(int(row["concurrency"]))
+
+    print("=" * 60)
+    print("Virchow2 Benchmark Sweep")
+    print("=" * 60)
+    print(f"URL:              {url}")
+    print(f"max_batch_size:   {args.batch_size}  (musí odpovídat YAML!)")
+    print(f"concurrency sweep:{sweep}")
+    print(f"n per point:      {args.n}")
+    print(f"warmup:           {args.warmup}")
+    print(f"output:           {args.output}")
+    print()
+
+    # Warm up once, using the middle concurrency value of the sweep
+    warmup_conc = sweep[len(sweep) // 2]
+    print(f"Warmup ({args.warmup} img, concurrency={warmup_conc})...")
+    await run_batch(url, pool, args.warmup, warmup_conc, args.timeout)
+    print("Warmup done.\n")
+
+    results_this_run: list[dict] = []
+
+    for conc in sweep:
+        if conc in existing:
+            print(f"[SKIP] concurrency={conc} (already in CSV)")
+            continue
+
+        print(f"▶ batch_size={args.batch_size}  concurrency={conc}  ({args.n} img)...")
+        elapsed, ok, fail = await run_batch(url, pool, args.n, conc, args.timeout)
+        rps = ok / elapsed if elapsed > 0 else 0.0  # throughput counts successes only
+
+        row = {
+            "url": url,
+            "batch_size": args.batch_size,
+            "concurrency": conc,
+            "n": ok + fail,
+            "elapsed_s": f"{elapsed:.2f}",
+            "throughput_img_s": f"{rps:.1f}",
+            "ok": ok,
+            "fail": fail,
+        }
+        append_csv(args.output, row)
+        results_this_run.append(row)
+
+        status = f"  → {rps:.1f} img/s"
+        if fail:
+            status += f"  [{fail} failures!]"
+        print(status)
+
+        # Short pause between points so the server can settle
+        await asyncio.sleep(2)
+
+    # Summary – current batch_size only
+    print()
+    print("=" * 60)
+    print(f"Výsledky pro batch_size = {args.batch_size}")
+    print("=" * 60)
+    all_rows = load_csv(args.output, url)  # re-read: includes rows from earlier runs
+    print_summary(all_rows, batch_size=args.batch_size)
+
+    # If the CSV holds several batch sizes, also show an overall comparison
+    all_batch_sizes = sorted(set(int(r["batch_size"]) for r in all_rows))
+    if len(all_batch_sizes) > 1:
+        print()
+        print("=" * 60)
+        print("Celkové porovnání všech batch_size (best concurrency per batch)")
+        print("=" * 60)
+        # For each batch_size keep only the best-throughput concurrency
+        best_per_batch = []
+        for bs in all_batch_sizes:
+            candidates = [r for r in all_rows if int(r["batch_size"]) == bs]
+            if candidates:
+                best_per_batch.append(
+                    max(candidates, key=lambda r: float(r["throughput_img_s"]))
+                )
+        print_summary(best_per_batch)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tests/benchmark/load_test.py b/tests/benchmark/load_test.py
new file mode 100644
index 0000000..ba5b8ef
--- /dev/null
+++ b/tests/benchmark/load_test.py
@@ -0,0 +1,286 @@
+# kubectl cp tests/benchmark/load_test.py rationai-jobs-ns/rayservice-model-virchow2-5qfmz-head-98tbv:/tmp/load_test.py
+# kubectl exec -n rationai-jobs-ns rayservice-model-virchow2-5qfmz-head-98tbv -- bash -c "python3 -u /tmp/load_test.py --url http://localhost:8000/virchow2/ --tiles 5000 --concurrency 128"
+from __future__ import annotations
+
+import argparse
+import asyncio
+import sys
+import time
+from dataclasses import dataclass, field
+
+import lz4.frame
+import numpy as np
+
+
+try:
+    import httpx
+except ImportError:
+    print("pip install httpx")
+    sys.exit(1)
+
+
+TILE_SIZE_DEFAULT = 224
+POOL_SIZE = 64
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def make_pool(tile_size: int, n: int = POOL_SIZE) -> list[bytes]:
+    """Return `n` LZ4-compressed random RGB uint8 tiles (fixed seed 42)."""
+    rng = np.random.default_rng(seed=42)
+    shape = (tile_size, tile_size, 3)
+    # `high` is exclusive: 256 (not 255) is needed to cover the full uint8 range.
+    tiles = (rng.integers(0, 256, shape, dtype=np.uint8) for _ in range(n))
+    return [lz4.frame.compress(tile.tobytes()) for tile in tiles]
+
+
+@dataclass
+class Stats:  # counters and latencies accumulated across the requests of a run
+    ok: int = 0  # responses with HTTP 200
+    fail_503: int = 0  # responses with HTTP 503
+    fail_other: int = 0  # other HTTP statuses and raised exceptions
+    latencies: list[float] = field(default_factory=list)  # seconds, HTTP 200 only
+    lock: asyncio.Lock = field(default_factory=asyncio.Lock)  # held while updating
+
+    @property
+    def total(self) -> int:  # number of finished requests, whatever the outcome
+        return self.ok + self.fail_503 + self.fail_other
+
+    def percentile(self, p: float) -> float:  # p is a percentage (callers pass 50/90/99)
+        if not self.latencies:
+            return 0.0
+        s = sorted(self.latencies)
+        idx = int(len(s) * p / 100)
+        return s[min(idx, len(s) - 1)]  # clamp so p=100 yields the maximum
+
+
+async def send_tile(  # POST one payload and record its outcome in `stats`
+    client: httpx.AsyncClient,
+    url: str,
+    payload: bytes,
+    stats: Stats,
+    timeout: float,
+    progress_every: int,
+) -> None:
+    t0 = time.perf_counter()
+    try:
+        r = await client.post(
+            url,
+            content=payload,
+            headers={"Content-Type": "application/octet-stream"},
+            timeout=timeout,
+        )
+        latency = time.perf_counter() - t0
+        async with stats.lock:
+            if r.status_code == 200:
+                stats.ok += 1
+                stats.latencies.append(latency)  # latency is kept for successes only
+                if stats.ok % progress_every == 0:  # periodic progress line
+                    print(
+                        f"  ✓ {stats.ok} OK  |  503: {stats.fail_503}  |  other: {stats.fail_other}"
+                    )
+            elif r.status_code == 503:  # counted separately, not printed per request
+                stats.fail_503 += 1
+            else:
+                stats.fail_other += 1
+                print(f"  [WARN] HTTP {r.status_code}: {r.text[:120]}")
+    except Exception as e:  # any error raised by the request counts as "other"
+        async with stats.lock:
+            stats.fail_other += 1
+        print(f"  [ERR] {e}")
+
+
+async def run_wsi(
+    url: str,
+    pool: list[bytes],
+    tiles: int,
+    concurrency: int,
+    timeout: float,
+    wsi_id: int,  # NOTE(review): accepted but never used in this function
+    stats: Stats,
+) -> float:
+    """Simulate one WSI: send `tiles` requests, at most `concurrency` at a time."""
+    semaphore = asyncio.Semaphore(concurrency)
+    pool_len = len(pool)
+
+    limits = httpx.Limits(
+        max_connections=concurrency + 8,
+        max_keepalive_connections=concurrency + 8,
+    )
+
+    async def bounded_send(client: httpx.AsyncClient, idx: int) -> None:
+        async with semaphore:  # caps the number of in-flight requests
+            await send_tile(
+                client,
+                url,
+                pool[idx % pool_len],  # cycle through the payload pool
+                stats,
+                timeout,
+                progress_every=max(tiles // 10, 100),
+            )
+
+    t0 = time.perf_counter()
+    async with httpx.AsyncClient(limits=limits, follow_redirects=True) as client:
+        tasks = [bounded_send(client, i) for i in range(tiles)]
+        await asyncio.gather(*tasks)
+    return time.perf_counter() - t0  # elapsed wall-clock seconds
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+async def main() -> None:  # CLI entry: parse args, warm up, run the load test, report
+    parser = argparse.ArgumentParser(
+        description=__doc__,  # NOTE(review): no module docstring is visible, so None
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--url", default="http://localhost:8000/virchow2/", help="Endpoint URL"
+    )
+    parser.add_argument(
+        "--tiles",
+        type=int,
+        default=5000,
+        help="Počet dlaždic na jeden WSI (default: 5000)",
+    )
+    parser.add_argument(
+        "--wsi-count",
+        type=int,
+        default=1,
+        help="Počet paralelních WSI slidů (default: 1)",
+    )
+    parser.add_argument(
+        "--concurrency",
+        type=int,
+        default=128,
+        help="Max souběžných requestů na WSI (default: 128, "
+        "mělo by odpovídat target_ongoing_requests)",
+    )
+    parser.add_argument("--tile-size", type=int, default=TILE_SIZE_DEFAULT)
+    parser.add_argument(
+        "--timeout",
+        type=float,
+        default=120.0,
+        help="Timeout na jeden request v sekundách (default: 120)",
+    )
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        default=50,
+        help="Warmup requestů před testem (default: 50)",
+    )
+    parser.add_argument("--no-warmup", action="store_true", help="Přeskočit warmup")
+    args = parser.parse_args()
+
+    url = args.url.rstrip("/") + "/"  # normalise to exactly one trailing slash
+    pool = make_pool(args.tile_size)
+    total_tiles = args.tiles * args.wsi_count
+
+    print("=" * 60)
+    print("Virchow2 WSI Load Test")
+    print("=" * 60)
+    print(f"URL:              {url}")
+    print(f"Tiles per WSI:    {args.tiles:,}")
+    print(f"WSI count:        {args.wsi_count}")
+    print(f"Total tiles:      {total_tiles:,}")
+    print(f"Concurrency/WSI:  {args.concurrency}")
+    print(f"Total concurrent: {args.concurrency * args.wsi_count}")
+    print(f"Request timeout:  {args.timeout}s")
+    print()
+
+    # Warmup
+    if not args.no_warmup:
+        print(f"Warmup ({args.warmup} tiles)...")
+        warmup_stats = Stats()  # separate Stats so warmup is excluded from the report
+        await run_wsi(
+            url,
+            pool,
+            args.warmup,
+            min(args.concurrency, 32),
+            args.timeout,
+            wsi_id=0,
+            stats=warmup_stats,
+        )
+        print(
+            f"Warmup done (ok={warmup_stats.ok}, fail={warmup_stats.fail_503 + warmup_stats.fail_other}).\n"
+        )
+
+    # Actual test
+    stats = Stats()
+    print(
+        f"▶ Spouštím {'paralelně ' + str(args.wsi_count) + ' WSI' if args.wsi_count > 1 else '1 WSI'}  "
+        f"({total_tiles:,} tiles celkem)...\n"
+    )
+
+    t0 = time.perf_counter()
+
+    if args.wsi_count == 1:
+        await run_wsi(
+            url, pool, args.tiles, args.concurrency, args.timeout, wsi_id=0, stats=stats
+        )
+    else:
+        # Run all WSIs in parallel — simulates several scanners at once
+        await asyncio.gather(
+            *[
+                run_wsi(
+                    url,
+                    pool,
+                    args.tiles,
+                    args.concurrency,
+                    args.timeout,
+                    wsi_id=i,
+                    stats=stats,
+                )
+                for i in range(args.wsi_count)
+            ]
+        )
+
+    elapsed = time.perf_counter() - t0
+    rps = stats.ok / elapsed if elapsed > 0 else 0.0  # throughput counts successes only
+
+    # Report
+    print()
+    print("=" * 60)
+    print("Výsledky")
+    print("=" * 60)
+    print(f"Celkový čas:      {elapsed:.1f}s  ({elapsed / 60:.1f} min)")
+    print(f"Throughput:       {rps:.1f} img/s")
+    print()
+    print(
+        f"Úspěšné:          {stats.ok:,} / {total_tiles:,}  ({100 * stats.ok / total_tiles:.1f}%)"
+    )
+    print(
+        f"503 backpressure: {stats.fail_503:,}  ({100 * stats.fail_503 / total_tiles:.1f}%)"
+    )
+    print(f"Jiné chyby:       {stats.fail_other:,}")
+    print()
+    if stats.latencies:
+        print("Latence (úspěšné requesty):")
+        print(f"  p50:  {stats.percentile(50) * 1000:.0f} ms")
+        print(f"  p90:  {stats.percentile(90) * 1000:.0f} ms")
+        print(f"  p99:  {stats.percentile(99) * 1000:.0f} ms")
+        print(f"  max:  {max(stats.latencies) * 1000:.0f} ms")
+    print()
+
+    # Verdict
+    fail_rate = (stats.fail_503 + stats.fail_other) / total_tiles  # needs total_tiles > 0
+    if fail_rate == 0:
+        print("✅ PASS — žádné chyby, nastavení je v pořádku pro WSI.")
+    elif fail_rate < 0.01:
+        print(
+            f"⚠️  WARN — {fail_rate * 100:.2f}% chyb. Zvažte zvýšení max_queued_requests."
+        )
+    else:
+        print(
+            f"❌ FAIL — {fail_rate * 100:.1f}% chyb. Nastavení nestačí pro tento objem."
+        )
+        print("   → Zvyšte max_queued_requests nebo snižte --concurrency klientů.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/tests/model_snapshots/__init__.py b/tests/model_snapshots/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
new file mode 100644
index 0000000..e0bc161
--- /dev/null
+++ b/tests/model_snapshots/_shared.py
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+import hashlib
+import importlib
+import os
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+
+def _required_env(var_name: str) -> str:
+    """Return the env var's value, skipping the current test if unset or empty."""
+    if env_value := os.environ.get(var_name):
+        return env_value
+    pytest.skip(f"Missing env var `{var_name}`.")
+
+
+def _sha256(path: Path) -> str:  # hex SHA-256 digest of the file at `path`
+    digest = hashlib.sha256()
+    with path.open("rb") as file_handle:
+        for chunk in iter(lambda: file_handle.read(1024 * 1024), b""):  # 1 MiB chunks
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _read_tile_from_slide(  # read one RGB tile from the slide as a uint8 array
+    slide_path: str,
+    tile_size: int,
+    level: int,
+) -> np.ndarray:
+    try:
+        from ratiopath.openslide import OpenSlide  # optional dependency, imported lazily
+    except ImportError:
+        pytest.skip("Python package `ratiopath` is not installed.")
+
+    with OpenSlide(slide_path) as slide:
+        extent_x, extent_y = slide.level_dimensions[level]
+        x = max(0, (extent_x - tile_size) // 2)  # centred in the level extent, min 0
+        y = max(0, (extent_y - tile_size) // 2)
+        tile = slide.read_region_relative(
+            (x, y), level, (tile_size, tile_size)
+        ).convert("RGB")
+
+    return np.asarray(tile, dtype=np.uint8)
+
+
+def _client(  # build a `rationai.Client`; skips the test when the SDK is missing
+    models_base_url: str,
+    timeout_s: float,
+):
+    try:
+        rationai = importlib.import_module("rationai")  # imported lazily at call time
+    except ImportError:
+        pytest.skip("Python package `rationai` is not installed.")
+
+    return rationai.Client(models_base_url=models_base_url, timeout=timeout_s)
+
+
+def run_binary_classifier_case(  # classify one slide tile and compare the score
+    model_id: str,
+    slide_path: str,
+    expected_score: float,
+    tile_size: int = 512,
+    level: int = 0,
+    timeout_s: float = 600.0,
+    tolerance: float = 0.00001,  # maximum allowed absolute difference
+) -> None:
+    models_base_url = os.environ.get(
+        "MODEL_SERVICE_MODELS_BASE_URL",
+        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+    )
+
+    tile = _read_tile_from_slide(
+        slide_path=slide_path, tile_size=tile_size, level=level
+    )
+
+    with _client(models_base_url=models_base_url, timeout_s=timeout_s) as client:
+        prediction = client.models.classify_image(
+            model=model_id, image=tile, timeout=timeout_s
+        )
+
+    # NOTE(review): np.float32 is not a Python float — confirm the SDK's return type
+    if not isinstance(prediction, int | float):
+        pytest.fail(
+            "Expected binary classifier to return scalar score, "
+            f"got {type(prediction)}: {prediction}"
+        )
+    actual_score = float(prediction)
+    assert abs(actual_score - expected_score) <= tolerance, (
+        f"Binary score mismatch: expected={expected_score}, actual={actual_score}, "
+        f"tolerance={tolerance}"
+    )
+
+
+def run_semantic_segmentation_case(  # segment one slide tile and compare to a .npy
+    model_id: str,
+    slide_path: str,
+    expected_array_path: Path | str,
+    tile_size: int = 1024,
+    level: int = 0,
+    timeout_s: float = 600.0,
+    atol: float = 0.0,  # with both tolerances at 0.0 the arrays must match exactly
+    rtol: float = 0.0,
+) -> None:
+    models_base_url = os.environ.get(
+        "MODEL_SERVICE_MODELS_BASE_URL",
+        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+    )
+    expected_array_path = Path(expected_array_path)
+
+    if not expected_array_path.exists():  # a missing reference fails, it does not skip
+        pytest.fail(f"Expected array file does not exist: {expected_array_path}")
+
+    tile = _read_tile_from_slide(
+        slide_path=slide_path, tile_size=tile_size, level=level
+    )
+    expected = np.load(expected_array_path)
+
+    with _client(models_base_url=models_base_url, timeout_s=timeout_s) as client:
+        prediction = client.models.segment_image(
+            model=model_id, image=tile, timeout=timeout_s
+        )
+
+    actual = np.asarray(prediction)
+
+    if actual.shape != expected.shape:  # checked first so allclose cannot broadcast
+        pytest.fail(
+            f"Semantic shape mismatch: expected={expected.shape}, actual={actual.shape}"
+        )
+
+    if not np.allclose(actual, expected, rtol=rtol, atol=atol):
+        mismatch = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
+        pytest.fail(
+            "Semantic output mismatch: arrays differ beyond tolerance "
+            f"(atol={atol}, rtol={rtol}, max_abs_diff={mismatch})"
+        )
+
+
+def verify_file_hash(path: Path, expected_hash: str) -> None:
+    """Assert that the SHA-256 of the file at `path` equals `expected_hash`."""
+    digest = _sha256(path)
+    message = f"Hash mismatch for {path}: expected={expected_hash}, actual={digest}"
+    assert digest == expected_hash, message
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
new file mode 100644
index 0000000..f5ea20d
--- /dev/null
+++ b/tests/model_snapshots/generate_references.py
@@ -0,0 +1,103 @@
+import json
+import os
+from pathlib import Path
+
+import numpy as np
+
+from tests.model_snapshots._shared import _client, _read_tile_from_slide
+
+
+OUT_DIR = Path("/mnt/test_refs")
+
+CASES = [
+    {
+        "label": "breast",
+        "slide_path": "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
+        "model_id": "episeg-1",
+        "type": "semantic",
+        "tile_size": 1024,
+        "level": 0,
+    },
+    {
+        "label": "breast",
+        "slide_path": "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
+        "model_id": "prostate-classifier-1",
+        "type": "binary",
+        "tile_size": 512,
+        "level": 0,
+    },
+    {
+        "label": "colorectum",
+        "slide_path": "/mnt/data/MOU/colorectum/colorectal_cancer_2020-2024-06/2020_00106-01-N.mrxs",
+        "model_id": "episeg-1",
+        "type": "semantic",
+        "tile_size": 1024,
+        "level": 0,
+    },
+    {
+        "label": "colon",
+        "slide_path": "/mnt/data/MOU/colon/comparison_of_scanners/FLASH2021_5638-02-T.mrxs",
+        "model_id": "episeg-1",
+        "type": "semantic",
+        "tile_size": 1024,
+        "level": 0,
+    },
+]
+
+
+def generate_references() -> None:
+    models_base_url = os.environ.get(
+        "MODEL_SERVICE_MODELS_BASE_URL",
+        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+    )
+
+    OUT_DIR.mkdir(parents=True, exist_ok=True)
+    print(f"== Generating references to {OUT_DIR} via {models_base_url} ==")
+
+    with _client(models_base_url=models_base_url, timeout_s=600) as client:
+        for case in CASES:
+            label = case["label"]
+            model_id = case["model_id"]
+            mtype = case["type"]
+            slide_path = case["slide_path"]
+            tile_size = case["tile_size"]
+            level = case["level"]
+
+            print(f"\nProcessing [{label}] => Model: {model_id} ({mtype})")
+            print(f"Slide path: {slide_path}")
+
+            try:
+                tile = _read_tile_from_slide(
+                    slide_path=slide_path, tile_size=tile_size, level=level
+                )
+            except Exception as e:
+                print(f"  -> Failed to read tile: {e}")
+                continue
+
+            try:
+                if mtype == "binary":
+                    prediction = client.models.classify_image(
+                        model=model_id, image=tile, timeout=600
+                    )
+                    out_file = OUT_DIR / f"{label}_{model_id}_expected.json"
+                    with out_file.open("w") as f:
+                        json.dump({"expected_score": float(prediction)}, f, indent=2)
+                    print(f"  -> SUCCESS! Saved binary score to {out_file}")
+
+                elif mtype == "semantic":
+                    prediction = client.models.segment_image(
+                        model=model_id, image=tile, timeout=600
+                    )
+                    arr = np.asarray(prediction)
+                    out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
+                    np.save(out_file, arr)
+                    print(
+                        f"  -> SUCCESS! Saved semantic array {arr.shape} to {out_file}"
+                    )
+
+            except Exception as e:
+                print(f"  -> ERROR during prediction/saving: {e}")
+
+
+if __name__ == "__main__":
+    generate_references()
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
new file mode 100644
index 0000000..cd533f8
--- /dev/null
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -0,0 +1,36 @@
+import json
+from pathlib import Path
+
+import pytest
+
+from tests.model_snapshots._shared import run_binary_classifier_case
+
+
+@pytest.mark.parametrize(
+    "label, slide_path",
+    [
+        (
+            "breast",
+            "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
+        ),
+    ],
+)
+def test_binary_classifier(label: str, slide_path: str) -> None:
+    model_id = "prostate-classifier-1"
+    json_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.json")
+
+    if json_path.exists():
+        with json_path.open() as f:
+            expected_score = json.load(f)["expected_score"]
+    else:
+        pytest.skip(
+            f"Reference file {json_path} missing. Run generate_references.py first."
+        )
+
+    run_binary_classifier_case(
+        model_id=model_id,
+        slide_path=slide_path,
+        expected_score=expected_score,
+        tile_size=512,
+        level=0,
+    )
diff --git a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
new file mode 100644
index 0000000..a6301a8
--- /dev/null
+++ b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
@@ -0,0 +1,35 @@
+from pathlib import Path
+
+import pytest
+
+from tests.model_snapshots._shared import run_semantic_segmentation_case
+
+
+@pytest.mark.parametrize(
+    "label, slide_path",
+    [
+        (
+            "breast",
+            "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
+        ),
+        (
+            "colorectum",
+            "/mnt/data/MOU/colorectum/colorectal_cancer_2020-2024-06/2020_00106-01-N.mrxs",
+        ),
+        (
+            "colon",
+            "/mnt/data/MOU/colon/comparison_of_scanners/FLASH2021_5638-02-T.mrxs",
+        ),
+    ],
+)
+def test_semantic_episeg(label: str, slide_path: str) -> None:
+    model_id = "episeg-1"
+    expected_array_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.npy")
+
+    run_semantic_segmentation_case(
+        model_id=model_id,
+        slide_path=slide_path,
+        expected_array_path=expected_array_path,
+        tile_size=1024,
+        level=0,
+    )

From 5f80c05c1089d1cbfa28539efab96aab43e02a30 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 26 Apr 2026 14:50:30 +0200
Subject: [PATCH 02/35] test runner

---
 builders/test_runner.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 builders/test_runner.py

diff --git a/builders/test_runner.py b/builders/test_runner.py
new file mode 100644
index 0000000..e8a8cef
--- /dev/null
+++ b/builders/test_runner.py
@@ -0,0 +1,36 @@
+import subprocess
+import sys
+
+from fastapi import FastAPI
+from ray import serve
+
+
+fastapi = FastAPI()
+
+
+@serve.deployment(num_replicas=1)
+@serve.ingress(fastapi)
+class TestRunner:
+    @fastapi.post("/")
+    def run(self) -> dict:
+        result = subprocess.run(
+            [
+                sys.executable,
+                "-m",
+                "pytest",
+                "tests/model_snapshots/",
+                "-v",
+                "--tb=short",
+            ],
+            capture_output=True,
+            text=True,
+        )
+        return {
+            "returncode": result.returncode,
+            "stdout": result.stdout,
+            "stderr": result.stderr,
+            "passed": result.returncode == 0,
+        }
+
+
+app = TestRunner.bind()

From 4aed7548abb573a5115fb3439ce05fd5fb61f4d7 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 26 Apr 2026 14:54:43 +0200
Subject: [PATCH 03/35] fix

---
 builders/test_runner.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/builders/test_runner.py b/builders/test_runner.py
index e8a8cef..3a0a578 100644
--- a/builders/test_runner.py
+++ b/builders/test_runner.py
@@ -11,6 +11,12 @@
 @serve.deployment(num_replicas=1)
 @serve.ingress(fastapi)
 class TestRunner:
+    def __init__(self) -> None:
+        subprocess.run(
+            [sys.executable, "-m", "pip", "install", "pytest", "-q"],
+            check=True,
+        )
+
     @fastapi.post("/")
     def run(self) -> dict:
         result = subprocess.run(

From 85db95c693f400bf76cbe2d3ef8a84a2c999cfc4 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 26 Apr 2026 15:10:07 +0200
Subject: [PATCH 04/35] tests

---
 tests/model_snapshots/_shared.py              |  95 ++++++--------
 tests/model_snapshots/generate_references.py  | 119 +++++++-----------
 .../test_binary_classifier_model_snapshot.py  |   7 +-
 ...st_semantic_segmentation_model_snapshot.py |  11 +-
 4 files changed, 87 insertions(+), 145 deletions(-)

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index e0bc161..5edd6c6 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -1,34 +1,36 @@
 from __future__ import annotations
 
 import hashlib
-import importlib
 import os
 from pathlib import Path
 
+import httpx
+import lz4.frame
 import numpy as np
 import pytest
-
-
-def _required_env(var_name: str) -> str:
-    value = os.environ.get(var_name)
-    if not value:
-        pytest.skip(f"Missing env var `{var_name}`.")
-    return value
+from numpy.typing import NDArray
 
 
 def _sha256(path: Path) -> str:
     digest = hashlib.sha256()
-    with path.open("rb") as file_handle:
-        for chunk in iter(lambda: file_handle.read(1024 * 1024), b""):
+    with path.open("rb") as fh:
+        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
             digest.update(chunk)
     return digest.hexdigest()
 
 
+def _models_base_url() -> str:
+    return os.environ.get(
+        "MODEL_SERVICE_MODELS_BASE_URL",
+        "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+    )
+
+
 def _read_tile_from_slide(
     slide_path: str,
     tile_size: int,
     level: int,
-) -> np.ndarray:
+) -> NDArray[np.uint8]:
     try:
         from ratiopath.openslide import OpenSlide
     except ImportError:
@@ -45,16 +47,31 @@ def _read_tile_from_slide(
     return np.asarray(tile, dtype=np.uint8)
 
 
-def _client(
-    models_base_url: str,
+def _classify(
+    model_id: str,
+    tile: NDArray[np.uint8],
     timeout_s: float,
-):
-    try:
-        rationai = importlib.import_module("rationai")
-    except ImportError:
-        pytest.skip("Python package `rationai` is not installed.")
+) -> float:
+    url = f"{_models_base_url()}/{model_id}/"
+    data = lz4.frame.compress(tile.tobytes())
+    response = httpx.post(url, content=data, timeout=httpx.Timeout(timeout_s))
+    response.raise_for_status()
+    return float(response.json())
 
-    return rationai.Client(models_base_url=models_base_url, timeout=timeout_s)
+
+def _segment(
+    model_id: str,
+    tile: NDArray[np.uint8],
+    timeout_s: float,
+) -> NDArray[np.float16]:
+    h, w = tile.shape[:2]
+    url = f"{_models_base_url()}/{model_id}/"
+    data = lz4.frame.compress(tile.tobytes())
+    response = httpx.post(url, content=data, timeout=httpx.Timeout(timeout_s))
+    response.raise_for_status()
+    return np.frombuffer(
+        lz4.frame.decompress(response.content), dtype=np.float16
+    ).reshape(-1, h, w)
 
 
 def run_binary_classifier_case(
@@ -66,27 +83,11 @@ def run_binary_classifier_case(
     timeout_s: float = 600.0,
     tolerance: float = 0.00001,
 ) -> None:
-    models_base_url = os.environ.get(
-        "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
-    )
-
     tile = _read_tile_from_slide(
         slide_path=slide_path, tile_size=tile_size, level=level
     )
+    actual_score = _classify(model_id=model_id, tile=tile, timeout_s=timeout_s)
 
-    with _client(models_base_url=models_base_url, timeout_s=timeout_s) as client:
-        prediction = client.models.classify_image(
-            model=model_id, image=tile, timeout=timeout_s
-        )
-
-    if not isinstance(prediction, int | float):
-        pytest.fail(
-            "Expected binary classifier to return scalar score, "
-            f"got {type(prediction)}: {prediction}"
-        )
-
-    actual_score = float(prediction)
     assert abs(actual_score - expected_score) <= tolerance, (
         f"Binary score mismatch: expected={expected_score}, actual={actual_score}, "
         f"tolerance={tolerance}"
@@ -103,37 +104,23 @@ def run_semantic_segmentation_case(
     atol: float = 0.0,
     rtol: float = 0.0,
 ) -> None:
-    models_base_url = os.environ.get(
-        "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
-    )
     expected_array_path = Path(expected_array_path)
-
     if not expected_array_path.exists():
-        pytest.fail(f"Expected array file does not exist: {expected_array_path}")
+        pytest.fail(f"Reference file does not exist: {expected_array_path}")
 
     tile = _read_tile_from_slide(
         slide_path=slide_path, tile_size=tile_size, level=level
     )
     expected = np.load(expected_array_path)
-
-    with _client(models_base_url=models_base_url, timeout_s=timeout_s) as client:
-        prediction = client.models.segment_image(
-            model=model_id, image=tile, timeout=timeout_s
-        )
-
-    actual = np.asarray(prediction)
+    actual = _segment(model_id=model_id, tile=tile, timeout_s=timeout_s)
 
     if actual.shape != expected.shape:
-        pytest.fail(
-            f"Semantic shape mismatch: expected={expected.shape}, actual={actual.shape}"
-        )
+        pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
 
     if not np.allclose(actual, expected, rtol=rtol, atol=atol):
         mismatch = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
         pytest.fail(
-            "Semantic output mismatch: arrays differ beyond tolerance "
-            f"(atol={atol}, rtol={rtol}, max_abs_diff={mismatch})"
+            f"Output mismatch beyond tolerance (atol={atol}, rtol={rtol}, max_abs_diff={mismatch})"
         )
 
 
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index f5ea20d..04f831f 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -1,10 +1,10 @@
 import json
-import os
 from pathlib import Path
 
+import httpx
+import lz4.frame
 import numpy as np
-
-from tests.model_snapshots._shared import _client, _read_tile_from_slide
+from _shared import _models_base_url, _read_tile_from_slide
 
 
 OUT_DIR = Path("/mnt/test_refs")
@@ -18,85 +18,50 @@
         "tile_size": 1024,
         "level": 0,
     },
-    {
-        "label": "breast",
-        "slide_path": "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
-        "model_id": "prostate-classifier-1",
-        "type": "binary",
-        "tile_size": 512,
-        "level": 0,
-    },
-    {
-        "label": "colorectum",
-        "slide_path": "/mnt/data/MOU/colorectum/colorectal_cancer_2020-2024-06/2020_00106-01-N.mrxs",
-        "model_id": "episeg-1",
-        "type": "semantic",
-        "tile_size": 1024,
-        "level": 0,
-    },
-    {
-        "label": "colon",
-        "slide_path": "/mnt/data/MOU/colon/comparison_of_scanners/FLASH2021_5638-02-T.mrxs",
-        "model_id": "episeg-1",
-        "type": "semantic",
-        "tile_size": 1024,
-        "level": 0,
-    },
 ]
 
 
 def generate_references() -> None:
-    models_base_url = os.environ.get(
-        "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
-    )
-
+    base_url = _models_base_url()
     OUT_DIR.mkdir(parents=True, exist_ok=True)
-    print(f"== Generating references to {OUT_DIR} via {models_base_url} ==")
-
-    with _client(models_base_url=models_base_url, timeout_s=600) as client:
-        for case in CASES:
-            label = case["label"]
-            model_id = case["model_id"]
-            mtype = case["type"]
-            slide_path = case["slide_path"]
-            tile_size = case["tile_size"]
-            level = case["level"]
-
-            print(f"\nProcessing [{label}] => Model: {model_id} ({mtype})")
-            print(f"Slide path: {slide_path}")
-
-            try:
-                tile = _read_tile_from_slide(
-                    slide_path=slide_path, tile_size=tile_size, level=level
-                )
-            except Exception as e:
-                print(f"  -> Failed to read tile: {e}")
-                continue
-
-            try:
-                if mtype == "binary":
-                    prediction = client.models.classify_image(
-                        model=model_id, image=tile, timeout=600
-                    )
-                    out_file = OUT_DIR / f"{label}_{model_id}_expected.json"
-                    with out_file.open("w") as f:
-                        json.dump({"expected_score": float(prediction)}, f, indent=2)
-                    print(f"  -> SUCCESS! Saved binary score to {out_file}")
-
-                elif mtype == "semantic":
-                    prediction = client.models.segment_image(
-                        model=model_id, image=tile, timeout=600
-                    )
-                    arr = np.asarray(prediction)
-                    out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
-                    np.save(out_file, arr)
-                    print(
-                        f"  -> SUCCESS! Saved semantic array {arr.shape} to {out_file}"
-                    )
-
-            except Exception as e:
-                print(f"  -> ERROR during prediction/saving: {e}")
+    print(f"== Generating references to {OUT_DIR} via {base_url} ==")
+
+    for case in CASES:
+        label, model_id, mtype = case["label"], case["model_id"], case["type"]
+        print(f"\n[{label}] {model_id} ({mtype})")
+
+        try:
+            tile = _read_tile_from_slide(
+                case["slide_path"], case["tile_size"], case["level"]
+            )
+        except Exception as e:
+            print(f"  -> Failed to read tile: {e}")
+            continue
+
+        url = f"{base_url}/{model_id}/"
+        data = lz4.frame.compress(tile.tobytes())
+
+        try:
+            response = httpx.post(url, content=data, timeout=600)
+            response.raise_for_status()
+
+            if mtype == "binary":
+                out_file = OUT_DIR / f"{label}_{model_id}_expected.json"
+                with out_file.open("w") as f:
+                    json.dump({"expected_score": float(response.json())}, f, indent=2)
+                print(f"  -> Saved {out_file}")
+
+            elif mtype == "semantic":
+                h, w = tile.shape[:2]
+                arr = np.frombuffer(
+                    lz4.frame.decompress(response.content), dtype=np.float16
+                ).reshape(-1, h, w)
+                out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
+                np.save(out_file, arr)
+                print(f"  -> Saved {out_file} shape={arr.shape}")
+
+        except Exception as e:
+            print(f"  -> ERROR: {e}")
 
 
 if __name__ == "__main__":
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index cd533f8..87709a7 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -2,16 +2,15 @@
 from pathlib import Path
 
 import pytest
-
-from tests.model_snapshots._shared import run_binary_classifier_case
+from _shared import run_binary_classifier_case
 
 
 @pytest.mark.parametrize(
     "label, slide_path",
     [
         (
-            "breast",
-            "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
+            "colon",
+            "/mnt/data/MOU/colon/comparison_of_scanners/FLASH2021_5638-02-T.mrxs",
         ),
     ],
 )
diff --git a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
index a6301a8..294f682 100644
--- a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
+++ b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
@@ -1,8 +1,7 @@
 from pathlib import Path
 
 import pytest
-
-from tests.model_snapshots._shared import run_semantic_segmentation_case
+from _shared import run_semantic_segmentation_case
 
 
 @pytest.mark.parametrize(
@@ -12,14 +11,6 @@
             "breast",
             "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
         ),
-        (
-            "colorectum",
-            "/mnt/data/MOU/colorectum/colorectal_cancer_2020-2024-06/2020_00106-01-N.mrxs",
-        ),
-        (
-            "colon",
-            "/mnt/data/MOU/colon/comparison_of_scanners/FLASH2021_5638-02-T.mrxs",
-        ),
     ],
 )
 def test_semantic_episeg(label: str, slide_path: str) -> None:

From 115e8a2f32775296b101cafa98859f24d79dd670 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 26 Apr 2026 15:14:40 +0200
Subject: [PATCH 05/35] pyproj

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index df49f5d..57ce2e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,3 +19,6 @@ dependencies = [
 
 [dependency-groups]
 dev = ["mypy>=1.18.2", "ruff>=0.14.6", "pytest>=8.4.2"]
+
+[tool.pytest.ini_options]
+pythonpath = ["tests/model_snapshots"]

From 8352582e3c20b867c9ad9454a9ffe8c96d9b4631 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 26 Apr 2026 16:23:23 +0200
Subject: [PATCH 06/35] better output

---
 builders/test_runner.py          |  5 ++--
 tests/model_snapshots/_shared.py | 42 ++++++++++----------------------
 2 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/builders/test_runner.py b/builders/test_runner.py
index 3a0a578..97f125c 100644
--- a/builders/test_runner.py
+++ b/builders/test_runner.py
@@ -27,15 +27,14 @@ def run(self) -> dict:
                 "tests/model_snapshots/",
                 "-v",
                 "--tb=short",
+                "--no-header",
             ],
             capture_output=True,
             text=True,
         )
         return {
-            "returncode": result.returncode,
-            "stdout": result.stdout,
-            "stderr": result.stderr,
             "passed": result.returncode == 0,
+            "output": result.stdout,
         }
 
 
diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 5edd6c6..6b2b1bc 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -4,8 +4,6 @@
 import os
 from pathlib import Path
 
-import httpx
-import lz4.frame
 import numpy as np
 import pytest
 from numpy.typing import NDArray
@@ -22,7 +20,7 @@ def _sha256(path: Path) -> str:
 def _models_base_url() -> str:
     return os.environ.get(
         "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
     )
 
 
@@ -47,31 +45,13 @@ def _read_tile_from_slide(
     return np.asarray(tile, dtype=np.uint8)
 
 
-def _classify(
-    model_id: str,
-    tile: NDArray[np.uint8],
-    timeout_s: float,
-) -> float:
-    url = f"{_models_base_url()}/{model_id}/"
-    data = lz4.frame.compress(tile.tobytes())
-    response = httpx.post(url, content=data, timeout=httpx.Timeout(timeout_s))
-    response.raise_for_status()
-    return float(response.json())
+def _client(timeout_s: float = 600.0):
+    try:
+        from rationai import Client
+    except ImportError:
+        pytest.skip("Python package `rationai` is not installed.")
 
-
-def _segment(
-    model_id: str,
-    tile: NDArray[np.uint8],
-    timeout_s: float,
-) -> NDArray[np.float16]:
-    h, w = tile.shape[:2]
-    url = f"{_models_base_url()}/{model_id}/"
-    data = lz4.frame.compress(tile.tobytes())
-    response = httpx.post(url, content=data, timeout=httpx.Timeout(timeout_s))
-    response.raise_for_status()
-    return np.frombuffer(
-        lz4.frame.decompress(response.content), dtype=np.float16
-    ).reshape(-1, h, w)
+    return Client(models_base_url=_models_base_url(), timeout=timeout_s)
 
 
 def run_binary_classifier_case(
@@ -86,7 +66,9 @@ def run_binary_classifier_case(
     tile = _read_tile_from_slide(
         slide_path=slide_path, tile_size=tile_size, level=level
     )
-    actual_score = _classify(model_id=model_id, tile=tile, timeout_s=timeout_s)
+
+    with _client(timeout_s=timeout_s) as client:
+        actual_score = float(client.models.classify_image(model=model_id, image=tile))
 
     assert abs(actual_score - expected_score) <= tolerance, (
         f"Binary score mismatch: expected={expected_score}, actual={actual_score}, "
@@ -112,7 +94,9 @@ def run_semantic_segmentation_case(
         slide_path=slide_path, tile_size=tile_size, level=level
     )
     expected = np.load(expected_array_path)
-    actual = _segment(model_id=model_id, tile=tile, timeout_s=timeout_s)
+
+    with _client(timeout_s=timeout_s) as client:
+        actual = np.asarray(client.models.segment_image(model=model_id, image=tile))
 
     if actual.shape != expected.shape:
         pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")

From b020ebcc1dd7e57a54c37c1a513032feb661751f Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 26 Apr 2026 17:37:48 +0200
Subject: [PATCH 07/35] fixes

---
 helm/rayservice/applications/test-runner.yaml | 16 ++++++++++++++++
 helm/rayservice/values.yaml                   |  1 +
 helm/rayservice/workers/cpu-workers.yaml      |  5 +++++
 helm/rayservice/workers/mig20-workers.yaml    |  5 +++++
 pvc/model-test-refs-pvc.yaml                  |  4 ++--
 tests/model_snapshots/_shared.py              |  6 +++---
 6 files changed, 32 insertions(+), 5 deletions(-)
 create mode 100644 helm/rayservice/applications/test-runner.yaml

diff --git a/helm/rayservice/applications/test-runner.yaml b/helm/rayservice/applications/test-runner.yaml
new file mode 100644
index 0000000..45f5b68
--- /dev/null
+++ b/helm/rayservice/applications/test-runner.yaml
@@ -0,0 +1,16 @@
+- name: test-runner
+  import_path: builders.test_runner:app
+  route_prefix: /run-tests
+  runtime_env:
+    working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/tests.zip?v5
+    pip:
+      - git+https://github.com/RationAI/rationai-sdk-python.git
+  deployments:
+    - name: TestRunner
+      autoscaling_config:
+        min_replicas: 0
+        max_replicas: 1
+        target_ongoing_requests: 1
+      ray_actor_options:
+        num_cpus: 4
+        memory: 8589934592
diff --git a/helm/rayservice/values.yaml b/helm/rayservice/values.yaml
index b6e24b7..7b90191 100644
--- a/helm/rayservice/values.yaml
+++ b/helm/rayservice/values.yaml
@@ -7,3 +7,4 @@ applications:
   - heatmap-builder
   - prostate-classifier-1
   - virchow2
+  - test-runner
diff --git a/helm/rayservice/workers/cpu-workers.yaml b/helm/rayservice/workers/cpu-workers.yaml
index 0c98d85..eda5986 100644
--- a/helm/rayservice/workers/cpu-workers.yaml
+++ b/helm/rayservice/workers/cpu-workers.yaml
@@ -45,6 +45,8 @@ template:
             mountPath: /mnt/cache
           - name: huggingface-cache
             mountPath: /mnt/huggingface_cache
+          - name: test-refs
+            mountPath: /mnt/test_refs
     volumes:
       - name: data
         persistentVolumeClaim:
@@ -64,3 +66,6 @@ template:
       - name: huggingface-cache
         persistentVolumeClaim:
           claimName: huggingface-cache-pvc
+      - name: test-refs
+        persistentVolumeClaim:
+          claimName: model-test-refs-pvc
diff --git a/helm/rayservice/workers/mig20-workers.yaml b/helm/rayservice/workers/mig20-workers.yaml
index 77032d9..86571c7 100644
--- a/helm/rayservice/workers/mig20-workers.yaml
+++ b/helm/rayservice/workers/mig20-workers.yaml
@@ -55,6 +55,8 @@ template:
             mountPath: /mnt/cache
           - name: huggingface-cache
             mountPath: /mnt/huggingface_cache
+          - name: test-refs
+            mountPath: /mnt/test_refs
     volumes:
       - name: data
         persistentVolumeClaim:
@@ -74,3 +76,6 @@ template:
       - name: huggingface-cache
         persistentVolumeClaim:
           claimName: huggingface-cache-pvc
+      - name: test-refs
+        persistentVolumeClaim:
+          claimName: model-test-refs-pvc
diff --git a/pvc/model-test-refs-pvc.yaml b/pvc/model-test-refs-pvc.yaml
index 4b3390a..ae08c4e 100644
--- a/pvc/model-test-refs-pvc.yaml
+++ b/pvc/model-test-refs-pvc.yaml
@@ -2,11 +2,11 @@ apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
   name: model-test-refs-pvc
-  labels:
-    app: model-service-tests
+  namespace: rationai-jobs-ns
 spec:
   accessModes:
     - ReadWriteMany
   resources:
     requests:
       storage: 5Gi
+  storageClassName: nfs-csi
diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 6b2b1bc..99347b4 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -20,7 +20,7 @@ def _sha256(path: Path) -> str:
 def _models_base_url() -> str:
     return os.environ.get(
         "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+        "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
     )
 
 
@@ -45,7 +45,7 @@ def _read_tile_from_slide(
     return np.asarray(tile, dtype=np.uint8)
 
 
-def _client(timeout_s: float = 600.0):
+def _client(timeout_s: float = 1600.0):
     try:
         from rationai import Client
     except ImportError:
@@ -82,7 +82,7 @@ def run_semantic_segmentation_case(
     expected_array_path: Path | str,
     tile_size: int = 1024,
     level: int = 0,
-    timeout_s: float = 600.0,
+    timeout_s: float = 1200.0,
     atol: float = 0.0,
     rtol: float = 0.0,
 ) -> None:

From b74de01ba2fffb7f19dc1ff5f4bd80ac8b23a0ed Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 26 Apr 2026 18:51:21 +0200
Subject: [PATCH 08/35] sdk use

---
 tests/model_snapshots/_shared.py             | 77 ++++++++-----------
 tests/model_snapshots/generate_references.py | 78 ++++++++++----------
 2 files changed, 73 insertions(+), 82 deletions(-)

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 99347b4..187aaa6 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -1,22 +1,14 @@
 from __future__ import annotations
 
-import hashlib
 import os
 from pathlib import Path
+from time import perf_counter
 
 import numpy as np
 import pytest
 from numpy.typing import NDArray
 
 
-def _sha256(path: Path) -> str:
-    digest = hashlib.sha256()
-    with path.open("rb") as fh:
-        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
-            digest.update(chunk)
-    return digest.hexdigest()
-
-
 def _models_base_url() -> str:
     return os.environ.get(
         "MODEL_SERVICE_MODELS_BASE_URL",
@@ -24,20 +16,24 @@ def _models_base_url() -> str:
     )
 
 
-def _read_tile_from_slide(
-    slide_path: str,
-    tile_size: int,
-    level: int,
-) -> NDArray[np.uint8]:
+def _client(timeout_s: float = 1200.0):
+    try:
+        from rationai import Client
+    except ImportError:
+        pytest.skip("Python package `rationai` is not installed.")
+    return Client(models_base_url=_models_base_url(), timeout=timeout_s)
+
+
+def _read_tile(slide_path: str, tile_size: int, level: int) -> NDArray[np.uint8]:
     try:
         from ratiopath.openslide import OpenSlide
     except ImportError:
         pytest.skip("Python package `ratiopath` is not installed.")
 
     with OpenSlide(slide_path) as slide:
-        extent_x, extent_y = slide.level_dimensions[level]
-        x = max(0, (extent_x - tile_size) // 2)
-        y = max(0, (extent_y - tile_size) // 2)
+        w, h = slide.level_dimensions[level]
+        x = max(0, (w - tile_size) // 2)
+        y = max(0, (h - tile_size) // 2)
         tile = slide.read_region_relative(
             (x, y), level, (tile_size, tile_size)
         ).convert("RGB")
@@ -45,15 +41,6 @@ def _read_tile_from_slide(
     return np.asarray(tile, dtype=np.uint8)
 
 
-def _client(timeout_s: float = 1600.0):
-    try:
-        from rationai import Client
-    except ImportError:
-        pytest.skip("Python package `rationai` is not installed.")
-
-    return Client(models_base_url=_models_base_url(), timeout=timeout_s)
-
-
 def run_binary_classifier_case(
     model_id: str,
     slide_path: str,
@@ -63,16 +50,19 @@ def run_binary_classifier_case(
     timeout_s: float = 600.0,
     tolerance: float = 0.00001,
 ) -> None:
-    tile = _read_tile_from_slide(
-        slide_path=slide_path, tile_size=tile_size, level=level
-    )
+    tile = _read_tile(slide_path, tile_size, level)
 
-    with _client(timeout_s=timeout_s) as client:
+    with _client(timeout_s) as client:
+        t0 = perf_counter()
         actual_score = float(client.models.classify_image(model=model_id, image=tile))
+        elapsed = perf_counter() - t0
+
+    print(
+        f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | score={actual_score:.6f} | expected={expected_score:.6f}"
+    )
 
     assert abs(actual_score - expected_score) <= tolerance, (
-        f"Binary score mismatch: expected={expected_score}, actual={actual_score}, "
-        f"tolerance={tolerance}"
+        f"Binary score mismatch: expected={expected_score}, actual={actual_score}, tolerance={tolerance}"
     )
 
 
@@ -90,26 +80,23 @@ def run_semantic_segmentation_case(
     if not expected_array_path.exists():
         pytest.fail(f"Reference file does not exist: {expected_array_path}")
 
-    tile = _read_tile_from_slide(
-        slide_path=slide_path, tile_size=tile_size, level=level
-    )
+    tile = _read_tile(slide_path, tile_size, level)
     expected = np.load(expected_array_path)
 
-    with _client(timeout_s=timeout_s) as client:
+    with _client(timeout_s) as client:
+        t0 = perf_counter()
         actual = np.asarray(client.models.segment_image(model=model_id, image=tile))
+        elapsed = perf_counter() - t0
+
+    max_diff = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
+    print(
+        f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | shape={actual.shape} | max_diff={max_diff:.6f}"
+    )
 
     if actual.shape != expected.shape:
         pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
 
     if not np.allclose(actual, expected, rtol=rtol, atol=atol):
-        mismatch = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
         pytest.fail(
-            f"Output mismatch beyond tolerance (atol={atol}, rtol={rtol}, max_abs_diff={mismatch})"
+            f"Output mismatch beyond tolerance (atol={atol}, rtol={rtol}, max_abs_diff={max_diff})"
         )
-
-
-def verify_file_hash(path: Path, expected_hash: str) -> None:
-    actual_hash = _sha256(path)
-    assert actual_hash == expected_hash, (
-        f"Hash mismatch for {path}: expected={expected_hash}, actual={actual_hash}"
-    )
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 04f831f..d85cec5 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -1,10 +1,8 @@
 import json
 from pathlib import Path
 
-import httpx
-import lz4.frame
 import numpy as np
-from _shared import _models_base_url, _read_tile_from_slide
+from _shared import _client, _models_base_url, _read_tile
 
 
 OUT_DIR = Path("/mnt/test_refs")
@@ -18,50 +16,56 @@
         "tile_size": 1024,
         "level": 0,
     },
+    {
+        "label": "colon",
+        "slide_path": "/mnt/data/MOU/colon/comparison_of_scanners/FLASH2021_5638-02-T.mrxs",
+        "model_id": "prostate-classifier-1",
+        "type": "binary",
+        "tile_size": 512,
+        "level": 0,
+    },
 ]
 
 
 def generate_references() -> None:
-    base_url = _models_base_url()
     OUT_DIR.mkdir(parents=True, exist_ok=True)
-    print(f"== Generating references to {OUT_DIR} via {base_url} ==")
-
-    for case in CASES:
-        label, model_id, mtype = case["label"], case["model_id"], case["type"]
-        print(f"\n[{label}] {model_id} ({mtype})")
-
-        try:
-            tile = _read_tile_from_slide(
-                case["slide_path"], case["tile_size"], case["level"]
-            )
-        except Exception as e:
-            print(f"  -> Failed to read tile: {e}")
-            continue
+    print(f"== Generating references to {OUT_DIR} via {_models_base_url()} ==")
 
-        url = f"{base_url}/{model_id}/"
-        data = lz4.frame.compress(tile.tobytes())
+    with _client(timeout_s=1200) as client:
+        for case in CASES:
+            label, model_id, mtype = case["label"], case["model_id"], case["type"]
+            print(f"\n[{label}] {model_id} ({mtype})")
 
-        try:
-            response = httpx.post(url, content=data, timeout=600)
-            response.raise_for_status()
+            try:
+                tile = _read_tile(case["slide_path"], case["tile_size"], case["level"])
+            except Exception as e:
+                print(f"  -> Failed to read tile: {e}")
+                continue
 
-            if mtype == "binary":
-                out_file = OUT_DIR / f"{label}_{model_id}_expected.json"
-                with out_file.open("w") as f:
-                    json.dump({"expected_score": float(response.json())}, f, indent=2)
-                print(f"  -> Saved {out_file}")
+            try:
+                if mtype == "binary":
+                    score = float(
+                        client.models.classify_image(
+                            model=model_id, image=tile, timeout=600
+                        )
+                    )
+                    out_file = OUT_DIR / f"{label}_{model_id}_expected.json"
+                    with out_file.open("w") as f:
+                        json.dump({"expected_score": score}, f, indent=2)
+                    print(f"  -> Saved {out_file}")
 
-            elif mtype == "semantic":
-                h, w = tile.shape[:2]
-                arr = np.frombuffer(
-                    lz4.frame.decompress(response.content), dtype=np.float16
-                ).reshape(-1, h, w)
-                out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
-                np.save(out_file, arr)
-                print(f"  -> Saved {out_file} shape={arr.shape}")
+                elif mtype == "semantic":
+                    arr = np.asarray(
+                        client.models.segment_image(
+                            model=model_id, image=tile, timeout=1200
+                        )
+                    )
+                    out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
+                    np.save(out_file, arr)
+                    print(f"  -> Saved {out_file} shape={arr.shape}")
 
-        except Exception as e:
-            print(f"  -> ERROR: {e}")
+            except Exception as e:
+                print(f"  -> ERROR: {e}")
 
 
 if __name__ == "__main__":

From 50eb79bb763ab5f75afc99412de2e266f0f11f63 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 26 Apr 2026 18:54:13 +0200
Subject: [PATCH 09/35] better print

---
 builders/test_runner.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/builders/test_runner.py b/builders/test_runner.py
index 97f125c..b818eb4 100644
--- a/builders/test_runner.py
+++ b/builders/test_runner.py
@@ -18,7 +18,7 @@ def __init__(self) -> None:
         )
 
     @fastapi.post("/")
-    def run(self) -> dict:
+    def run(self) -> str:
         result = subprocess.run(
             [
                 sys.executable,
@@ -28,14 +28,15 @@ def run(self) -> dict:
                 "-v",
                 "--tb=short",
                 "--no-header",
+                "-s",
+                "--color=no",
             ],
             capture_output=True,
             text=True,
         )
-        return {
-            "passed": result.returncode == 0,
-            "output": result.stdout,
-        }
+        return result.stdout + (
+            f"\nSTDERR:\n{result.stderr}" if result.returncode != 0 else ""
+        )
 
 
 app = TestRunner.bind()

From e855481492699133ad1c04f2804d2b2944b5d709 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Tue, 28 Apr 2026 21:21:31 +0200
Subject: [PATCH 10/35] better print

---
 builders/test_runner.py                      | 25 +++++++++++++++++---
 tests/model_snapshots/_shared.py             | 20 ++++------------
 tests/model_snapshots/generate_references.py | 12 +++++++---
 3 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/builders/test_runner.py b/builders/test_runner.py
index b818eb4..96f947c 100644
--- a/builders/test_runner.py
+++ b/builders/test_runner.py
@@ -1,6 +1,7 @@
 import subprocess
 import sys
 
+import requests
 from fastapi import FastAPI
 from ray import serve
 
@@ -17,8 +18,24 @@ def __init__(self) -> None:
             check=True,
         )
 
+    def _model_statuses(self) -> str:
+        try:
+            resp = requests.get("http://localhost:52365/api/serve/applications/")
+            resp.raise_for_status()
+            data = resp.json()
+            lines = ["Model statuses:"]
+            for app_name, app_info in data.get("applications", {}).items():
+                for dep_name, dep_info in app_info.get("deployments", {}).items():
+                    status = dep_info.get("status", "UNKNOWN")
+                    lines.append(f"  {app_name} ({dep_name}): {status}")
+            return "\n".join(lines)
+        except Exception as e:
+            return f"Could not fetch model statuses: {e}"
+
     @fastapi.post("/")
     def run(self) -> str:
+        statuses = self._model_statuses()
+
         result = subprocess.run(
             [
                 sys.executable,
@@ -34,9 +51,11 @@ def run(self) -> str:
             capture_output=True,
             text=True,
         )
-        return result.stdout + (
-            f"\nSTDERR:\n{result.stderr}" if result.returncode != 0 else ""
-        )
+
+        output = statuses + "\n\n" + result.stdout
+        if result.returncode != 0:
+            output += f"\nSTDERR:\n{result.stderr}"
+        return output
 
 
 app = TestRunner.bind()
diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 187aaa6..8f560fd 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -7,6 +7,8 @@
 import numpy as np
 import pytest
 from numpy.typing import NDArray
+from rationai import Client
+from ratiopath.openslide import OpenSlide
 
 
 def _models_base_url() -> str:
@@ -16,20 +18,7 @@ def _models_base_url() -> str:
     )
 
 
-def _client(timeout_s: float = 1200.0):
-    try:
-        from rationai import Client
-    except ImportError:
-        pytest.skip("Python package `rationai` is not installed.")
-    return Client(models_base_url=_models_base_url(), timeout=timeout_s)
-
-
 def _read_tile(slide_path: str, tile_size: int, level: int) -> NDArray[np.uint8]:
-    try:
-        from ratiopath.openslide import OpenSlide
-    except ImportError:
-        pytest.skip("Python package `ratiopath` is not installed.")
-
     with OpenSlide(slide_path) as slide:
         w, h = slide.level_dimensions[level]
         x = max(0, (w - tile_size) // 2)
@@ -37,7 +26,6 @@ def _read_tile(slide_path: str, tile_size: int, level: int) -> NDArray[np.uint8]
         tile = slide.read_region_relative(
             (x, y), level, (tile_size, tile_size)
         ).convert("RGB")
-
     return np.asarray(tile, dtype=np.uint8)
 
 
@@ -52,7 +40,7 @@ def run_binary_classifier_case(
 ) -> None:
     tile = _read_tile(slide_path, tile_size, level)
 
-    with _client(timeout_s) as client:
+    with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
         t0 = perf_counter()
         actual_score = float(client.models.classify_image(model=model_id, image=tile))
         elapsed = perf_counter() - t0
@@ -83,7 +71,7 @@ def run_semantic_segmentation_case(
     tile = _read_tile(slide_path, tile_size, level)
     expected = np.load(expected_array_path)
 
-    with _client(timeout_s) as client:
+    with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
         t0 = perf_counter()
         actual = np.asarray(client.models.segment_image(model=model_id, image=tile))
         elapsed = perf_counter() - t0
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index d85cec5..35b3840 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -1,11 +1,17 @@
 import json
+import os
 from pathlib import Path
 
 import numpy as np
-from _shared import _client, _models_base_url, _read_tile
+from _shared import _read_tile
+from rationai import Client
 
 
 OUT_DIR = Path("/mnt/test_refs")
+MODELS_BASE_URL = os.environ.get(
+    "MODEL_SERVICE_MODELS_BASE_URL",
+    "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+)
 
 CASES = [
     {
@@ -29,9 +35,9 @@
 
 def generate_references() -> None:
     OUT_DIR.mkdir(parents=True, exist_ok=True)
-    print(f"== Generating references to {OUT_DIR} via {_models_base_url()} ==")
+    print(f"== Generating references to {OUT_DIR} via {MODELS_BASE_URL} ==")
 
-    with _client(timeout_s=1200) as client:
+    with Client(models_base_url=MODELS_BASE_URL, timeout=1200) as client:
         for case in CASES:
             label, model_id, mtype = case["label"], case["model_id"], case["type"]
             print(f"\n[{label}] {model_id} ({mtype})")

From 5e7654d234765421d4ad423ad2811e5666132fd6 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Tue, 28 Apr 2026 21:49:46 +0200
Subject: [PATCH 11/35] fix port

---
 builders/test_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/builders/test_runner.py b/builders/test_runner.py
index 96f947c..5d6ef2b 100644
--- a/builders/test_runner.py
+++ b/builders/test_runner.py
@@ -20,7 +20,7 @@ def __init__(self) -> None:
 
     def _model_statuses(self) -> str:
         try:
-            resp = requests.get("http://localhost:52365/api/serve/applications/")
+            resp = requests.get("http://localhost:8265/api/serve/applications/")
             resp.raise_for_status()
             data = resp.json()
             lines = ["Model statuses:"]

From e7a7987684702469dc2087d9cd3624b1f644006f Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Mon, 4 May 2026 19:58:17 +0200
Subject: [PATCH 12/35] fix: change middle to x,y points

---
 tests/model_snapshots/_shared.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 8f560fd..a0581a7 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -18,13 +18,15 @@ def _models_base_url() -> str:
     )
 
 
-def _read_tile(slide_path: str, tile_size: int, level: int) -> NDArray[np.uint8]:
+def _read_tile_at(
+    slide_path: str, x: int, y: int, tile_size: int, level: int
+) -> NDArray[np.uint8]:
     with OpenSlide(slide_path) as slide:
-        w, h = slide.level_dimensions[level]
-        x = max(0, (w - tile_size) // 2)
-        y = max(0, (h - tile_size) // 2)
+        downsample = slide.level_downsamples[level]
+        x_rel = int(x / downsample)
+        y_rel = int(y / downsample)
         tile = slide.read_region_relative(
-            (x, y), level, (tile_size, tile_size)
+            (x_rel, y_rel), level, (tile_size, tile_size)
         ).convert("RGB")
     return np.asarray(tile, dtype=np.uint8)
 
@@ -32,13 +34,15 @@ def _read_tile(slide_path: str, tile_size: int, level: int) -> NDArray[np.uint8]
 def run_binary_classifier_case(
     model_id: str,
     slide_path: str,
+    x: int,
+    y: int,
     expected_score: float,
     tile_size: int = 512,
     level: int = 0,
     timeout_s: float = 600.0,
     tolerance: float = 0.00001,
 ) -> None:
-    tile = _read_tile(slide_path, tile_size, level)
+    tile = _read_tile_at(slide_path, x, y, tile_size, level)
 
     with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
         t0 = perf_counter()
@@ -57,6 +61,8 @@ def run_binary_classifier_case(
 def run_semantic_segmentation_case(
     model_id: str,
     slide_path: str,
+    x: int,
+    y: int,
     expected_array_path: Path | str,
     tile_size: int = 1024,
     level: int = 0,
@@ -68,7 +74,7 @@ def run_semantic_segmentation_case(
     if not expected_array_path.exists():
         pytest.fail(f"Reference file does not exist: {expected_array_path}")
 
-    tile = _read_tile(slide_path, tile_size, level)
+    tile = _read_tile_at(slide_path, x, y, tile_size, level)
     expected = np.load(expected_array_path)
 
     with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:

From c055043a1144eabbe5336489c7e54bbf3c773704 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Mon, 4 May 2026 19:58:36 +0200
Subject: [PATCH 13/35] generate new refs

---
 tests/model_snapshots/generate_references.py | 24 ++++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 35b3840..5b6822b 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 import numpy as np
-from _shared import _read_tile
+from _shared import _read_tile_at
 from rationai import Client
 
 
@@ -15,20 +15,14 @@
 
 CASES = [
     {
-        "label": "breast",
-        "slide_path": "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
-        "model_id": "episeg-1",
-        "type": "semantic",
-        "tile_size": 1024,
-        "level": 0,
-    },
-    {
-        "label": "colon",
-        "slide_path": "/mnt/data/MOU/colon/comparison_of_scanners/FLASH2021_5638-02-T.mrxs",
+        "label": "prostate_positive",
+        "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs",
         "model_id": "prostate-classifier-1",
         "type": "binary",
         "tile_size": 512,
         "level": 0,
+        "x": 43390,
+        "y": 45865,
     },
 ]
 
@@ -43,7 +37,13 @@ def generate_references() -> None:
             print(f"\n[{label}] {model_id} ({mtype})")
 
             try:
-                tile = _read_tile(case["slide_path"], case["tile_size"], case["level"])
+                tile = _read_tile_at(
+                    case["slide_path"],
+                    case["x"],
+                    case["y"],
+                    case["tile_size"],
+                    case["level"],
+                )
             except Exception as e:
                 print(f"  -> Failed to read tile: {e}")
                 continue

From fa43f530ecddc05ed67e0154104ce42cb148f721 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Mon, 4 May 2026 19:58:56 +0200
Subject: [PATCH 14/35] test: new test

---
 .../test_binary_classifier_model_snapshot.py  | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index 87709a7..7060927 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -6,29 +6,40 @@
 
 
 @pytest.mark.parametrize(
-    "label, slide_path",
+    "label, slide_path, x, y",
     [
         (
-            "colon",
-            "/mnt/data/MOU/colon/comparison_of_scanners/FLASH2021_5638-02-T.mrxs",
+            "prostate_positive",
+            "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs",
+            43390,
+            45865,
         ),
     ],
 )
-def test_binary_classifier(label: str, slide_path: str) -> None:
+def test_prostate_classifier_positive(
+    label: str, slide_path: str, x: int, y: int
+) -> None:
     model_id = "prostate-classifier-1"
     json_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.json")
 
-    if json_path.exists():
-        with json_path.open() as f:
-            expected_score = json.load(f)["expected_score"]
-    else:
+    if not json_path.exists():
         pytest.skip(
             f"Reference file {json_path} missing. Run generate_references.py first."
         )
 
+    with json_path.open() as f:
+        expected_score = json.load(f)["expected_score"]
+
+    assert expected_score >= 0.5, (
+        f"Reference score {expected_score:.4f} is below positive threshold 0.5 — "
+        "was the reference generated on the correct tile?"
+    )
+
     run_binary_classifier_case(
         model_id=model_id,
         slide_path=slide_path,
+        x=x,
+        y=y,
         expected_score=expected_score,
         tile_size=512,
         level=0,

From 968af4f5376758de6b7d32aa7f89ca671424a8f9 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Mon, 4 May 2026 21:45:40 +0200
Subject: [PATCH 15/35] test fixes

Co-authored-by: Copilot <copilot@github.com>
---
 builders/test_runner.py                       | 18 +---
 helm/rayservice/applications/test-runner.yaml |  6 +-
 tests/README-model-snapshots.md               | 85 -------------------
 tests/model_snapshots/_shared.py              | 46 ++++++++++
 tests/model_snapshots/generate_references.py  | 38 ++++++++-
 .../test_binary_classifier_model_snapshot.py  | 49 +++++++++--
 6 files changed, 128 insertions(+), 114 deletions(-)
 delete mode 100644 tests/README-model-snapshots.md

diff --git a/builders/test_runner.py b/builders/test_runner.py
index 5d6ef2b..01d7bdb 100644
--- a/builders/test_runner.py
+++ b/builders/test_runner.py
@@ -1,7 +1,6 @@
 import subprocess
 import sys
 
-import requests
 from fastapi import FastAPI
 from ray import serve
 
@@ -18,23 +17,8 @@ def __init__(self) -> None:
             check=True,
         )
 
-    def _model_statuses(self) -> str:
-        try:
-            resp = requests.get("http://localhost:8265/api/serve/applications/")
-            resp.raise_for_status()
-            data = resp.json()
-            lines = ["Model statuses:"]
-            for app_name, app_info in data.get("applications", {}).items():
-                for dep_name, dep_info in app_info.get("deployments", {}).items():
-                    status = dep_info.get("status", "UNKNOWN")
-                    lines.append(f"  {app_name} ({dep_name}): {status}")
-            return "\n".join(lines)
-        except Exception as e:
-            return f"Could not fetch model statuses: {e}"
-
     @fastapi.post("/")
     def run(self) -> str:
-        statuses = self._model_statuses()
 
         result = subprocess.run(
             [
@@ -52,7 +36,7 @@ def run(self) -> str:
             text=True,
         )
 
-        output = statuses + "\n\n" + result.stdout
+        output = result.stdout
         if result.returncode != 0:
             output += f"\nSTDERR:\n{result.stderr}"
         return output
diff --git a/helm/rayservice/applications/test-runner.yaml b/helm/rayservice/applications/test-runner.yaml
index 45f5b68..3aad245 100644
--- a/helm/rayservice/applications/test-runner.yaml
+++ b/helm/rayservice/applications/test-runner.yaml
@@ -2,7 +2,7 @@
   import_path: builders.test_runner:app
   route_prefix: /run-tests
   runtime_env:
-    working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/tests.zip?v5
+    working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/tests.zip?v10
     pip:
       - git+https://github.com/RationAI/rationai-sdk-python.git
   deployments:
@@ -12,5 +12,5 @@
         max_replicas: 1
         target_ongoing_requests: 1
       ray_actor_options:
-        num_cpus: 4
-        memory: 8589934592
+        num_cpus: 1
+        memory: 2147483648
diff --git a/tests/README-model-snapshots.md b/tests/README-model-snapshots.md
deleted file mode 100644
index 3d2fb7b..0000000
--- a/tests/README-model-snapshots.md
+++ /dev/null
@@ -1,85 +0,0 @@
-# Model snapshot tests
-
-This repository contains end-to-end snapshot tests in `tests/model_snapshots/`.
-
-Per-model test files:
-
-- `tests/model_snapshots/test_binary_classifier_model_snapshot.py`
-- `tests/model_snapshots/test_semantic_segmentation_model_snapshot.py`
-
-Shared files:
-
-- `tests/model_snapshots/_shared.py`
-- `tests/model_snapshots/run_all_model_snapshot_tests.py`
-
-These tests are meant as post-deploy use-case checks (not only liveness checks):
-
-- they execute a real request path through Ray Serve deployment
-- they verify request processing success (timeouts/errors fail the test)
-- they verify result correctness for each deployment (`binary_classifier`, `semantic_segmentation`)
-- they touch real slide paths, helping catch mount/filesystem issues
-
-Each test calls its deployment-specific endpoint:
-
-- binary classifier: SDK call `client.models.classify_image("prostate-classifier-1", tile)`
-- semantic segmentation: SDK call `client.models.segment_image("episeg-1", tile)`
-
-Input tile is read directly from a real WSI using `ratiopath.openslide.OpenSlide`.
-
-## Adding a new model test
-
-Přidání nového modelu do testů je nyní velmi jednoduché:
-
-1. Vytvořte nový soubor v `tests/model_snapshots/`, např. `test_novy_model_snapshot.py`.
-2. Importujte a zavolejte příslušnou case funkci z `_shared.py` a předejte jí konfiguraci napřímo parametrem:
-
-```python
-from pathlib import Path
-from tests.model_snapshots._shared import run_binary_classifier_case
-
-def test_novy_model_snapshot() -> None:
-    # Parametry si rovnou zadefinujte v testovacím souboru
-    run_binary_classifier_case(
-        model_id="my-new-endpoint-id",
-        slide_path="/mnt/bioptic_tree/.../slide.mrxs",
-        expected_score=0.987,
-        tile_size=512,
-        level=0,
-    )
-```
-
-Tím se stane automaticky součástí sady `pytest tests/model_snapshots`.
-
-## Global environment variables
-
-Common (pro celý cluster a všechny testy):
-
-- `MODEL_SERVICE_MODELS_BASE_URL` (default: `http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000`)
-
-Očekávané skóre/pole a cesty k datasetům pro stávající modely (`episeg-1` a `prostate-classifier-1`) se tahají z těchto proměnných ve stávajících testovacích souborech, pokud chcete zachovat původní CI chování (případně se dají časem snadno zahardkódit do testovacího souboru):
-
-- `MODEL_TEST_BINARY_EXPECTED_SCORE`
-- `MODEL_TEST_SEMANTIC_EXPECTED_ARRAY_PATH`
-
-## Example (PowerShell)
-
-```powershell
-$env:MODEL_TEST_BINARY_EXPECTED_SCORE = "0.9732"
-$env:MODEL_TEST_SEMANTIC_EXPECTED_ARRAY_PATH = "/mnt/path/to/reference/semantic_expected.npy"
-
-# Models base URL is resolved directly from SDK fallback inside kubernetes:
-# http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000
-
-python tests/model_snapshots/run_all_model_snapshot_tests.py
-
-# Alternative:
-python -m pytest tests/model_snapshots -q
-```
-
-## SDK dependency
-
-Install SDK package so that `import rationai` works in tests, e.g.:
-
-```powershell
-python -m pip install git+https://github.com/RationAI/rationai-sdk-python.git
-```
diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index a0581a7..260c42e 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -41,6 +41,8 @@ def run_binary_classifier_case(
     level: int = 0,
     timeout_s: float = 600.0,
     tolerance: float = 0.00001,
+    expected_is_positive: bool | None = None,
+    threshold: float = 0.5,
 ) -> None:
     tile = _read_tile_at(slide_path, x, y, tile_size, level)
 
@@ -53,6 +55,14 @@ def run_binary_classifier_case(
         f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | score={actual_score:.6f} | expected={expected_score:.6f}"
     )
 
+    if expected_is_positive is not None:
+        actual_is_positive = actual_score >= threshold
+        assert actual_is_positive == expected_is_positive, (
+            "Binary class mismatch: "
+            f"expected_is_positive={expected_is_positive}, "
+            f"actual_score={actual_score:.6f}, threshold={threshold:.3f}"
+        )
+
     assert abs(actual_score - expected_score) <= tolerance, (
         f"Binary score mismatch: expected={expected_score}, actual={actual_score}, tolerance={tolerance}"
     )
@@ -94,3 +104,39 @@ def run_semantic_segmentation_case(
         pytest.fail(
             f"Output mismatch beyond tolerance (atol={atol}, rtol={rtol}, max_abs_diff={max_diff})"
         )
+
+
+def run_embed_case(
+    model_id: str,
+    slide_path: str,
+    expected_array_path: Path | str,
+    tile_size: int = 224,
+    level: int = 0,
+    timeout_s: float = 1200.0,
+    atol: float = 0.0,
+    rtol: float = 0.0,
+) -> None:
+    expected_array_path = Path(expected_array_path)
+    if not expected_array_path.exists():
+        pytest.fail(f"Reference file does not exist: {expected_array_path}")
+
+    tile = _read_tile(slide_path, tile_size, level)
+    expected = np.load(expected_array_path)
+
+    with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
+        t0 = perf_counter()
+        actual = np.asarray(client.models.embed_image(model=model_id, image=tile))
+        elapsed = perf_counter() - t0
+
+    max_diff = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
+    print(
+        f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | shape={actual.shape} | max_diff={max_diff:.6f}"
+    )
+
+    if actual.shape != expected.shape:
+        pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
+
+    if not np.allclose(actual, expected, rtol=rtol, atol=atol):
+        pytest.fail(
+            f"Output mismatch beyond tolerance (atol={atol}, rtol={rtol}, max_abs_diff={max_diff})"
+        )
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 5b6822b..c7251a8 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -12,6 +12,7 @@
     "MODEL_SERVICE_MODELS_BASE_URL",
     "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
 )
+BINARY_POSITIVE_THRESHOLD = 0.5
 
 CASES = [
     {
@@ -24,6 +25,16 @@
         "x": 43390,
         "y": 45865,
     },
+    {
+        "label": "prostate_negative",
+        "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs",
+        "model_id": "prostate-classifier-1",
+        "type": "binary",
+        "tile_size": 512,
+        "level": 0,
+        "x": 34467,
+        "y": 104964,
+    },
 ]
 
 
@@ -57,7 +68,23 @@ def generate_references() -> None:
                     )
                     out_file = OUT_DIR / f"{label}_{model_id}_expected.json"
                     with out_file.open("w") as f:
-                        json.dump({"expected_score": score}, f, indent=2)
+                        json.dump(
+                            {
+                                "label": label,
+                                "model_id": model_id,
+                                "slide_path": case["slide_path"],
+                                "x": case["x"],
+                                "y": case["y"],
+                                "tile_size": case["tile_size"],
+                                "level": case["level"],
+                                "threshold": BINARY_POSITIVE_THRESHOLD,
+                                "expected_is_positive": score
+                                >= BINARY_POSITIVE_THRESHOLD,
+                                "expected_score": score,
+                            },
+                            f,
+                            indent=2,
+                        )
                     print(f"  -> Saved {out_file}")
 
                 elif mtype == "semantic":
@@ -70,6 +97,15 @@ def generate_references() -> None:
                     np.save(out_file, arr)
                     print(f"  -> Saved {out_file} shape={arr.shape}")
 
+                elif mtype == "embed":
+                    arr = np.asarray(
+                        client.models.embed_image(
+                            model=model_id, image=tile, timeout=1200
+                        )
+                    )
+                    out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
+                    np.save(out_file, arr)
+                    print(f"  -> Saved {out_file} shape={arr.shape}")
             except Exception as e:
                 print(f"  -> ERROR: {e}")
 
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index 7060927..f46b3f6 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -5,19 +5,30 @@
 from _shared import run_binary_classifier_case
 
 
+BINARY_POSITIVE_THRESHOLD = 0.5
+
+
 @pytest.mark.parametrize(
-    "label, slide_path, x, y",
+    "label, slide_path, x, y, is_positive",
     [
         (
             "prostate_positive",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs",
             43390,
             45865,
+            True,
+        ),
+        (
+            "prostate_negative",
+            "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs",
+            34467,
+            104964,
+            False,
         ),
     ],
 )
-def test_prostate_classifier_positive(
-    label: str, slide_path: str, x: int, y: int
+def test_prostate_classifier_snapshot(
+    label: str, slide_path: str, x: int, y: int, is_positive: bool
 ) -> None:
     model_id = "prostate-classifier-1"
     json_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.json")
@@ -28,12 +39,32 @@ def test_prostate_classifier_positive(
         )
 
     with json_path.open() as f:
-        expected_score = json.load(f)["expected_score"]
+        reference = json.load(f)
 
-    assert expected_score >= 0.5, (
-        f"Reference score {expected_score:.4f} is below positive threshold 0.5 — "
-        "was the reference generated on the correct tile?"
-    )
+    assert reference.get("label") == label
+    assert reference.get("model_id") == model_id
+    assert reference.get("slide_path") == slide_path
+    assert reference.get("x") == x
+    assert reference.get("y") == y
+    assert reference.get("tile_size") == 512
+    assert reference.get("level") == 0
+
+    expected_score = reference["expected_score"]
+    threshold = reference.get("threshold", BINARY_POSITIVE_THRESHOLD)
+    expected_is_positive = reference.get("expected_is_positive")
+    assert expected_is_positive is not None
+    assert expected_is_positive == is_positive
+
+    if is_positive:
+        assert expected_score >= threshold, (
+            f"Reference score {expected_score:.4f} is below positive threshold {threshold:.3f} — "
+            "was the reference generated on the correct tile?"
+        )
+    else:
+        assert expected_score < threshold, (
+            f"Reference score {expected_score:.4f} is above negative threshold {threshold:.3f} — "
+            "was the reference generated on the correct tile?"
+        )
 
     run_binary_classifier_case(
         model_id=model_id,
@@ -43,4 +74,6 @@ def test_prostate_classifier_positive(
         expected_score=expected_score,
         tile_size=512,
         level=0,
+        expected_is_positive=expected_is_positive,
+        threshold=threshold,
     )

From d78cddb409f25ebfbd35cacfeb5692426ce8538f Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Mon, 4 May 2026 21:54:28 +0200
Subject: [PATCH 16/35] fix: different coordinates

Co-authored-by: Copilot <copilot@github.com>
---
 tests/model_snapshots/generate_references.py                  | 4 ++--
 .../model_snapshots/test_binary_classifier_model_snapshot.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index c7251a8..01aeda1 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -32,8 +32,8 @@
         "type": "binary",
         "tile_size": 512,
         "level": 0,
-        "x": 34467,
-        "y": 104964,
+        "x": 31017,
+        "y": 113220,
     },
 ]
 
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index f46b3f6..be433d1 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -21,8 +21,8 @@
         (
             "prostate_negative",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs",
-            34467,
-            104964,
+            31017,
+            113220,
             False,
         ),
     ],

From 324a71319758154aa6260f197a359d6d7702ae4f Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Mon, 4 May 2026 22:07:00 +0200
Subject: [PATCH 17/35] fix: coordinates

Co-authored-by: Copilot <copilot@github.com>
---
 tests/model_snapshots/generate_references.py                  | 4 ++--
 .../model_snapshots/test_binary_classifier_model_snapshot.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 01aeda1..18dedfb 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -32,8 +32,8 @@
         "type": "binary",
         "tile_size": 512,
         "level": 0,
-        "x": 31017,
-        "y": 113220,
+        "x": 32950,
+        "y": 108990,
     },
 ]
 
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index be433d1..e2c13e9 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -21,8 +21,8 @@
         (
             "prostate_negative",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs",
-            31017,
-            113220,
+            32950,
+            108990,
             False,
         ),
     ],

From 09138635d62531f7b0c6e07c9ccc5ee5eaf3befc Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Tue, 5 May 2026 20:51:40 +0200
Subject: [PATCH 18/35] feat: add virchow2 test

Co-authored-by: Copilot <copilot@github.com>
---
 tests/model_snapshots/_shared.py              |  4 ++-
 tests/model_snapshots/generate_references.py  | 36 ++++++++-----------
 .../test_virchow2_model_snapshot.py           | 30 ++++++++++++++++
 3 files changed, 47 insertions(+), 23 deletions(-)
 create mode 100644 tests/model_snapshots/test_virchow2_model_snapshot.py

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 260c42e..285adcb 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -109,6 +109,8 @@ def run_semantic_segmentation_case(
 def run_embed_case(
     model_id: str,
     slide_path: str,
+    x: int,
+    y: int,
     expected_array_path: Path | str,
     tile_size: int = 224,
     level: int = 0,
@@ -120,7 +122,7 @@ def run_embed_case(
     if not expected_array_path.exists():
         pytest.fail(f"Reference file does not exist: {expected_array_path}")
 
-    tile = _read_tile(slide_path, tile_size, level)
+    tile = _read_tile_at(slide_path, x, y, tile_size, level)
     expected = np.load(expected_array_path)
 
     with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 18dedfb..3bba3e0 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -14,28 +14,20 @@
 )
 BINARY_POSITIVE_THRESHOLD = 0.5
 
-CASES = [
-    {
-        "label": "prostate_positive",
-        "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs",
-        "model_id": "prostate-classifier-1",
-        "type": "binary",
-        "tile_size": 512,
-        "level": 0,
-        "x": 43390,
-        "y": 45865,
-    },
-    {
-        "label": "prostate_negative",
-        "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs",
-        "model_id": "prostate-classifier-1",
-        "type": "binary",
-        "tile_size": 512,
-        "level": 0,
-        "x": 32950,
-        "y": 108990,
-    },
-]
+# Keep only one active case here. Store other candidate slides in new_images.txt
+# and swap them in when you want to regenerate a different reference.
+ACTIVE_CASE = {
+    "label": "virchow2",
+    "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
+    "model_id": "virchow2",
+    "type": "embed",
+    "tile_size": 224,
+    "level": 0,
+    "x": 40000,
+    "y": 70000,
+}
+
+CASES = [ACTIVE_CASE]
 
 
 def generate_references() -> None:
diff --git a/tests/model_snapshots/test_virchow2_model_snapshot.py b/tests/model_snapshots/test_virchow2_model_snapshot.py
new file mode 100644
index 0000000..dd1ba92
--- /dev/null
+++ b/tests/model_snapshots/test_virchow2_model_snapshot.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+import pytest
+from _shared import run_embed_case
+
+
+@pytest.mark.parametrize(
+    "label, slide_path, x, y",
+    [
+        (
+            "prostate",
+            "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
+            40000,
+            70000,
+        ),
+    ],
+)
+def test_virchow2(label: str, slide_path: str, x: int, y: int) -> None:
+    model_id = "virchow2"
+    expected_array_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.npy")
+
+    run_embed_case(
+        model_id=model_id,
+        slide_path=slide_path,
+        x=x,
+        y=y,
+        expected_array_path=expected_array_path,
+        tile_size=224,
+        level=0,
+    )

From e72e9001df1240279d3a916c65aa1b471ec5624b Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Tue, 5 May 2026 20:54:00 +0200
Subject: [PATCH 19/35] fix: name

---
 tests/model_snapshots/test_virchow2_model_snapshot.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/model_snapshots/test_virchow2_model_snapshot.py b/tests/model_snapshots/test_virchow2_model_snapshot.py
index dd1ba92..38f6c1d 100644
--- a/tests/model_snapshots/test_virchow2_model_snapshot.py
+++ b/tests/model_snapshots/test_virchow2_model_snapshot.py
@@ -8,7 +8,7 @@
     "label, slide_path, x, y",
     [
         (
-            "prostate",
+            "virchow2",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
             40000,
             70000,

From 4d13c8384f0bf9f2e837d97788dfbfdaa52b55b2 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Tue, 5 May 2026 22:12:52 +0200
Subject: [PATCH 20/35] fix: tolerance

---
 tests/model_snapshots/_shared.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 285adcb..6c36ff7 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -40,7 +40,6 @@ def run_binary_classifier_case(
     tile_size: int = 512,
     level: int = 0,
     timeout_s: float = 600.0,
-    tolerance: float = 0.00001,
     expected_is_positive: bool | None = None,
     threshold: float = 0.5,
 ) -> None:
@@ -58,15 +57,10 @@ def run_binary_classifier_case(
     if expected_is_positive is not None:
         actual_is_positive = actual_score >= threshold
         assert actual_is_positive == expected_is_positive, (
-            "Binary class mismatch: "
-            f"expected_is_positive={expected_is_positive}, "
+            f"Binary class mismatch: expected_is_positive={expected_is_positive}, "
             f"actual_score={actual_score:.6f}, threshold={threshold:.3f}"
         )
 
-    assert abs(actual_score - expected_score) <= tolerance, (
-        f"Binary score mismatch: expected={expected_score}, actual={actual_score}, tolerance={tolerance}"
-    )
-
 
 def run_semantic_segmentation_case(
     model_id: str,
@@ -115,30 +109,35 @@ def run_embed_case(
     tile_size: int = 224,
     level: int = 0,
     timeout_s: float = 1200.0,
-    atol: float = 0.0,
-    rtol: float = 0.0,
+    min_cosine_similarity: float = 0.999,
 ) -> None:
     expected_array_path = Path(expected_array_path)
     if not expected_array_path.exists():
         pytest.fail(f"Reference file does not exist: {expected_array_path}")
 
     tile = _read_tile_at(slide_path, x, y, tile_size, level)
-    expected = np.load(expected_array_path)
+    expected = np.load(expected_array_path).flatten().astype(np.float32)
 
     with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
         t0 = perf_counter()
-        actual = np.asarray(client.models.embed_image(model=model_id, image=tile))
+        actual = (
+            np.asarray(client.models.embed_image(model=model_id, image=tile))
+            .flatten()
+            .astype(np.float32)
+        )
         elapsed = perf_counter() - t0
 
-    max_diff = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
+    similarity = float(
+        np.dot(actual, expected) / (np.linalg.norm(actual) * np.linalg.norm(expected))
+    )
     print(
-        f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | shape={actual.shape} | max_diff={max_diff:.6f}"
+        f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | shape={actual.shape} | cosine_similarity={similarity:.6f}"
     )
 
     if actual.shape != expected.shape:
         pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
 
-    if not np.allclose(actual, expected, rtol=rtol, atol=atol):
+    if similarity < min_cosine_similarity:
         pytest.fail(
-            f"Output mismatch beyond tolerance (atol={atol}, rtol={rtol}, max_abs_diff={max_diff})"
+            f"Embedding similarity too low: {similarity:.6f} < {min_cosine_similarity}"
         )

From 7f2924a5a54e9be346316f18aa38009566bfd757 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Wed, 6 May 2026 13:57:46 +0200
Subject: [PATCH 21/35] test: semantic test

Co-authored-by: Copilot <copilot@github.com>
---
 .../test_semantic_segmentation_model_snapshot.py     | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
index 294f682..54e9c56 100644
--- a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
+++ b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
@@ -5,21 +5,25 @@
 
 
 @pytest.mark.parametrize(
-    "label, slide_path",
+    "label, slide_path, x, y",
     [
         (
-            "breast",
-            "/mnt/bioptic_tree/2019/08/728/2019_08728-01-T/2019_08728-01-T.mrxs",
+            "colorectum_kos04",
+            "/mnt/data/MOU/colorectum/tissue_microarray/he/KOS04.mrxs",
+            46000,
+            82400,
         ),
     ],
 )
-def test_semantic_episeg(label: str, slide_path: str) -> None:
+def test_semantic_episeg(label: str, slide_path: str, x: int, y: int) -> None:
     model_id = "episeg-1"
     expected_array_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.npy")
 
     run_semantic_segmentation_case(
         model_id=model_id,
         slide_path=slide_path,
+        x=x,
+        y=y,
         expected_array_path=expected_array_path,
         tile_size=1024,
         level=0,

From 1995689be56b638e5d926b84471a90164c03b1fa Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Wed, 6 May 2026 16:44:37 +0200
Subject: [PATCH 22/35] test: print

Co-authored-by: Copilot <copilot@github.com>
---
 tests/model_snapshots/_shared.py              | 82 ++++++++++++++++---
 .../test_binary_classifier_model_snapshot.py  |  1 +
 ...st_semantic_segmentation_model_snapshot.py |  3 +
 .../test_virchow2_model_snapshot.py           |  1 +
 4 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 6c36ff7..c3e751e 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -42,6 +42,7 @@ def run_binary_classifier_case(
     timeout_s: float = 600.0,
     expected_is_positive: bool | None = None,
     threshold: float = 0.5,
+    case_name: str | None = None,
 ) -> None:
     tile = _read_tile_at(slide_path, x, y, tile_size, level)
 
@@ -50,9 +51,8 @@ def run_binary_classifier_case(
         actual_score = float(client.models.classify_image(model=model_id, image=tile))
         elapsed = perf_counter() - t0
 
-    print(
-        f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | score={actual_score:.6f} | expected={expected_score:.6f}"
-    )
+    delta = actual_score - expected_score
+    name = case_name or "case"
 
     if expected_is_positive is not None:
         actual_is_positive = actual_score >= threshold
@@ -61,6 +61,13 @@ def run_binary_classifier_case(
             f"actual_score={actual_score:.6f}, threshold={threshold:.3f}"
         )
 
+    print(f"\n/{model_id}")
+    print("passed")
+    print(
+        f"{name} stats: score={actual_score:.6f} expected={expected_score:.6f} "
+        f"delta={delta:+.6f} threshold={threshold:.3f}"
+    )
+
 
 def run_semantic_segmentation_case(
     model_id: str,
@@ -71,8 +78,12 @@ def run_semantic_segmentation_case(
     tile_size: int = 1024,
     level: int = 0,
     timeout_s: float = 1200.0,
-    atol: float = 0.0,
-    rtol: float = 0.0,
+    atol: float = 1e-6,
+    rtol: float = 1e-5,
+    epithelium_threshold: float | None = None,
+    min_epithelium_fraction: float | None = None,
+    epithelium_channel: int | None = None,
+    case_name: str | None = None,
 ) -> None:
     expected_array_path = Path(expected_array_path)
     if not expected_array_path.exists():
@@ -87,9 +98,20 @@ def run_semantic_segmentation_case(
         elapsed = perf_counter() - t0
 
     max_diff = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
-    print(
-        f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | shape={actual.shape} | max_diff={max_diff:.6f}"
-    )
+
+    if actual.ndim == 4:
+        stats_slice = actual[0, 0]
+    elif actual.ndim == 3:
+        stats_slice = actual[0]
+    else:
+        stats_slice = actual.squeeze()
+
+    stats_slice = stats_slice.astype(np.float32)
+    min_val = float(stats_slice.min())
+    mean_val = float(stats_slice.mean())
+    max_val = float(stats_slice.max())
+    frac_05 = float((stats_slice >= 0.5).mean())
+    name = case_name or "case"
 
     if actual.shape != expected.shape:
         pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
@@ -99,6 +121,36 @@ def run_semantic_segmentation_case(
             f"Output mismatch beyond tolerance (atol={atol}, rtol={rtol}, max_abs_diff={max_diff})"
         )
 
+    if epithelium_threshold is not None and min_epithelium_fraction is not None:
+        if actual.ndim == 4:
+            channel = 0 if epithelium_channel is None else epithelium_channel
+            epithelium = actual[0, channel]
+        elif actual.ndim == 3:
+            channel = 0 if epithelium_channel is None else epithelium_channel
+            epithelium = actual[channel]
+        else:
+            epithelium = actual.squeeze()
+
+        if epithelium.ndim != 2:
+            pytest.fail(
+                "Cannot determine epithelium channel; provide epithelium_channel explicitly."
+            )
+
+        fraction = float((epithelium >= epithelium_threshold).mean())
+        if fraction < min_epithelium_fraction:
+            pytest.fail(
+                "Epithelium coverage too low: "
+                f"fraction={fraction:.6f} < min_fraction={min_epithelium_fraction:.6f}"
+            )
+
+    print(f"\n/{model_id}")
+    print("passed")
+    print(
+        f"{name} stats: shape={actual.shape} max_diff={max_diff:.6f} "
+        f"min={min_val:.6f} mean={mean_val:.6f} max={max_val:.6f} "
+        f"frac>=0.5={frac_05:.6f}"
+    )
+
 
 def run_embed_case(
     model_id: str,
@@ -110,6 +162,7 @@ def run_embed_case(
     level: int = 0,
     timeout_s: float = 1200.0,
     min_cosine_similarity: float = 0.999,
+    case_name: str | None = None,
 ) -> None:
     expected_array_path = Path(expected_array_path)
     if not expected_array_path.exists():
@@ -130,9 +183,9 @@ def run_embed_case(
     similarity = float(
         np.dot(actual, expected) / (np.linalg.norm(actual) * np.linalg.norm(expected))
     )
-    print(
-        f"\n  model={model_id} | tile={tile_size}px | time={elapsed:.2f}s | shape={actual.shape} | cosine_similarity={similarity:.6f}"
-    )
+    actual_norm = float(np.linalg.norm(actual))
+    expected_norm = float(np.linalg.norm(expected))
+    name = case_name or "case"
 
     if actual.shape != expected.shape:
         pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
@@ -141,3 +194,10 @@ def run_embed_case(
         pytest.fail(
             f"Embedding similarity too low: {similarity:.6f} < {min_cosine_similarity}"
         )
+
+    print(f"\n/{model_id}")
+    print("passed")
+    print(
+        f"{name} stats: shape={actual.shape} cosine_similarity={similarity:.6f} "
+        f"norm_actual={actual_norm:.6f} norm_expected={expected_norm:.6f}"
+    )
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index e2c13e9..01e7162 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -76,4 +76,5 @@ def test_prostate_classifier_snapshot(
         level=0,
         expected_is_positive=expected_is_positive,
         threshold=threshold,
+        case_name=label,
     )
diff --git a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
index 54e9c56..865f50f 100644
--- a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
+++ b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
@@ -27,4 +27,7 @@ def test_semantic_episeg(label: str, slide_path: str, x: int, y: int) -> None:
         expected_array_path=expected_array_path,
         tile_size=1024,
         level=0,
+        epithelium_threshold=0.5,
+        min_epithelium_fraction=0.01,
+        case_name=label,
     )
diff --git a/tests/model_snapshots/test_virchow2_model_snapshot.py b/tests/model_snapshots/test_virchow2_model_snapshot.py
index 38f6c1d..e94d097 100644
--- a/tests/model_snapshots/test_virchow2_model_snapshot.py
+++ b/tests/model_snapshots/test_virchow2_model_snapshot.py
@@ -27,4 +27,5 @@ def test_virchow2(label: str, slide_path: str, x: int, y: int) -> None:
         expected_array_path=expected_array_path,
         tile_size=224,
         level=0,
+        case_name=label,
     )

From be23dd5c92f72b33ffd14a12cb000ae9d7fe0d1f Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Wed, 6 May 2026 16:51:16 +0200
Subject: [PATCH 23/35] fix: return test runner output as plain text

---
 builders/test_runner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/builders/test_runner.py b/builders/test_runner.py
index 01d7bdb..3baf09f 100644
--- a/builders/test_runner.py
+++ b/builders/test_runner.py
@@ -1,7 +1,7 @@
 import subprocess
 import sys
 
-from fastapi import FastAPI
+from fastapi import FastAPI, Response
 from ray import serve
 
 
@@ -18,7 +18,7 @@ def __init__(self) -> None:
         )
 
     @fastapi.post("/")
-    def run(self) -> str:
+    def run(self) -> Response:
 
         result = subprocess.run(
             [
@@ -39,7 +39,7 @@ def run(self) -> str:
         output = result.stdout
         if result.returncode != 0:
             output += f"\nSTDERR:\n{result.stderr}"
-        return output
+        return Response(content=output, media_type="text/plain")
 
 
 app = TestRunner.bind()

From fbfba69037964f7e0a693fcfd9eeb93c873d6038 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Wed, 6 May 2026 19:10:09 +0200
Subject: [PATCH 24/35] fix: add isclose()

Co-authored-by: Copilot <copilot@github.com>
---
 tests/model_snapshots/_shared.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index c3e751e..0c696c1 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -42,6 +42,8 @@ def run_binary_classifier_case(
     timeout_s: float = 600.0,
     expected_is_positive: bool | None = None,
     threshold: float = 0.5,
+    atol: float = 1e-6,
+    rtol: float = 1e-5,
     case_name: str | None = None,
 ) -> None:
     tile = _read_tile_at(slide_path, x, y, tile_size, level)
@@ -61,8 +63,13 @@ def run_binary_classifier_case(
             f"actual_score={actual_score:.6f}, threshold={threshold:.3f}"
         )
 
+    if not np.isclose(actual_score, expected_score, rtol=rtol, atol=atol):
+        pytest.fail(
+            f"Binary score mismatch beyond tolerance (atol={atol}, rtol={rtol}, "
+            f"expected={expected_score:.6f}, actual={actual_score:.6f})"
+        )
+
     print(f"\n/{model_id}")
-    print("passed")
     print(
         f"{name} stats: score={actual_score:.6f} expected={expected_score:.6f} "
         f"delta={delta:+.6f} threshold={threshold:.3f}"
@@ -116,9 +123,13 @@ def run_semantic_segmentation_case(
     if actual.shape != expected.shape:
         pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
 
-    if not np.allclose(actual, expected, rtol=rtol, atol=atol):
+    close_mask = np.isclose(actual, expected, rtol=rtol, atol=atol)
+    if not close_mask.all():
+        mismatch_fraction = float((~close_mask).mean())
         pytest.fail(
-            f"Output mismatch beyond tolerance (atol={atol}, rtol={rtol}, max_abs_diff={max_diff})"
+            "Output mismatch beyond tolerance "
+            f"(atol={atol}, rtol={rtol}, max_abs_diff={max_diff}, "
+            f"mismatch_fraction={mismatch_fraction:.6f})"
         )
 
     if epithelium_threshold is not None and min_epithelium_fraction is not None:
@@ -144,7 +155,6 @@ def run_semantic_segmentation_case(
             )
 
     print(f"\n/{model_id}")
-    print("passed")
     print(
         f"{name} stats: shape={actual.shape} max_diff={max_diff:.6f} "
         f"min={min_val:.6f} mean={mean_val:.6f} max={max_val:.6f} "
@@ -196,7 +206,6 @@ def run_embed_case(
         )
 
     print(f"\n/{model_id}")
-    print("passed")
     print(
         f"{name} stats: shape={actual.shape} cosine_similarity={similarity:.6f} "
         f"norm_actual={actual_norm:.6f} norm_expected={expected_norm:.6f}"

From b1010b5f5976b6655e4739901e35fa14ff94f4f9 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Tue, 12 May 2026 21:17:09 +0200
Subject: [PATCH 25/35] new tests

Co-authored-by: Copilot <copilot@github.com>
---
 builders/throughput_runner.py                 |  46 ++++
 .../applications/throughput-test.yaml         |  16 ++
 helm/rayservice/values.yaml                   |   1 +
 tests/benchmark/perf_throughput.py            | 236 ++++++++++++++++++
 .../test_prov_gigapath_model_snapshot.py      |  31 +++
 5 files changed, 330 insertions(+)
 create mode 100644 builders/throughput_runner.py
 create mode 100644 helm/rayservice/applications/throughput-test.yaml
 create mode 100644 tests/benchmark/perf_throughput.py
 create mode 100644 tests/model_snapshots/test_prov_gigapath_model_snapshot.py

diff --git a/builders/throughput_runner.py b/builders/throughput_runner.py
new file mode 100644
index 0000000..87a568c
--- /dev/null
+++ b/builders/throughput_runner.py
@@ -0,0 +1,46 @@
+import subprocess
+import sys
+
+from fastapi import FastAPI
+from ray import serve
+
+
+fastapi = FastAPI()
+
+
+@serve.deployment(num_replicas=1)
+@serve.ingress(fastapi)
+class ThroughputRunner:
+    def __init__(self) -> None:
+        subprocess.run(
+            [sys.executable, "-m", "pip", "install", "pytest", "-q"],
+            check=True,
+        )
+
+    @fastapi.post("/")
+    def run(
+        self,
+        duration_s: float = 60.0,
+        concurrency: int = 8,
+        timeout: float = 60.0,
+    ) -> str:
+        result = subprocess.run(
+            [
+                sys.executable,
+                "misc/throughput_test.py",
+                "--duration-s",
+                str(duration_s),
+                "--concurrency",
+                str(concurrency),
+                "--timeout",
+                str(timeout),
+            ],
+            capture_output=True,
+            text=True,
+        )
+        return result.stdout + (
+            f"\nSTDERR:\n{result.stderr}" if result.returncode != 0 else ""
+        )
+
+
+app = ThroughputRunner.bind()
diff --git a/helm/rayservice/applications/throughput-test.yaml b/helm/rayservice/applications/throughput-test.yaml
new file mode 100644
index 0000000..6bb045a
--- /dev/null
+++ b/helm/rayservice/applications/throughput-test.yaml
@@ -0,0 +1,16 @@
+- name: throughput-runner
+  import_path: builders.throughput_runner:app
+  route_prefix: /run-throughput
+  runtime_env:
+    working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/tests.zip?v1
+    pip:
+      - git+https://github.com/RationAI/rationai-sdk-python.git
+  deployments:
+    - name: ThroughputRunner
+      autoscaling_config:
+        min_replicas: 0
+        max_replicas: 1
+        target_ongoing_requests: 1
+      ray_actor_options:
+        num_cpus: 1
+        memory: 2147483648
\ No newline at end of file
diff --git a/helm/rayservice/values.yaml b/helm/rayservice/values.yaml
index 34f8128..2dbc443 100644
--- a/helm/rayservice/values.yaml
+++ b/helm/rayservice/values.yaml
@@ -9,3 +9,4 @@ applications:
   - prov-gigapath
   - virchow2
   - test-runner
+  - throughput-test
diff --git a/tests/benchmark/perf_throughput.py b/tests/benchmark/perf_throughput.py
new file mode 100644
index 0000000..bb3fc36
--- /dev/null
+++ b/tests/benchmark/perf_throughput.py
@@ -0,0 +1,236 @@
+from __future__ import annotations
+
+import argparse
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field
+from threading import Lock
+
+import numpy as np
+from rationai import Client
+
+
+DEFAULT_MODELS = [
+    ("prostate-classifier-1", "binary", 512),
+    ("episeg-1", "semantic", 1024),
+    ("virchow2", "embed", 224),
+]
+POOL_SIZE_DEFAULT = 64
+
+
+@dataclass
+class Stats:
+    ok: int = 0
+    fail_503: int = 0
+    fail_other: int = 0
+    latencies: list[float] = field(default_factory=list)
+    lock: Lock = field(default_factory=Lock)
+
+    @property
+    def total(self) -> int:
+        return self.ok + self.fail_503 + self.fail_other
+
+    def percentile(self, p: float) -> float:
+        if not self.latencies:
+            return 0.0
+        s = sorted(self.latencies)
+        idx = int(len(s) * p / 100)
+        return s[min(idx, len(s) - 1)]
+
+
+def _models_base_url() -> str:
+    return os.environ.get(
+        "MODEL_SERVICE_MODELS_BASE_URL",
+        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+    )
+
+
+def make_pool(tile_size: int, n: int) -> list[np.ndarray]:
+    rng = np.random.default_rng(seed=42)
+    pool = []
+    for _ in range(n):
+        img = rng.integers(0, 255, (tile_size, tile_size, 3), dtype=np.uint8)
+        pool.append(img)
+    return pool
+
+
+def _call_model(
+    client: Client, model_id: str, model_type: str, image: np.ndarray
+) -> None:
+    if model_type == "binary":
+        client.models.classify_image(model=model_id, image=image)
+    elif model_type == "semantic":
+        client.models.segment_image(model=model_id, image=image)
+    elif model_type == "embed":
+        client.models.embed_image(model=model_id, image=image)
+    else:
+        raise ValueError(f"Unknown model type: {model_type}")
+
+
+def send_loop(
+    model_id: str,
+    model_type: str,
+    pool: list[np.ndarray],
+    stats: Stats,
+    end_time: float,
+    timeout: float,
+    models_base_url: str,
+) -> None:
+    pool_len = len(pool)
+    idx = 0
+    with Client(models_base_url=models_base_url, timeout=timeout) as client:
+        while time.perf_counter() < end_time:
+            image = pool[idx % pool_len]
+            idx += 1
+            t0 = time.perf_counter()
+            try:
+                _call_model(client, model_id, model_type, image)
+                latency = time.perf_counter() - t0
+                with stats.lock:
+                    stats.ok += 1
+                    stats.latencies.append(latency)
+            except Exception as exc:
+                status_code = getattr(
+                    getattr(exc, "response", None), "status_code", None
+                )
+                with stats.lock:
+                    if status_code == 503:
+                        stats.fail_503 += 1
+                    else:
+                        stats.fail_other += 1
+
+
+def run_model(
+    name: str,
+    model_type: str,
+    tile_size: int,
+    duration_s: float,
+    concurrency: int,
+    timeout: float,
+    pool_size: int,
+    models_base_url: str,
+) -> dict:
+    if pool_size <= 0:
+        raise ValueError("pool_size must be > 0")
+
+    pool = make_pool(tile_size, pool_size)
+    stats = Stats()
+
+    start = time.perf_counter()
+    end_time = start + duration_s
+    with ThreadPoolExecutor(max_workers=concurrency) as executor:
+        futures = [
+            executor.submit(
+                send_loop,
+                name,
+                model_type,
+                pool,
+                stats,
+                end_time,
+                timeout,
+                models_base_url,
+            )
+            for _ in range(concurrency)
+        ]
+        for future in as_completed(futures):
+            future.result()
+    elapsed = time.perf_counter() - start
+
+    throughput = stats.ok / elapsed if elapsed > 0 else 0.0
+    return {
+        "name": name,
+        "model_type": model_type,
+        "tile_size": tile_size,
+        "elapsed_s": elapsed,
+        "ok": stats.ok,
+        "fail_503": stats.fail_503,
+        "fail_other": stats.fail_other,
+        "throughput": throughput,
+        "p50": stats.percentile(50),
+        "p95": stats.percentile(95),
+    }
+
+
+def parse_models(values: list[str]) -> list[tuple[str, str, int]]:
+    if not values:
+        return DEFAULT_MODELS
+    parsed: list[tuple[str, str, int]] = []
+    for item in values:
+        parts = [p.strip() for p in item.split(",")]
+        if len(parts) != 3:
+            raise ValueError("--model expects: model_id,model_type,tile_size")
+        model_id, model_type, tile_size = parts[0], parts[1], int(parts[2])
+        parsed.append((model_id, model_type, tile_size))
+    return parsed
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Run per-model throughput tests and report img/s via SDK."
+    )
+    parser.add_argument(
+        "--model",
+        action="append",
+        default=[],
+        help="Model spec: model_id,model_type,tile_size (repeatable)",
+    )
+    parser.add_argument(
+        "--models-base-url",
+        default=_models_base_url(),
+        help="Base URL for the SDK (default: MODEL_SERVICE_MODELS_BASE_URL or http://localhost:8000)",
+    )
+    parser.add_argument("--duration-s", type=float, default=300.0)
+    parser.add_argument("--concurrency", type=int, default=64)
+    parser.add_argument("--timeout", type=float, default=60.0)
+    parser.add_argument("--pool-size", type=int, default=POOL_SIZE_DEFAULT)
+    args = parser.parse_args()
+
+    models = parse_models(args.model)
+
+    print("=" * 72)
+    print("Throughput Test (img/s) - SDK")
+    print("=" * 72)
+    print(f"Models base URL: {args.models_base_url}")
+    print(f"Duration:        {args.duration_s:.0f}s")
+    print(f"Concurrency:     {args.concurrency}")
+    print(f"Timeout:         {args.timeout}s")
+    print()
+
+    results = []
+    for name, model_type, tile_size in models:
+        print(f"/ {name}  ({model_type}, tile={tile_size})")
+        result = run_model(
+            name,
+            model_type,
+            tile_size,
+            args.duration_s,
+            args.concurrency,
+            args.timeout,
+            args.pool_size,
+            args.models_base_url,
+        )
+        results.append(result)
+        print(
+            f"  ok={result['ok']} fail_503={result['fail_503']} "
+            f"fail_other={result['fail_other']} elapsed={result['elapsed_s']:.2f}s"
+        )
+        print(
+            f"  img/s={result['throughput']:.2f} p50={result['p50']:.3f}s "
+            f"p95={result['p95']:.3f}s"
+        )
+        print()
+
+    print("Summary")
+    print("name".ljust(28), "img/s".rjust(10), "p50".rjust(10), "p95".rjust(10))
+    for r in results:
+        print(
+            r["name"].ljust(28),
+            f"{r['throughput']:.2f}".rjust(10),
+            f"{r['p50']:.3f}".rjust(10),
+            f"{r['p95']:.3f}".rjust(10),
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/model_snapshots/test_prov_gigapath_model_snapshot.py b/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
new file mode 100644
index 0000000..9d3f200
--- /dev/null
+++ b/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+
+import pytest
+from _shared import run_embed_case
+
+
+@pytest.mark.parametrize(
+    "label, slide_path, x, y",
+    [
+        (
+            "prov-gigapath",
+            "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
+            40000,
+            70000,
+        ),
+    ],
+)
+def test_prov_gigapath(label: str, slide_path: str, x: int, y: int) -> None:
+    model_id = "prov-gigapath"
+    expected_array_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.npy")
+
+    run_embed_case(
+        model_id=model_id,
+        slide_path=slide_path,
+        x=x,
+        y=y,
+        expected_array_path=expected_array_path,
+        tile_size=224,
+        level=0,
+        case_name=label,
+    )

From db4cd4fffe08cbc8311b5b3f28b79e8445ed756d Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Wed, 13 May 2026 14:43:07 +0200
Subject: [PATCH 26/35] Fix throughput script path in ThroughputRunner

Co-authored-by: Copilot <copilot@github.com>
---
 builders/throughput_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/builders/throughput_runner.py b/builders/throughput_runner.py
index 87a568c..300b35b 100644
--- a/builders/throughput_runner.py
+++ b/builders/throughput_runner.py
@@ -27,7 +27,7 @@ def run(
         result = subprocess.run(
             [
                 sys.executable,
-                "misc/throughput_test.py",
+                "tests/benchmark/perf_throughput.py",
                 "--duration-s",
                 str(duration_s),
                 "--concurrency",

From db27737070ce3f83486e5b46852685ce7183a6b6 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Wed, 13 May 2026 15:47:43 +0200
Subject: [PATCH 27/35] Return plain-text response; print per-model stats on one line

Co-authored-by: Copilot <copilot@github.com>
---
 builders/throughput_runner.py      |  7 ++++---
 tests/benchmark/perf_throughput.py | 10 +++-------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/builders/throughput_runner.py b/builders/throughput_runner.py
index 300b35b..e8126fb 100644
--- a/builders/throughput_runner.py
+++ b/builders/throughput_runner.py
@@ -1,7 +1,7 @@
 import subprocess
 import sys
 
-from fastapi import FastAPI
+from fastapi import FastAPI, Response
 from ray import serve
 
 
@@ -23,7 +23,7 @@ def run(
         duration_s: float = 60.0,
         concurrency: int = 8,
         timeout: float = 60.0,
-    ) -> str:
+    ) -> Response:
         result = subprocess.run(
             [
                 sys.executable,
@@ -38,9 +38,10 @@ def run(
             capture_output=True,
             text=True,
         )
-        return result.stdout + (
+        output = result.stdout + (
             f"\nSTDERR:\n{result.stderr}" if result.returncode != 0 else ""
         )
+        return Response(content=output, media_type="text/plain")
 
 
 app = ThroughputRunner.bind()
diff --git a/tests/benchmark/perf_throughput.py b/tests/benchmark/perf_throughput.py
index bb3fc36..20bd4bf 100644
--- a/tests/benchmark/perf_throughput.py
+++ b/tests/benchmark/perf_throughput.py
@@ -199,7 +199,6 @@ def main() -> None:
 
     results = []
     for name, model_type, tile_size in models:
-        print(f"/ {name}  ({model_type}, tile={tile_size})")
         result = run_model(
             name,
             model_type,
@@ -212,14 +211,11 @@ def main() -> None:
         )
         results.append(result)
         print(
-            f"  ok={result['ok']} fail_503={result['fail_503']} "
-            f"fail_other={result['fail_other']} elapsed={result['elapsed_s']:.2f}s"
-        )
-        print(
-            f"  img/s={result['throughput']:.2f} p50={result['p50']:.3f}s "
+            f"{name} stats: ok={result['ok']} fail_503={result['fail_503']} "
+            f"fail_other={result['fail_other']} elapsed={result['elapsed_s']:.2f}s "
+            f"img/s={result['throughput']:.2f} p50={result['p50']:.3f}s "
             f"p95={result['p95']:.3f}s"
         )
-        print()
 
     print("Summary")
     print("name".ljust(28), "img/s".rjust(10), "p50".rjust(10), "p95".rjust(10))

From d557f5f1a8265372368bf70d4b98592de2185d0c Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Wed, 13 May 2026 15:52:30 +0200
Subject: [PATCH 28/35] Add optional readiness wait to throughput test

Co-authored-by: Copilot <copilot@github.com>
---
 builders/throughput_runner.py      | 41 ++++++++++++-------
 tests/benchmark/perf_throughput.py | 63 ++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/builders/throughput_runner.py b/builders/throughput_runner.py
index e8126fb..d3ba549 100644
--- a/builders/throughput_runner.py
+++ b/builders/throughput_runner.py
@@ -20,24 +20,35 @@ def __init__(self) -> None:
     @fastapi.post("/")
     def run(
         self,
-        duration_s: float = 60.0,
+        duration_s: float = 300.0,
         concurrency: int = 8,
         timeout: float = 60.0,
+        wait_ready: bool = True,
+        wait_timeout_s: float = 0.0,
+        wait_interval_s: float = 10.0,
     ) -> Response:
-        result = subprocess.run(
-            [
-                sys.executable,
-                "tests/benchmark/perf_throughput.py",
-                "--duration-s",
-                str(duration_s),
-                "--concurrency",
-                str(concurrency),
-                "--timeout",
-                str(timeout),
-            ],
-            capture_output=True,
-            text=True,
-        )
+        cmd = [
+            sys.executable,
+            "tests/benchmark/perf_throughput.py",
+            "--duration-s",
+            str(duration_s),
+            "--concurrency",
+            str(concurrency),
+            "--timeout",
+            str(timeout),
+        ]
+        if wait_ready:
+            cmd.extend(
+                [
+                    "--wait-ready",
+                    "--wait-timeout-s",
+                    str(wait_timeout_s),
+                    "--wait-interval-s",
+                    str(wait_interval_s),
+                ]
+            )
+
+        result = subprocess.run(cmd, capture_output=True, text=True)
         output = result.stdout + (
             f"\nSTDERR:\n{result.stderr}" if result.returncode != 0 else ""
         )
diff --git a/tests/benchmark/perf_throughput.py b/tests/benchmark/perf_throughput.py
index 20bd4bf..726923f 100644
--- a/tests/benchmark/perf_throughput.py
+++ b/tests/benchmark/perf_throughput.py
@@ -68,6 +68,42 @@ def _call_model(
         raise ValueError(f"Unknown model type: {model_type}")
 
 
+def wait_for_ready(
+    model_id: str,
+    model_type: str,
+    tile_size: int,
+    timeout: float,
+    models_base_url: str,
+    wait_timeout_s: float,
+    wait_interval_s: float,
+) -> None:
+    pool = make_pool(tile_size, 1)
+    image = pool[0]
+    start = time.perf_counter()
+    reported = False
+
+    while True:
+        try:
+            with Client(models_base_url=models_base_url, timeout=timeout) as client:
+                _call_model(client, model_id, model_type, image)
+            if reported:
+                waited = time.perf_counter() - start
+                print(f"{model_id} ready after {waited:.1f}s")
+            return
+        except Exception as exc:
+            status_code = getattr(getattr(exc, "response", None), "status_code", None)
+            if status_code not in (None, 503, 504):
+                raise
+            if not reported:
+                print(f"{model_id} waiting for readiness...")
+                reported = True
+            if wait_timeout_s > 0 and (time.perf_counter() - start) >= wait_timeout_s:
+                raise RuntimeError(
+                    f"{model_id} not ready after {wait_timeout_s:.1f}s"
+                ) from exc
+            time.sleep(wait_interval_s)
+
+
 def send_loop(
     model_id: str,
     model_type: str,
@@ -184,6 +220,23 @@ def main() -> None:
     parser.add_argument("--concurrency", type=int, default=64)
     parser.add_argument("--timeout", type=float, default=60.0)
     parser.add_argument("--pool-size", type=int, default=POOL_SIZE_DEFAULT)
+    parser.add_argument(
+        "--wait-ready",
+        action="store_true",
+        help="Wait for each model to become ready before running the test",
+    )
+    parser.add_argument(
+        "--wait-timeout-s",
+        type=float,
+        default=0.0,
+        help="Max time to wait for readiness (0 = wait forever)",
+    )
+    parser.add_argument(
+        "--wait-interval-s",
+        type=float,
+        default=10.0,
+        help="Wait interval between readiness checks",
+    )
     args = parser.parse_args()
 
     models = parse_models(args.model)
@@ -199,6 +252,16 @@ def main() -> None:
 
     results = []
     for name, model_type, tile_size in models:
+        if args.wait_ready:
+            wait_for_ready(
+                model_id=name,
+                model_type=model_type,
+                tile_size=tile_size,
+                timeout=args.timeout,
+                models_base_url=args.models_base_url,
+                wait_timeout_s=args.wait_timeout_s,
+                wait_interval_s=args.wait_interval_s,
+            )
         result = run_model(
             name,
             model_type,

From f14c82ce62d23bd8fc13e52e062f97003743f2db Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 31 May 2026 10:46:16 +0200
Subject: [PATCH 29/35] Remove legacy benchmarks, move perf_throughput.py, update snapshot tests

Co-authored-by: Copilot <copilot@github.com>
---
 tests/benchmark/benchmark_batch_size.py       | 333 ------------------
 tests/benchmark/load_test.py                  | 286 ---------------
 tests/model_snapshots/generate_references.py  |  17 +-
 tests/model_snapshots/new_images.txt          |   6 +
 .../test_binary_classifier_model_snapshot.py  |   8 +-
 .../test_prov_gigapath_model_snapshot.py      |   3 +-
 ...st_semantic_segmentation_model_snapshot.py |   3 +-
 .../test_virchow2_model_snapshot.py           |   3 +-
 tests/{benchmark => }/perf_throughput.py      |   2 +-
 9 files changed, 24 insertions(+), 637 deletions(-)
 delete mode 100644 tests/benchmark/benchmark_batch_size.py
 delete mode 100644 tests/benchmark/load_test.py
 create mode 100644 tests/model_snapshots/new_images.txt
 rename tests/{benchmark => }/perf_throughput.py (98%)

diff --git a/tests/benchmark/benchmark_batch_size.py b/tests/benchmark/benchmark_batch_size.py
deleted file mode 100644
index 7ca0f1f..0000000
--- a/tests/benchmark/benchmark_batch_size.py
+++ /dev/null
@@ -1,333 +0,0 @@
-# kubectl apply -n rationai-jobs-ns -f c:\Users\jiris\muni-dp\dp\model-service\ray-service.yaml
-# kubectl get pods -n rationai-jobs-ns | Select-String "episeg" (model name)
-# kubectl cp tests/benchmark_batch_size.py rationai-jobs-ns/rayservice-model-optimized-7zwlk-head-fbzr5:/tmp/benchmark_batch_size.py
-# kubectl exec -n rationai-jobs-ns rayservice-model-optimized-7zwlk-head-fbzr5 -- bash -c "python3 -u /tmp/benchmark_batch_size.py --url http://localhost:8000/episeg-1/ --batch-size 128"
-
-# kubectl exec -n rationai-jobs-ns rayservice-model-optimized-7zwlk-head-fbzr5 -- bash -c "pip install httpx -q && python3 -u /tmp/benchmark_batch_size.py --url http://localhost:8000/episeg-1/ --batch-size 8 --concurrency-values 4,8,16,24,32,48,64 --tile-size 1024 --n 500 --warmup 100"
-
-from __future__ import annotations
-
-import argparse
-import asyncio
-import csv
-import sys
-import time
-from pathlib import Path
-
-import lz4.frame
-import numpy as np
-
-
-try:
-    import httpx
-except ImportError:
-    print("pip install httpx")
-    sys.exit(1)
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-TILE_SIZE_DEFAULT = 224
-POOL_SIZE = 64
-OUTPUT_CSV = "results.csv"
-
-
-def make_pool(tile_size: int, n: int = POOL_SIZE) -> list[bytes]:
-    rng = np.random.default_rng(seed=42)
-    pool = []
-    for _ in range(n):
-        img = rng.integers(0, 255, (tile_size, tile_size, 3), dtype=np.uint8)
-        pool.append(lz4.frame.compress(img.tobytes()))
-    return pool
-
-
-async def run_batch(
-    url: str,
-    pool: list[bytes],
-    total: int,
-    concurrency: int,
-    timeout: float,
-) -> tuple[float, int, int]:
-    """Pošle `total` requestů s `concurrency` souběžnými workery."""
-    remaining = total
-    ok = 0
-    fail = 0
-    pool_len = len(pool)
-    counter = 0
-    lock = asyncio.Lock()
-
-    limits = httpx.Limits(
-        max_connections=concurrency + 8,
-        max_keepalive_connections=concurrency + 8,
-    )
-
-    async def worker(client: httpx.AsyncClient) -> None:
-        nonlocal remaining, ok, fail, counter
-        while True:
-            async with lock:
-                if remaining <= 0:
-                    return
-                remaining -= 1
-                idx = counter % pool_len
-                counter += 1
-            payload = pool[idx]
-            try:
-                r = await client.post(
-                    url,
-                    content=payload,
-                    headers={"Content-Type": "application/octet-stream"},
-                    timeout=timeout,
-                )
-                if r.status_code == 200:
-                    ok += 1
-                else:
-                    fail += 1
-                    print(f"  [WARN] HTTP {r.status_code}: {r.text[:120]}")
-            except Exception as e:
-                fail += 1
-                print(f"  [ERR] {type(e).__name__}: {e!r}")
-
-    t0 = time.perf_counter()
-    async with httpx.AsyncClient(limits=limits, follow_redirects=True) as client:
-        await asyncio.gather(*[worker(client) for _ in range(concurrency)])
-    return time.perf_counter() - t0, ok, fail
-
-
-def append_csv(path: str, row: dict) -> None:
-    fieldnames = [
-        "url",
-        "batch_size",
-        "concurrency",
-        "n",
-        "elapsed_s",
-        "throughput_img_s",
-        "ok",
-        "fail",
-    ]
-    write_header = not Path(path).exists()
-    with open(path, "a", newline="") as f:
-        writer = csv.DictWriter(f, fieldnames=fieldnames)
-        if write_header:
-            writer.writeheader()
-        writer.writerow(row)
-
-
-def load_csv(path: str, url: str) -> list[dict]:
-    if not Path(path).exists():
-        return []
-    with open(path) as f:
-        reader = csv.DictReader(f)
-        return [r for r in reader if r["url"] == url]
-
-
-def concurrency_sweep_values(batch_size: int) -> list[int]:
-    """Pro MIG-2g.20gb: testujeme rozsah od batch_size/2 do batch_size*4.
-    Jemnější kroky kolem batch_size kde bývá knee.
-    """
-    half = max(1, batch_size // 2)
-    candidates = sorted(
-        set(
-            [
-                half,
-                batch_size,
-                batch_size + batch_size // 2,
-                batch_size * 2,
-                batch_size * 3,
-                batch_size * 4,
-            ]
-        )
-    )
-    # Přidej mezikroky kolem batch_size
-    extras = [batch_size - batch_size // 4, batch_size + batch_size // 4]
-    candidates = sorted(set(candidates + [e for e in extras if e > 0]))
-    return candidates
-
-
-def print_summary(rows: list[dict], batch_size: int | None = None) -> None:
-    if not rows:
-        return
-    if batch_size is not None:
-        rows = [r for r in rows if int(r["batch_size"]) == batch_size]
-    if not rows:
-        return
-
-    best = max(rows, key=lambda r: float(r["throughput_img_s"]))
-
-    header = f"{'batch_size':>12} {'concurrency':>12} {'throughput img/s':>18} {'ok':>8} {'fail':>8}"
-    print(header)
-    print("-" * len(header))
-    for row in sorted(
-        rows, key=lambda r: (int(r["batch_size"]), int(r["concurrency"]))
-    ):
-        marker = " ← BEST" if row is best else ""
-        fail_val = int(row["fail"])
-        fail_str = f"[!]{fail_val}" if fail_val > 0 else str(fail_val)
-        print(
-            f"{row['batch_size']:>12} {row['concurrency']:>12}"
-            f" {row['throughput_img_s']:>18} {row['ok']:>8} {fail_str:>8}{marker}"
-        )
-    print()
-    print("Doporučené YAML hodnoty pro batch_size =", best["batch_size"])
-    tor = int(best["concurrency"])
-    mor = int(tor * 1.25) + 8
-    print(f"  max_batch_size:           {best['batch_size']}")
-    print(f"  target_ongoing_requests:  {tor}   # = nejlepší concurrency")
-    print(f"  max_ongoing_requests:     {mor}   # target * 1.25 + buffer")
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-
-async def main() -> None:
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    parser.add_argument(
-        "--url",
-        default="http://localhost:8000/virchow2/",
-        help="Endpointová URL (default: http://localhost:8000/virchow2/)",
-    )
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        required=True,
-        help="max_batch_size nastavený v user_config (shodný s YAML)",
-    )
-    parser.add_argument(
-        "--concurrency",
-        type=int,
-        default=None,
-        help="Pevná hodnota concurrency – přeskočí sweep a naměří jen tuto",
-    )
-    parser.add_argument(
-        "--concurrency-values",
-        type=str,
-        default=None,
-        help="Čárkami oddělený seznam concurrency hodnot k otestování, "
-        "např. '32,64,128,256'  (přepíše výchozí sweep)",
-    )
-    parser.add_argument(
-        "--n",
-        type=int,
-        default=1000,
-        help="Počet měřených requestů na jeden bod (default: 1000)",
-    )
-    parser.add_argument(
-        "--warmup",
-        type=int,
-        default=100,
-        help="Warmup requesty před měřením (default: 100)",
-    )
-    parser.add_argument("--tile-size", type=int, default=TILE_SIZE_DEFAULT)
-    parser.add_argument("--timeout", type=float, default=300.0)
-    parser.add_argument(
-        "--output",
-        default=OUTPUT_CSV,
-        help=f"Výstupní CSV soubor (default: {OUTPUT_CSV})",
-    )
-    parser.add_argument(
-        "--skip-existing",
-        action="store_true",
-        help="Přeskočí (batch_size, concurrency) kombinace už změřené v CSV",
-    )
-    args = parser.parse_args()
-
-    url = args.url.rstrip("/") + "/"
-    pool = make_pool(args.tile_size)
-
-    # Determine sweep values
-    if args.concurrency is not None:
-        sweep = [args.concurrency]
-    elif args.concurrency_values:
-        sweep = [int(v.strip()) for v in args.concurrency_values.split(",")]
-    else:
-        sweep = concurrency_sweep_values(args.batch_size)
-
-    # Already measured (for --skip-existing)
-    existing: set[int] = set()
-    if args.skip_existing:
-        for row in load_csv(args.output, url):
-            if int(row["batch_size"]) == args.batch_size:
-                existing.add(int(row["concurrency"]))
-
-    print("=" * 60)
-    print("Virchow2 Benchmark Sweep")
-    print("=" * 60)
-    print(f"URL:              {url}")
-    print(f"max_batch_size:   {args.batch_size}  (musí odpovídat YAML!)")
-    print(f"concurrency sweep:{sweep}")
-    print(f"n per point:      {args.n}")
-    print(f"warmup:           {args.warmup}")
-    print(f"output:           {args.output}")
-    print()
-
-    # Warmup – jednou, s prostředním concurrency
-    warmup_conc = sweep[len(sweep) // 2]
-    print(f"Warmup ({args.warmup} img, concurrency={warmup_conc})...")
-    await run_batch(url, pool, args.warmup, warmup_conc, args.timeout)
-    print("Warmup done.\n")
-
-    results_this_run: list[dict] = []
-
-    for conc in sweep:
-        if conc in existing:
-            print(f"[SKIP] concurrency={conc} (already in CSV)")
-            continue
-
-        print(f"▶ batch_size={args.batch_size}  concurrency={conc}  ({args.n} img)...")
-        elapsed, ok, fail = await run_batch(url, pool, args.n, conc, args.timeout)
-        rps = ok / elapsed if elapsed > 0 else 0.0
-
-        row = {
-            "url": url,
-            "batch_size": args.batch_size,
-            "concurrency": conc,
-            "n": ok + fail,
-            "elapsed_s": f"{elapsed:.2f}",
-            "throughput_img_s": f"{rps:.1f}",
-            "ok": ok,
-            "fail": fail,
-        }
-        append_csv(args.output, row)
-        results_this_run.append(row)
-
-        status = f"  → {rps:.1f} img/s"
-        if fail:
-            status += f"  [{fail} failures!]"
-        print(status)
-
-        # Kratká pauza mezi body aby se server stabilizoval
-        await asyncio.sleep(2)
-
-    # Summary – jen aktuální batch_size
-    print()
-    print("=" * 60)
-    print(f"Výsledky pro batch_size = {args.batch_size}")
-    print("=" * 60)
-    all_rows = load_csv(args.output, url)
-    print_summary(all_rows, batch_size=args.batch_size)
-
-    # Pokud existují data pro více batch_size, ukaž i celkové porovnání
-    all_batch_sizes = sorted(set(int(r["batch_size"]) for r in all_rows))
-    if len(all_batch_sizes) > 1:
-        print()
-        print("=" * 60)
-        print("Celkové porovnání všech batch_size (best concurrency per batch)")
-        print("=" * 60)
-        # Pro každý batch_size vyber jen nejlepší concurrency
-        best_per_batch = []
-        for bs in all_batch_sizes:
-            candidates = [r for r in all_rows if int(r["batch_size"]) == bs]
-            if candidates:
-                best_per_batch.append(
-                    max(candidates, key=lambda r: float(r["throughput_img_s"]))
-                )
-        print_summary(best_per_batch)
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/tests/benchmark/load_test.py b/tests/benchmark/load_test.py
deleted file mode 100644
index ba5b8ef..0000000
--- a/tests/benchmark/load_test.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# kubectl cp tests/load_test.py rationai-jobs-ns/rayservice-model-virchow2-5qfmz-head-98tbv:/tmp/load_test.py
-# kubectl exec -n rationai-jobs-ns rayservice-model-virchow2-5qfmz-head-98tbv -- bash -c "python3 -u /tmp/load_test.py --url http://localhost:8000/virchow2/ --tiles 5000 --concurrency 128"
-from __future__ import annotations
-
-import argparse
-import asyncio
-import sys
-import time
-from dataclasses import dataclass, field
-
-import lz4.frame
-import numpy as np
-
-
-try:
-    import httpx
-except ImportError:
-    print("pip install httpx")
-    sys.exit(1)
-
-
-TILE_SIZE_DEFAULT = 224
-POOL_SIZE = 64
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-
-def make_pool(tile_size: int, n: int = POOL_SIZE) -> list[bytes]:
-    rng = np.random.default_rng(seed=42)
-    pool = []
-    for _ in range(n):
-        img = rng.integers(0, 255, (tile_size, tile_size, 3), dtype=np.uint8)
-        pool.append(lz4.frame.compress(img.tobytes()))
-    return pool
-
-
-@dataclass
-class Stats:
-    ok: int = 0
-    fail_503: int = 0
-    fail_other: int = 0
-    latencies: list[float] = field(default_factory=list)
-    lock: asyncio.Lock = field(default_factory=asyncio.Lock)
-
-    @property
-    def total(self) -> int:
-        return self.ok + self.fail_503 + self.fail_other
-
-    def percentile(self, p: float) -> float:
-        if not self.latencies:
-            return 0.0
-        s = sorted(self.latencies)
-        idx = int(len(s) * p / 100)
-        return s[min(idx, len(s) - 1)]
-
-
-async def send_tile(
-    client: httpx.AsyncClient,
-    url: str,
-    payload: bytes,
-    stats: Stats,
-    timeout: float,
-    progress_every: int,
-) -> None:
-    t0 = time.perf_counter()
-    try:
-        r = await client.post(
-            url,
-            content=payload,
-            headers={"Content-Type": "application/octet-stream"},
-            timeout=timeout,
-        )
-        latency = time.perf_counter() - t0
-        async with stats.lock:
-            if r.status_code == 200:
-                stats.ok += 1
-                stats.latencies.append(latency)
-                if stats.ok % progress_every == 0:
-                    print(
-                        f"  ✓ {stats.ok} OK  |  503: {stats.fail_503}  |  other: {stats.fail_other}"
-                    )
-            elif r.status_code == 503:
-                stats.fail_503 += 1
-            else:
-                stats.fail_other += 1
-                print(f"  [WARN] HTTP {r.status_code}: {r.text[:120]}")
-    except Exception as e:
-        async with stats.lock:
-            stats.fail_other += 1
-        print(f"  [ERR] {e}")
-
-
-async def run_wsi(
-    url: str,
-    pool: list[bytes],
-    tiles: int,
-    concurrency: int,
-    timeout: float,
-    wsi_id: int,
-    stats: Stats,
-) -> float:
-    """Simuluje jeden WSI — pošle `tiles` requestů s max `concurrency` souběžně."""
-    semaphore = asyncio.Semaphore(concurrency)
-    pool_len = len(pool)
-
-    limits = httpx.Limits(
-        max_connections=concurrency + 8,
-        max_keepalive_connections=concurrency + 8,
-    )
-
-    async def bounded_send(client: httpx.AsyncClient, idx: int) -> None:
-        async with semaphore:
-            await send_tile(
-                client,
-                url,
-                pool[idx % pool_len],
-                stats,
-                timeout,
-                progress_every=max(tiles // 10, 100),
-            )
-
-    t0 = time.perf_counter()
-    async with httpx.AsyncClient(limits=limits, follow_redirects=True) as client:
-        tasks = [bounded_send(client, i) for i in range(tiles)]
-        await asyncio.gather(*tasks)
-    return time.perf_counter() - t0
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-
-async def main() -> None:
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-    )
-    parser.add_argument(
-        "--url", default="http://localhost:8000/virchow2/", help="Endpoint URL"
-    )
-    parser.add_argument(
-        "--tiles",
-        type=int,
-        default=5000,
-        help="Počet dlaždic na jeden WSI (default: 5000)",
-    )
-    parser.add_argument(
-        "--wsi-count",
-        type=int,
-        default=1,
-        help="Počet paralelních WSI slidů (default: 1)",
-    )
-    parser.add_argument(
-        "--concurrency",
-        type=int,
-        default=128,
-        help="Max souběžných requestů na WSI (default: 128, "
-        "mělo by odpovídat target_ongoing_requests)",
-    )
-    parser.add_argument("--tile-size", type=int, default=TILE_SIZE_DEFAULT)
-    parser.add_argument(
-        "--timeout",
-        type=float,
-        default=120.0,
-        help="Timeout na jeden request v sekundách (default: 120)",
-    )
-    parser.add_argument(
-        "--warmup",
-        type=int,
-        default=50,
-        help="Warmup requestů před testem (default: 50)",
-    )
-    parser.add_argument("--no-warmup", action="store_true", help="Přeskočit warmup")
-    args = parser.parse_args()
-
-    url = args.url.rstrip("/") + "/"
-    pool = make_pool(args.tile_size)
-    total_tiles = args.tiles * args.wsi_count
-
-    print("=" * 60)
-    print("Virchow2 WSI Load Test")
-    print("=" * 60)
-    print(f"URL:              {url}")
-    print(f"Tiles per WSI:    {args.tiles:,}")
-    print(f"WSI count:        {args.wsi_count}")
-    print(f"Total tiles:      {total_tiles:,}")
-    print(f"Concurrency/WSI:  {args.concurrency}")
-    print(f"Total concurrent: {args.concurrency * args.wsi_count}")
-    print(f"Request timeout:  {args.timeout}s")
-    print()
-
-    # Warmup
-    if not args.no_warmup:
-        print(f"Warmup ({args.warmup} tiles)...")
-        warmup_stats = Stats()
-        await run_wsi(
-            url,
-            pool,
-            args.warmup,
-            min(args.concurrency, 32),
-            args.timeout,
-            wsi_id=0,
-            stats=warmup_stats,
-        )
-        print(
-            f"Warmup done (ok={warmup_stats.ok}, fail={warmup_stats.fail_503 + warmup_stats.fail_other}).\n"
-        )
-
-    # Actual test
-    stats = Stats()
-    print(
-        f"▶ Spouštím {'paralelně ' + str(args.wsi_count) + ' WSI' if args.wsi_count > 1 else '1 WSI'}  "
-        f"({total_tiles:,} tiles celkem)...\n"
-    )
-
-    t0 = time.perf_counter()
-
-    if args.wsi_count == 1:
-        await run_wsi(
-            url, pool, args.tiles, args.concurrency, args.timeout, wsi_id=0, stats=stats
-        )
-    else:
-        # Všechny WSI slidy spustit paralelně — simulace více scannerů najednou
-        await asyncio.gather(
-            *[
-                run_wsi(
-                    url,
-                    pool,
-                    args.tiles,
-                    args.concurrency,
-                    args.timeout,
-                    wsi_id=i,
-                    stats=stats,
-                )
-                for i in range(args.wsi_count)
-            ]
-        )
-
-    elapsed = time.perf_counter() - t0
-    rps = stats.ok / elapsed if elapsed > 0 else 0.0
-
-    # Report
-    print()
-    print("=" * 60)
-    print("Výsledky")
-    print("=" * 60)
-    print(f"Celkový čas:      {elapsed:.1f}s  ({elapsed / 60:.1f} min)")
-    print(f"Throughput:       {rps:.1f} img/s")
-    print()
-    print(
-        f"Úspěšné:          {stats.ok:,} / {total_tiles:,}  ({100 * stats.ok / total_tiles:.1f}%)"
-    )
-    print(
-        f"503 backpressure: {stats.fail_503:,}  ({100 * stats.fail_503 / total_tiles:.1f}%)"
-    )
-    print(f"Jiné chyby:       {stats.fail_other:,}")
-    print()
-    if stats.latencies:
-        print("Latence (úspěšné requesty):")
-        print(f"  p50:  {stats.percentile(50) * 1000:.0f} ms")
-        print(f"  p90:  {stats.percentile(90) * 1000:.0f} ms")
-        print(f"  p99:  {stats.percentile(99) * 1000:.0f} ms")
-        print(f"  max:  {max(stats.latencies) * 1000:.0f} ms")
-    print()
-
-    # Verdict
-    fail_rate = (stats.fail_503 + stats.fail_other) / total_tiles
-    if fail_rate == 0:
-        print("✅ PASS — žádné chyby, nastavení je v pořádku pro WSI.")
-    elif fail_rate < 0.01:
-        print(
-            f"⚠️  WARN — {fail_rate * 100:.2f}% chyb. Zvažte zvýšení max_queued_requests."
-        )
-    else:
-        print(
-            f"❌ FAIL — {fail_rate * 100:.1f}% chyb. Nastavení nestačí pro tento objem."
-        )
-        print("   → Zvyšte max_queued_requests nebo snižte --concurrency klientů.")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 3bba3e0..8078bdf 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -3,9 +3,10 @@
 from pathlib import Path
 
 import numpy as np
-from _shared import _read_tile_at
 from rationai import Client
 
+from tests.model_snapshots._shared import _read_tile_at
+
 
 OUT_DIR = Path("/mnt/test_refs")
 MODELS_BASE_URL = os.environ.get(
@@ -17,14 +18,14 @@
 # Keep only one active case here. Store other candidate slides in new_images.txt
 # and swap them in when you want to regenerate a different reference.
 ACTIVE_CASE = {
-    "label": "virchow2",
-    "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
-    "model_id": "virchow2",
-    "type": "embed",
-    "tile_size": 224,
+    "label": "colorectum_kos04",
+    "slide_path": "/mnt/data/MOU/colorectum/tissue_microarray/he/KOS04.mrxs",
+    "model_id": "episeg-1",
+    "type": "semantic",
+    "tile_size": 1024,
     "level": 0,
-    "x": 40000,
-    "y": 70000,
+    "x": 46000,
+    "y": 82400,
 }
 
 CASES = [ACTIVE_CASE]
diff --git a/tests/model_snapshots/new_images.txt b/tests/model_snapshots/new_images.txt
new file mode 100644
index 0000000..46c847e
--- /dev/null
+++ b/tests/model_snapshots/new_images.txt
@@ -0,0 +1,6 @@
+# New images and coordinates
+# Format: label | slide_path | x | y | tile_size | level | notes
+prostate_positive | /mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs | 43390 | 45865 | 512 | 0 | prostate-classifier-1 positive
+prostate_negative | /mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs | 32950 | 108990 | 512 | 0 | prostate-classifier-1 negative
+prostate | /mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs | 40000 | 70000 | 224 | 0 | virchow2 embed
+colorectum_kos04 | /mnt/data/MOU/colorectum/tissue_microarray/he/KOS04.mrxs | 46000 | 82400 | 1024 | 0 | episeg-1 semantic
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index 01e7162..ce3d71f 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -2,7 +2,8 @@
 from pathlib import Path
 
 import pytest
-from _shared import run_binary_classifier_case
+
+from tests.model_snapshots._shared import run_binary_classifier_case
 
 
 BINARY_POSITIVE_THRESHOLD = 0.5
@@ -33,11 +34,6 @@ def test_prostate_classifier_snapshot(
     model_id = "prostate-classifier-1"
     json_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.json")
 
-    if not json_path.exists():
-        pytest.skip(
-            f"Reference file {json_path} missing. Run generate_references.py first."
-        )
-
     with json_path.open() as f:
         reference = json.load(f)
 
diff --git a/tests/model_snapshots/test_prov_gigapath_model_snapshot.py b/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
index 9d3f200..9f6aad6 100644
--- a/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
+++ b/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
@@ -1,7 +1,8 @@
 from pathlib import Path
 
 import pytest
-from _shared import run_embed_case
+
+from tests.model_snapshots._shared import run_embed_case
 
 
 @pytest.mark.parametrize(
diff --git a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
index 865f50f..8ff06cc 100644
--- a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
+++ b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
@@ -1,7 +1,8 @@
 from pathlib import Path
 
 import pytest
-from _shared import run_semantic_segmentation_case
+
+from tests.model_snapshots._shared import run_semantic_segmentation_case
 
 
 @pytest.mark.parametrize(
diff --git a/tests/model_snapshots/test_virchow2_model_snapshot.py b/tests/model_snapshots/test_virchow2_model_snapshot.py
index e94d097..4714005 100644
--- a/tests/model_snapshots/test_virchow2_model_snapshot.py
+++ b/tests/model_snapshots/test_virchow2_model_snapshot.py
@@ -1,7 +1,8 @@
 from pathlib import Path
 
 import pytest
-from _shared import run_embed_case
+
+from tests.model_snapshots._shared import run_embed_case
 
 
 @pytest.mark.parametrize(
diff --git a/tests/benchmark/perf_throughput.py b/tests/perf_throughput.py
similarity index 98%
rename from tests/benchmark/perf_throughput.py
rename to tests/perf_throughput.py
index 726923f..e89f8d5 100644
--- a/tests/benchmark/perf_throughput.py
+++ b/tests/perf_throughput.py
@@ -42,7 +42,7 @@ def percentile(self, p: float) -> float:
 def _models_base_url() -> str:
     return os.environ.get(
         "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+        "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
     )
 
 

From 661225bdfb85d7c98c97eee77bda2f81ccfca8cf Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 31 May 2026 11:30:38 +0200
Subject: [PATCH 30/35] more fixes

Co-authored-by: Copilot <copilot@github.com>
---
 builders/throughput_runner.py        | 2 +-
 tests/model_snapshots/new_images.txt | 1 +
 tests/perf_throughput.py             | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/builders/throughput_runner.py b/builders/throughput_runner.py
index d3ba549..acb79b9 100644
--- a/builders/throughput_runner.py
+++ b/builders/throughput_runner.py
@@ -29,7 +29,7 @@ def run(
     ) -> Response:
         cmd = [
             sys.executable,
-            "tests/benchmark/perf_throughput.py",
+            "tests/perf_throughput.py",
             "--duration-s",
             str(duration_s),
             "--concurrency",
diff --git a/tests/model_snapshots/new_images.txt b/tests/model_snapshots/new_images.txt
index 46c847e..9a74517 100644
--- a/tests/model_snapshots/new_images.txt
+++ b/tests/model_snapshots/new_images.txt
@@ -3,4 +3,5 @@
 prostate_positive | /mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs | 43390 | 45865 | 512 | 0 | prostate-classifier-1 positive
 prostate_negative | /mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs | 32950 | 108990 | 512 | 0 | prostate-classifier-1 negative
 prostate | /mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs | 40000 | 70000 | 224 | 0 | virchow2 embed
+prov-gigapath | /mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs | 40000 | 70000 | 224 | 0 | prov-gigapath embed
 colorectum_kos04 | /mnt/data/MOU/colorectum/tissue_microarray/he/KOS04.mrxs | 46000 | 82400 | 1024 | 0 | episeg-1 semantic
diff --git a/tests/perf_throughput.py b/tests/perf_throughput.py
index e89f8d5..395da94 100644
--- a/tests/perf_throughput.py
+++ b/tests/perf_throughput.py
@@ -15,6 +15,7 @@
     ("prostate-classifier-1", "binary", 512),
     ("episeg-1", "semantic", 1024),
     ("virchow2", "embed", 224),
+    ("prov-gigapath", "embed", 224),
 ]
 POOL_SIZE_DEFAULT = 64
 

From f629274455dbe6a7130ffe0786880134c620b58d Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 31 May 2026 11:56:05 +0200
Subject: [PATCH 31/35] fixes

---
 tests/model_snapshots/_shared.py              |  7 ----
 .../test_binary_classifier_model_snapshot.py  | 26 ++------------
 tests/perf_throughput.py                      | 36 ++++++++++---------
 3 files changed, 22 insertions(+), 47 deletions(-)

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 0c696c1..c3bd2dd 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -2,7 +2,6 @@
 
 import os
 from pathlib import Path
-from time import perf_counter
 
 import numpy as np
 import pytest
@@ -49,9 +48,7 @@ def run_binary_classifier_case(
     tile = _read_tile_at(slide_path, x, y, tile_size, level)
 
     with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
-        t0 = perf_counter()
         actual_score = float(client.models.classify_image(model=model_id, image=tile))
-        elapsed = perf_counter() - t0
 
     delta = actual_score - expected_score
     name = case_name or "case"
@@ -100,9 +97,7 @@ def run_semantic_segmentation_case(
     expected = np.load(expected_array_path)
 
     with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
-        t0 = perf_counter()
         actual = np.asarray(client.models.segment_image(model=model_id, image=tile))
-        elapsed = perf_counter() - t0
 
     max_diff = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
 
@@ -182,13 +177,11 @@ def run_embed_case(
     expected = np.load(expected_array_path).flatten().astype(np.float32)
 
     with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
-        t0 = perf_counter()
         actual = (
             np.asarray(client.models.embed_image(model=model_id, image=tile))
             .flatten()
             .astype(np.float32)
         )
-        elapsed = perf_counter() - t0
 
     similarity = float(
         np.dot(actual, expected) / (np.linalg.norm(actual) * np.linalg.norm(expected))
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index ce3d71f..ca9d8ea 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -10,26 +10,24 @@
 
 
 @pytest.mark.parametrize(
-    "label, slide_path, x, y, is_positive",
+    "label, slide_path, x, y",
     [
         (
             "prostate_positive",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs",
             43390,
             45865,
-            True,
         ),
         (
             "prostate_negative",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs",
             32950,
             108990,
-            False,
         ),
     ],
 )
 def test_prostate_classifier_snapshot(
-    label: str, slide_path: str, x: int, y: int, is_positive: bool
+    label: str, slide_path: str, x: int, y: int
 ) -> None:
     model_id = "prostate-classifier-1"
     json_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.json")
@@ -37,30 +35,10 @@ def test_prostate_classifier_snapshot(
     with json_path.open() as f:
         reference = json.load(f)
 
-    assert reference.get("label") == label
-    assert reference.get("model_id") == model_id
-    assert reference.get("slide_path") == slide_path
-    assert reference.get("x") == x
-    assert reference.get("y") == y
-    assert reference.get("tile_size") == 512
-    assert reference.get("level") == 0
-
     expected_score = reference["expected_score"]
     threshold = reference.get("threshold", BINARY_POSITIVE_THRESHOLD)
     expected_is_positive = reference.get("expected_is_positive")
     assert expected_is_positive is not None
-    assert expected_is_positive == is_positive
-
-    if is_positive:
-        assert expected_score >= threshold, (
-            f"Reference score {expected_score:.4f} is below positive threshold {threshold:.3f} — "
-            "was the reference generated on the correct tile?"
-        )
-    else:
-        assert expected_score < threshold, (
-            f"Reference score {expected_score:.4f} is above negative threshold {threshold:.3f} — "
-            "was the reference generated on the correct tile?"
-        )
 
     run_binary_classifier_case(
         model_id=model_id,
diff --git a/tests/perf_throughput.py b/tests/perf_throughput.py
index 395da94..be68c4a 100644
--- a/tests/perf_throughput.py
+++ b/tests/perf_throughput.py
@@ -35,9 +35,7 @@ def total(self) -> int:
     def percentile(self, p: float) -> float:
         if not self.latencies:
             return 0.0
-        s = sorted(self.latencies)
-        idx = int(len(s) * p / 100)
-        return s[min(idx, len(s) - 1)]
+        return float(np.percentile(self.latencies, p))
 
 
 def _models_base_url() -> str:
@@ -49,11 +47,10 @@ def _models_base_url() -> str:
 
 def make_pool(tile_size: int, n: int) -> list[np.ndarray]:
     rng = np.random.default_rng(seed=42)
-    pool = []
-    for _ in range(n):
-        img = rng.integers(0, 255, (tile_size, tile_size, 3), dtype=np.uint8)
-        pool.append(img)
-    return pool
+    return [
+        rng.integers(0, 256, (tile_size, tile_size, 3), dtype=np.uint8)
+        for _ in range(n)
+    ]
 
 
 def _call_model(
@@ -78,8 +75,7 @@ def wait_for_ready(
     wait_timeout_s: float,
     wait_interval_s: float,
 ) -> None:
-    pool = make_pool(tile_size, 1)
-    image = pool[0]
+    image = make_pool(tile_size, 1)[0]
     start = time.perf_counter()
     reported = False
 
@@ -88,8 +84,7 @@ def wait_for_ready(
             with Client(models_base_url=models_base_url, timeout=timeout) as client:
                 _call_model(client, model_id, model_type, image)
             if reported:
-                waited = time.perf_counter() - start
-                print(f"{model_id} ready after {waited:.1f}s")
+                print(f"{model_id} ready after {time.perf_counter() - start:.1f}s")
             return
         except Exception as exc:
             status_code = getattr(getattr(exc, "response", None), "status_code", None)
@@ -98,7 +93,8 @@ def wait_for_ready(
             if not reported:
                 print(f"{model_id} waiting for readiness...")
                 reported = True
-            if wait_timeout_s > 0 and (time.perf_counter() - start) >= wait_timeout_s:
+            elapsed = time.perf_counter() - start
+            if wait_timeout_s > 0 and elapsed >= wait_timeout_s:
                 raise RuntimeError(
                     f"{model_id} not ready after {wait_timeout_s:.1f}s"
                 ) from exc
@@ -147,7 +143,7 @@ def run_model(
     timeout: float,
     pool_size: int,
     models_base_url: str,
-) -> dict:
+) -> dict[str, object]:
     if pool_size <= 0:
         raise ValueError("pool_size must be > 0")
 
@@ -186,6 +182,7 @@ def run_model(
         "throughput": throughput,
         "p50": stats.percentile(50),
         "p95": stats.percentile(95),
+        "p99": stats.percentile(99),
     }
 
 
@@ -278,17 +275,24 @@ def main() -> None:
             f"{name} stats: ok={result['ok']} fail_503={result['fail_503']} "
             f"fail_other={result['fail_other']} elapsed={result['elapsed_s']:.2f}s "
             f"img/s={result['throughput']:.2f} p50={result['p50']:.3f}s "
-            f"p95={result['p95']:.3f}s"
+            f"p95={result['p95']:.3f}s p99={result['p99']:.3f}s"
         )
 
     print("Summary")
-    print("name".ljust(28), "img/s".rjust(10), "p50".rjust(10), "p95".rjust(10))
+    print(
+        "name".ljust(28),
+        "img/s".rjust(10),
+        "p50".rjust(10),
+        "p95".rjust(10),
+        "p99".rjust(10),
+    )
     for r in results:
         print(
             r["name"].ljust(28),
             f"{r['throughput']:.2f}".rjust(10),
             f"{r['p50']:.3f}".rjust(10),
             f"{r['p95']:.3f}".rjust(10),
+            f"{r['p99']:.3f}".rjust(10),
         )
 
 

From afbcab6e15b080ba969646b63f2054c595911bed Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 31 May 2026 13:41:28 +0200
Subject: [PATCH 32/35] url fix

Co-authored-by: Copilot <copilot@github.com>
---
 helm/rayservice/applications/test-runner.yaml    |  2 +-
 .../rayservice/applications/throughput-test.yaml |  2 +-
 tests/model_snapshots/_shared.py                 |  2 +-
 tests/model_snapshots/generate_references.py     | 16 ++++++++--------
 tests/perf_throughput.py                         |  2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/helm/rayservice/applications/test-runner.yaml b/helm/rayservice/applications/test-runner.yaml
index 3aad245..765c0b4 100644
--- a/helm/rayservice/applications/test-runner.yaml
+++ b/helm/rayservice/applications/test-runner.yaml
@@ -2,7 +2,7 @@
   import_path: builders.test_runner:app
   route_prefix: /run-tests
   runtime_env:
-    working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/tests.zip?v10
+    working_dir: https://github.com/RationAI/model-service/archive/refs/heads/main.zip
     pip:
       - git+https://github.com/RationAI/rationai-sdk-python.git
   deployments:
diff --git a/helm/rayservice/applications/throughput-test.yaml b/helm/rayservice/applications/throughput-test.yaml
index 6bb045a..a056a56 100644
--- a/helm/rayservice/applications/throughput-test.yaml
+++ b/helm/rayservice/applications/throughput-test.yaml
@@ -2,7 +2,7 @@
   import_path: builders.throughput_runner:app
   route_prefix: /run-throughput
   runtime_env:
-    working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/tests.zip?v1
+    working_dir: https://github.com/RationAI/model-service/archive/refs/heads/main.zip
     pip:
       - git+https://github.com/RationAI/rationai-sdk-python.git
   deployments:
diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index c3bd2dd..122f2f7 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -13,7 +13,7 @@
 def _models_base_url() -> str:
     return os.environ.get(
         "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
     )
 
 
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 8078bdf..4543302 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -11,21 +11,21 @@
 OUT_DIR = Path("/mnt/test_refs")
 MODELS_BASE_URL = os.environ.get(
     "MODEL_SERVICE_MODELS_BASE_URL",
-    "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+    "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
 )
 BINARY_POSITIVE_THRESHOLD = 0.5
 
 # Keep only one active case here. Store other candidate slides in new_images.txt
 # and swap them in when you want to regenerate a different reference.
 ACTIVE_CASE = {
-    "label": "colorectum_kos04",
-    "slide_path": "/mnt/data/MOU/colorectum/tissue_microarray/he/KOS04.mrxs",
-    "model_id": "episeg-1",
-    "type": "semantic",
-    "tile_size": 1024,
+    "label": "prov-gigapath",
+    "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
+    "model_id": "prov-gigapath",
+    "type": "embed",
+    "tile_size": 224,
     "level": 0,
-    "x": 46000,
-    "y": 82400,
+    "x": 40000,
+    "y": 70000,
 }
 
 CASES = [ACTIVE_CASE]
diff --git a/tests/perf_throughput.py b/tests/perf_throughput.py
index be68c4a..06fa89e 100644
--- a/tests/perf_throughput.py
+++ b/tests/perf_throughput.py
@@ -41,7 +41,7 @@ def percentile(self, p: float) -> float:
 def _models_base_url() -> str:
     return os.environ.get(
         "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
     )
 
 

From 5bbc9b51edded9af6dab625604d9942fc4547fb6 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 31 May 2026 13:52:01 +0200
Subject: [PATCH 33/35] fix: uv lock and mypy

---
 builders/test_runner.py                       |  3 +-
 builders/throughput_runner.py                 |  2 +-
 tests/model_snapshots/generate_references.py  | 31 +++++++++-------
 .../test_binary_classifier_model_snapshot.py  | 20 +++++++++--
 .../test_prov_gigapath_model_snapshot.py      |  2 +-
 ...st_semantic_segmentation_model_snapshot.py |  2 +-
 .../test_virchow2_model_snapshot.py           |  2 +-
 tests/perf_throughput.py                      | 21 +++++++++--
 uv.lock                                       | 36 +++++++++++++++++++
 9 files changed, 95 insertions(+), 24 deletions(-)

diff --git a/builders/test_runner.py b/builders/test_runner.py
index 3baf09f..d98c97a 100644
--- a/builders/test_runner.py
+++ b/builders/test_runner.py
@@ -19,7 +19,6 @@ def __init__(self) -> None:
 
     @fastapi.post("/")
     def run(self) -> Response:
-
         result = subprocess.run(
             [
                 sys.executable,
@@ -42,4 +41,4 @@ def run(self) -> Response:
         return Response(content=output, media_type="text/plain")
 
 
-app = TestRunner.bind()
+app = TestRunner.bind()  # type: ignore[attr-defined]
diff --git a/builders/throughput_runner.py b/builders/throughput_runner.py
index acb79b9..976d596 100644
--- a/builders/throughput_runner.py
+++ b/builders/throughput_runner.py
@@ -55,4 +55,4 @@ def run(
         return Response(content=output, media_type="text/plain")
 
 
-app = ThroughputRunner.bind()
+app = ThroughputRunner.bind()  # type: ignore[attr-defined]
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 4543302..99ed0ee 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -1,6 +1,7 @@
 import json
 import os
 from pathlib import Path
+from typing import TypedDict
 
 import numpy as np
 from rationai import Client
@@ -11,13 +12,25 @@
 OUT_DIR = Path("/mnt/test_refs")
 MODELS_BASE_URL = os.environ.get(
     "MODEL_SERVICE_MODELS_BASE_URL",
-    "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+    "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
 )
 BINARY_POSITIVE_THRESHOLD = 0.5
 
+
+class CaseConfig(TypedDict):
+    label: str
+    slide_path: str
+    model_id: str
+    type: str
+    tile_size: int
+    level: int
+    x: int
+    y: int
+
+
 # Keep only one active case here. Store other candidate slides in new_images.txt
 # and swap them in when you want to regenerate a different reference.
-ACTIVE_CASE = {
+ACTIVE_CASE: CaseConfig = {
     "label": "prov-gigapath",
     "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
     "model_id": "prov-gigapath",
@@ -28,7 +41,7 @@
     "y": 70000,
 }
 
-CASES = [ACTIVE_CASE]
+CASES: list[CaseConfig] = [ACTIVE_CASE]
 
 
 def generate_references() -> None:
@@ -55,9 +68,7 @@ def generate_references() -> None:
             try:
                 if mtype == "binary":
                     score = float(
-                        client.models.classify_image(
-                            model=model_id, image=tile, timeout=600
-                        )
+                        client.models.classify_image(model=model_id, image=tile)
                     )
                     out_file = OUT_DIR / f"{label}_{model_id}_expected.json"
                     with out_file.open("w") as f:
@@ -82,9 +93,7 @@ def generate_references() -> None:
 
                 elif mtype == "semantic":
                     arr = np.asarray(
-                        client.models.segment_image(
-                            model=model_id, image=tile, timeout=1200
-                        )
+                        client.models.segment_image(model=model_id, image=tile)
                     )
                     out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
                     np.save(out_file, arr)
@@ -92,9 +101,7 @@ def generate_references() -> None:
 
                 elif mtype == "embed":
                     arr = np.asarray(
-                        client.models.embed_image(
-                            model=model_id, image=tile, timeout=1200
-                        )
+                        client.models.embed_image(model=model_id, image=tile)
                     )
                     out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
                     np.save(out_file, arr)
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index ca9d8ea..dde86eb 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -9,25 +9,27 @@
 BINARY_POSITIVE_THRESHOLD = 0.5
 
 
-@pytest.mark.parametrize(
-    "label, slide_path, x, y",
+@pytest.mark.parametrize(  # type: ignore[untyped-decorator]
+    "label, slide_path, x, y, is_positive",
     [
         (
             "prostate_positive",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs",
             43390,
             45865,
+            True,
         ),
         (
             "prostate_negative",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs",
             32950,
             108990,
+            False,
         ),
     ],
 )
 def test_prostate_classifier_snapshot(
-    label: str, slide_path: str, x: int, y: int
+    label: str, slide_path: str, x: int, y: int, is_positive: bool
 ) -> None:
     model_id = "prostate-classifier-1"
     json_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.json")
@@ -39,6 +41,18 @@ def test_prostate_classifier_snapshot(
     threshold = reference.get("threshold", BINARY_POSITIVE_THRESHOLD)
     expected_is_positive = reference.get("expected_is_positive")
     assert expected_is_positive is not None
+    assert expected_is_positive == is_positive
+
+    if is_positive:
+        assert expected_score >= threshold, (
+            f"Reference score {expected_score:.4f} is below positive threshold {threshold:.3f} — "
+            "was the reference generated on the correct tile?"
+        )
+    else:
+        assert expected_score < threshold, (
+            f"Reference score {expected_score:.4f} is above negative threshold {threshold:.3f} — "
+            "was the reference generated on the correct tile?"
+        )
 
     run_binary_classifier_case(
         model_id=model_id,
diff --git a/tests/model_snapshots/test_prov_gigapath_model_snapshot.py b/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
index 9f6aad6..0df6f49 100644
--- a/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
+++ b/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
@@ -5,7 +5,7 @@
 from tests.model_snapshots._shared import run_embed_case
 
 
-@pytest.mark.parametrize(
+@pytest.mark.parametrize(  # type: ignore[untyped-decorator]
     "label, slide_path, x, y",
     [
         (
diff --git a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
index 8ff06cc..50b638c 100644
--- a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
+++ b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
@@ -5,7 +5,7 @@
 from tests.model_snapshots._shared import run_semantic_segmentation_case
 
 
-@pytest.mark.parametrize(
+@pytest.mark.parametrize(  # type: ignore[untyped-decorator]
     "label, slide_path, x, y",
     [
         (
diff --git a/tests/model_snapshots/test_virchow2_model_snapshot.py b/tests/model_snapshots/test_virchow2_model_snapshot.py
index 4714005..005cad1 100644
--- a/tests/model_snapshots/test_virchow2_model_snapshot.py
+++ b/tests/model_snapshots/test_virchow2_model_snapshot.py
@@ -5,7 +5,7 @@
 from tests.model_snapshots._shared import run_embed_case
 
 
-@pytest.mark.parametrize(
+@pytest.mark.parametrize(  # type: ignore[untyped-decorator]
     "label, slide_path, x, y",
     [
         (
diff --git a/tests/perf_throughput.py b/tests/perf_throughput.py
index 06fa89e..e76ba04 100644
--- a/tests/perf_throughput.py
+++ b/tests/perf_throughput.py
@@ -6,6 +6,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from threading import Lock
+from typing import TypedDict
 
 import numpy as np
 from rationai import Client
@@ -20,6 +21,20 @@
 POOL_SIZE_DEFAULT = 64
 
 
+class ModelResult(TypedDict):
+    name: str
+    model_type: str
+    tile_size: int
+    elapsed_s: float
+    ok: int
+    fail_503: int
+    fail_other: int
+    throughput: float
+    p50: float
+    p95: float
+    p99: float
+
+
 @dataclass
 class Stats:
     ok: int = 0
@@ -41,7 +56,7 @@ def percentile(self, p: float) -> float:
 def _models_base_url() -> str:
     return os.environ.get(
         "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+        "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
     )
 
 
@@ -143,7 +158,7 @@ def run_model(
     timeout: float,
     pool_size: int,
     models_base_url: str,
-) -> dict[str, object]:
+) -> ModelResult:
     if pool_size <= 0:
         raise ValueError("pool_size must be > 0")
 
@@ -248,7 +263,7 @@ def main() -> None:
     print(f"Timeout:         {args.timeout}s")
     print()
 
-    results = []
+    results: list[ModelResult] = []
     for name, model_type, tile_size in models:
         if args.wait_ready:
             wait_for_ready(
diff --git a/uv.lock b/uv.lock
index 3ac8fef..ecad61f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1235,6 +1235,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" },
 ]
 
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" },
+]
+
 [[package]]
 name = "itsdangerous"
 version = "2.2.0"
@@ -1777,6 +1786,7 @@ dependencies = [
 [package.dev-dependencies]
 dev = [
     { name = "mypy" },
+    { name = "pytest" },
     { name = "ruff" },
 ]
 docs = [
@@ -1798,6 +1808,7 @@ requires-dist = [
 [package.metadata.requires-dev]
 dev = [
     { name = "mypy", specifier = ">=1.18.2" },
+    { name = "pytest", specifier = ">=8.4.2" },
     { name = "ruff", specifier = ">=0.14.6" },
 ]
 docs = [
@@ -2570,6 +2581,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/75/a6/a0a304dc33b49145b21f4808d763822111e67d1c3a32b524a1baf947b6e1/platformdirs-4.9.6-py3-none-any.whl", hash = "sha256:e61adb1d5e5cb3441b4b7710bea7e4c12250ca49439228cc1021c00dcfac0917", size = 21348, upload-time = "2026-04-09T00:04:09.463Z" },
 ]
 
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" },
+]
+
 [[package]]
 name = "prometheus-client"
 version = "0.25.0"
@@ -2983,6 +3003,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/15/73/a7141a1a0559bf1a7aa42a11c879ceb19f02f5c6c371c6d57fd86cefd4d1/pyproj-3.7.2-cp314-cp314t-win_arm64.whl", hash = "sha256:d9d25bae416a24397e0d85739f84d323b55f6511e45a522dd7d7eae70d10c7e4", size = 6391844, upload-time = "2025-08-14T12:05:40.745Z" },
 ]
 
+[[package]]
+name = "pytest"
+version = "9.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" },
+]
+
 [[package]]
 name = "python-dateutil"
 version = "2.9.0.post0"

From ab16f2379f2cc01fe3bc070d0f8f1357005f759c Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 31 May 2026 14:10:05 +0200
Subject: [PATCH 34/35] Review fixes: configurable refs dir, early shape checks, per-call timeouts

---
 tests/model_snapshots/_shared.py              | 18 ++++++----
 tests/model_snapshots/generate_references.py  | 34 +++++++------------
 .../test_binary_classifier_model_snapshot.py  | 25 +++-----------
 .../test_prov_gigapath_model_snapshot.py      |  8 ++---
 ...st_semantic_segmentation_model_snapshot.py |  8 ++---
 .../test_virchow2_model_snapshot.py           |  8 ++---
 tests/perf_throughput.py                      | 21 +++---------
 7 files changed, 42 insertions(+), 80 deletions(-)

diff --git a/tests/model_snapshots/_shared.py b/tests/model_snapshots/_shared.py
index 122f2f7..ed40070 100644
--- a/tests/model_snapshots/_shared.py
+++ b/tests/model_snapshots/_shared.py
@@ -13,10 +13,14 @@
 def _models_base_url() -> str:
     return os.environ.get(
         "MODEL_SERVICE_MODELS_BASE_URL",
-        "http://rayservice-model-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
+        "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
     )
 
 
+def test_refs_dir() -> Path:
+    return Path(os.environ.get("MODEL_SERVICE_TEST_REFS_DIR", "/mnt/test_refs"))
+
+
 def _read_tile_at(
     slide_path: str, x: int, y: int, tile_size: int, level: int
 ) -> NDArray[np.uint8]:
@@ -99,6 +103,9 @@ def run_semantic_segmentation_case(
     with Client(models_base_url=_models_base_url(), timeout=timeout_s) as client:
         actual = np.asarray(client.models.segment_image(model=model_id, image=tile))
 
+    if actual.shape != expected.shape:
+        pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
+
     max_diff = np.abs(actual.astype(np.float32) - expected.astype(np.float32)).max()
 
     if actual.ndim == 4:
@@ -115,9 +122,6 @@ def run_semantic_segmentation_case(
     frac_05 = float((stats_slice >= 0.5).mean())
     name = case_name or "case"
 
-    if actual.shape != expected.shape:
-        pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
-
     close_mask = np.isclose(actual, expected, rtol=rtol, atol=atol)
     if not close_mask.all():
         mismatch_fraction = float((~close_mask).mean())
@@ -183,6 +187,9 @@ def run_embed_case(
             .astype(np.float32)
         )
 
+    if actual.shape != expected.shape:
+        pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
+
     similarity = float(
         np.dot(actual, expected) / (np.linalg.norm(actual) * np.linalg.norm(expected))
     )
@@ -190,9 +197,6 @@ def run_embed_case(
     expected_norm = float(np.linalg.norm(expected))
     name = case_name or "case"
 
-    if actual.shape != expected.shape:
-        pytest.fail(f"Shape mismatch: expected={expected.shape}, actual={actual.shape}")
-
     if similarity < min_cosine_similarity:
         pytest.fail(
             f"Embedding similarity too low: {similarity:.6f} < {min_cosine_similarity}"
diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 99ed0ee..001ca5b 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -1,36 +1,22 @@
 import json
 import os
-from pathlib import Path
-from typing import TypedDict
 
 import numpy as np
 from rationai import Client
 
-from tests.model_snapshots._shared import _read_tile_at
+from tests.model_snapshots._shared import _read_tile_at, test_refs_dir
 
 
-OUT_DIR = Path("/mnt/test_refs")
+OUT_DIR = test_refs_dir()
 MODELS_BASE_URL = os.environ.get(
     "MODEL_SERVICE_MODELS_BASE_URL",
     "http://rayservice-model-tests-serve-svc.rationai-jobs-ns.svc.cluster.local:8000",
 )
 BINARY_POSITIVE_THRESHOLD = 0.5
 
-
-class CaseConfig(TypedDict):
-    label: str
-    slide_path: str
-    model_id: str
-    type: str
-    tile_size: int
-    level: int
-    x: int
-    y: int
-
-
 # Keep only one active case here. Store other candidate slides in new_images.txt
 # and swap them in when you want to regenerate a different reference.
-ACTIVE_CASE: CaseConfig = {
+ACTIVE_CASE = {
     "label": "prov-gigapath",
     "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
     "model_id": "prov-gigapath",
@@ -41,7 +27,7 @@ class CaseConfig(TypedDict):
     "y": 70000,
 }
 
-CASES: list[CaseConfig] = [ACTIVE_CASE]
+CASES = [ACTIVE_CASE]
 
 
 def generate_references() -> None:
@@ -68,7 +54,9 @@ def generate_references() -> None:
             try:
                 if mtype == "binary":
                     score = float(
-                        client.models.classify_image(model=model_id, image=tile)
+                        client.models.classify_image(
+                            model=model_id, image=tile, timeout=600
+                        )
                     )
                     out_file = OUT_DIR / f"{label}_{model_id}_expected.json"
                     with out_file.open("w") as f:
@@ -93,7 +81,9 @@ def generate_references() -> None:
 
                 elif mtype == "semantic":
                     arr = np.asarray(
-                        client.models.segment_image(model=model_id, image=tile)
+                        client.models.segment_image(
+                            model=model_id, image=tile, timeout=1200
+                        )
                     )
                     out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
                     np.save(out_file, arr)
@@ -101,7 +91,9 @@ def generate_references() -> None:
 
                 elif mtype == "embed":
                     arr = np.asarray(
-                        client.models.embed_image(model=model_id, image=tile)
+                        client.models.embed_image(
+                            model=model_id, image=tile, timeout=1200
+                        )
                     )
                     out_file = OUT_DIR / f"{label}_{model_id}_expected.npy"
                     np.save(out_file, arr)
diff --git a/tests/model_snapshots/test_binary_classifier_model_snapshot.py b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
index dde86eb..fcbef95 100644
--- a/tests/model_snapshots/test_binary_classifier_model_snapshot.py
+++ b/tests/model_snapshots/test_binary_classifier_model_snapshot.py
@@ -1,38 +1,35 @@
 import json
-from pathlib import Path
 
 import pytest
 
-from tests.model_snapshots._shared import run_binary_classifier_case
+from tests.model_snapshots._shared import run_binary_classifier_case, test_refs_dir
 
 
 BINARY_POSITIVE_THRESHOLD = 0.5
 
 
-@pytest.mark.parametrize(  # type: ignore[untyped-decorator]
-    "label, slide_path, x, y, is_positive",
+@pytest.mark.parametrize(
+    "label, slide_path, x, y",
     [
         (
             "prostate_positive",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_2386-06-1.mrxs",
             43390,
             45865,
-            True,
         ),
         (
             "prostate_negative",
             "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_0845-02-0.mrxs",
             32950,
             108990,
-            False,
         ),
     ],
 )
 def test_prostate_classifier_snapshot(
-    label: str, slide_path: str, x: int, y: int, is_positive: bool
+    label: str, slide_path: str, x: int, y: int
 ) -> None:
     model_id = "prostate-classifier-1"
-    json_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.json")
+    json_path = test_refs_dir() / f"{label}_{model_id}_expected.json"
 
     with json_path.open() as f:
         reference = json.load(f)
@@ -41,18 +38,6 @@ def test_prostate_classifier_snapshot(
     threshold = reference.get("threshold", BINARY_POSITIVE_THRESHOLD)
     expected_is_positive = reference.get("expected_is_positive")
     assert expected_is_positive is not None
-    assert expected_is_positive == is_positive
-
-    if is_positive:
-        assert expected_score >= threshold, (
-            f"Reference score {expected_score:.4f} is below positive threshold {threshold:.3f} — "
-            "was the reference generated on the correct tile?"
-        )
-    else:
-        assert expected_score < threshold, (
-            f"Reference score {expected_score:.4f} is above negative threshold {threshold:.3f} — "
-            "was the reference generated on the correct tile?"
-        )
 
     run_binary_classifier_case(
         model_id=model_id,
diff --git a/tests/model_snapshots/test_prov_gigapath_model_snapshot.py b/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
index 0df6f49..f206d19 100644
--- a/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
+++ b/tests/model_snapshots/test_prov_gigapath_model_snapshot.py
@@ -1,11 +1,9 @@
-from pathlib import Path
-
 import pytest
 
-from tests.model_snapshots._shared import run_embed_case
+from tests.model_snapshots._shared import run_embed_case, test_refs_dir
 
 
-@pytest.mark.parametrize(  # type: ignore[untyped-decorator]
+@pytest.mark.parametrize(
     "label, slide_path, x, y",
     [
         (
@@ -18,7 +16,7 @@
 )
 def test_prov_gigapath(label: str, slide_path: str, x: int, y: int) -> None:
     model_id = "prov-gigapath"
-    expected_array_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.npy")
+    expected_array_path = test_refs_dir() / f"{label}_{model_id}_expected.npy"
 
     run_embed_case(
         model_id=model_id,
diff --git a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
index 50b638c..7022871 100644
--- a/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
+++ b/tests/model_snapshots/test_semantic_segmentation_model_snapshot.py
@@ -1,11 +1,9 @@
-from pathlib import Path
-
 import pytest
 
-from tests.model_snapshots._shared import run_semantic_segmentation_case
+from tests.model_snapshots._shared import run_semantic_segmentation_case, test_refs_dir
 
 
-@pytest.mark.parametrize(  # type: ignore[untyped-decorator]
+@pytest.mark.parametrize(
     "label, slide_path, x, y",
     [
         (
@@ -18,7 +16,7 @@
 )
 def test_semantic_episeg(label: str, slide_path: str, x: int, y: int) -> None:
     model_id = "episeg-1"
-    expected_array_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.npy")
+    expected_array_path = test_refs_dir() / f"{label}_{model_id}_expected.npy"
 
     run_semantic_segmentation_case(
         model_id=model_id,
diff --git a/tests/model_snapshots/test_virchow2_model_snapshot.py b/tests/model_snapshots/test_virchow2_model_snapshot.py
index 005cad1..8fa79d8 100644
--- a/tests/model_snapshots/test_virchow2_model_snapshot.py
+++ b/tests/model_snapshots/test_virchow2_model_snapshot.py
@@ -1,11 +1,9 @@
-from pathlib import Path
-
 import pytest
 
-from tests.model_snapshots._shared import run_embed_case
+from tests.model_snapshots._shared import run_embed_case, test_refs_dir
 
 
-@pytest.mark.parametrize(  # type: ignore[untyped-decorator]
+@pytest.mark.parametrize(
     "label, slide_path, x, y",
     [
         (
@@ -18,7 +16,7 @@
 )
 def test_virchow2(label: str, slide_path: str, x: int, y: int) -> None:
     model_id = "virchow2"
-    expected_array_path = Path(f"/mnt/test_refs/{label}_{model_id}_expected.npy")
+    expected_array_path = test_refs_dir() / f"{label}_{model_id}_expected.npy"
 
     run_embed_case(
         model_id=model_id,
diff --git a/tests/perf_throughput.py b/tests/perf_throughput.py
index e76ba04..037f78d 100644
--- a/tests/perf_throughput.py
+++ b/tests/perf_throughput.py
@@ -6,7 +6,6 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from threading import Lock
-from typing import TypedDict
 
 import numpy as np
 from rationai import Client
@@ -21,20 +20,6 @@
 POOL_SIZE_DEFAULT = 64
 
 
-class ModelResult(TypedDict):
-    name: str
-    model_type: str
-    tile_size: int
-    elapsed_s: float
-    ok: int
-    fail_503: int
-    fail_other: int
-    throughput: float
-    p50: float
-    p95: float
-    p99: float
-
-
 @dataclass
 class Stats:
     ok: int = 0
@@ -102,6 +87,8 @@ def wait_for_ready(
                 print(f"{model_id} ready after {time.perf_counter() - start:.1f}s")
             return
         except Exception as exc:
+            if isinstance(exc, ValueError):
+                raise
             status_code = getattr(getattr(exc, "response", None), "status_code", None)
             if status_code not in (None, 503, 504):
                 raise
@@ -158,7 +145,7 @@ def run_model(
     timeout: float,
     pool_size: int,
     models_base_url: str,
-) -> ModelResult:
+) -> dict[str, object]:
     if pool_size <= 0:
         raise ValueError("pool_size must be > 0")
 
@@ -263,7 +250,7 @@ def main() -> None:
     print(f"Timeout:         {args.timeout}s")
     print()
 
-    results: list[ModelResult] = []
+    results = []
     for name, model_type, tile_size in models:
         if args.wait_ready:
             wait_for_ready(

From 11c1dcd36f286e9ddb0a6fa404f810091ad54da3 Mon Sep 17 00:00:00 2001
From: JiriStipek <567776@mail.muni.cz>
Date: Sun, 31 May 2026 14:13:30 +0200
Subject: [PATCH 35/35] Re-add CaseConfig and ModelResult TypedDict classes

---
 tests/model_snapshots/generate_references.py | 17 +++++++++++++++--
 tests/perf_throughput.py                     | 19 +++++++++++++++++--
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/tests/model_snapshots/generate_references.py b/tests/model_snapshots/generate_references.py
index 001ca5b..24c49b5 100644
--- a/tests/model_snapshots/generate_references.py
+++ b/tests/model_snapshots/generate_references.py
@@ -1,5 +1,6 @@
 import json
 import os
+from typing import TypedDict
 
 import numpy as np
 from rationai import Client
@@ -14,9 +15,21 @@
 )
 BINARY_POSITIVE_THRESHOLD = 0.5
 
+
+class CaseConfig(TypedDict):
+    label: str
+    slide_path: str
+    model_id: str
+    type: str
+    tile_size: int
+    level: int
+    x: int
+    y: int
+
+
 # Keep only one active case here. Store other candidate slides in new_images.txt
 # and swap them in when you want to regenerate a different reference.
-ACTIVE_CASE = {
+ACTIVE_CASE: CaseConfig = {
     "label": "prov-gigapath",
     "slide_path": "/mnt/data/MOU/prostate/tile_level_annotations/P-2016_1367-01-0.mrxs",
     "model_id": "prov-gigapath",
@@ -27,7 +40,7 @@
     "y": 70000,
 }
 
-CASES = [ACTIVE_CASE]
+CASES: list[CaseConfig] = [ACTIVE_CASE]
 
 
 def generate_references() -> None:
diff --git a/tests/perf_throughput.py b/tests/perf_throughput.py
index 037f78d..2be004d 100644
--- a/tests/perf_throughput.py
+++ b/tests/perf_throughput.py
@@ -6,6 +6,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field
 from threading import Lock
+from typing import TypedDict
 
 import numpy as np
 from rationai import Client
@@ -20,6 +21,20 @@
 POOL_SIZE_DEFAULT = 64
 
 
+class ModelResult(TypedDict):
+    name: str
+    model_type: str
+    tile_size: int
+    elapsed_s: float
+    ok: int
+    fail_503: int
+    fail_other: int
+    throughput: float
+    p50: float
+    p95: float
+    p99: float
+
+
 @dataclass
 class Stats:
     ok: int = 0
@@ -145,7 +160,7 @@ def run_model(
     timeout: float,
     pool_size: int,
     models_base_url: str,
-) -> dict[str, object]:
+) -> ModelResult:
     if pool_size <= 0:
         raise ValueError("pool_size must be > 0")
 
@@ -250,7 +265,7 @@ def main() -> None:
     print(f"Timeout:         {args.timeout}s")
     print()
 
-    results = []
+    results: list[ModelResult] = []
     for name, model_type, tile_size in models:
         if args.wait_ready:
             wait_for_ready(