From 4667a4d0517a99c56f275bf6d5f0a3ab56ecab2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20C=C3=ADfka?= <550433@mail.muni.cz> Date: Sun, 14 Jun 2026 23:21:57 +0200 Subject: [PATCH] chore: update embedding run IDs and centralize them per model Repoint Virchow2 and ProvGigaPath embedding configs at the new embedding runs and remove the storage disparity between them. Previously the Virchow2 run ID lived in dataset.yaml while the ProvGigaPath ID was a literal duplicated across three experiment configs. Replace both with a single per-model map in dataset.yaml: mlflow_artifacts.embedding_run_ids: virchow2: 262d680f52374614baaf19f1e4d05940 provgigapath: 5099ce57e886411dababfc1234a88b23 All embedding configs now interpolate from this map, so each run ID has one source of truth. No consumer code changes: nothing in Python reads mlflow_artifacts.embedding_run_id; resolution is pure Hydra interpolation. Also expand the README with repository structure, setup, and usage sections. Co-Authored-By: Claude Opus 4.8 --- README.md | 43 ++++++++++++++++++- configs/data/dataset.yaml | 4 +- .../ml/final_linear_provgigapath_adamw.yaml | 2 +- .../ml/final_linear_provgigapath_lbfgs.yaml | 2 +- ...linear_provgigapath_adamw_group_kfold.yaml | 2 +- configs/ml/task/final_linear_classifier.yaml | 2 +- configs/ml/task/kfold_linear_classifier.yaml | 2 +- 7 files changed, 49 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 72003cd..df94b77 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,42 @@ # Tissue Classification -* repository for classification of various tissue types using WSIs -* detailed description can be found on [Youtrack](https://youtrack.rationai.cloud.e-infra.cz/articles/TC) \ No newline at end of file + +This repository contains a pipeline for tissue classification in whole-slide +images (WSIs). It includes preprocessing utilities, dataset splitting tools, +machine learning training and evaluation code, Hydra configurations, and +submission scripts for running individual pipeline stages. + +## Repository Structure + +- `preprocessing/` contains source code for WSI preprocessing, tiling, mask + generation, embedding extraction, quality control, and related statistics. +- `ml/` contains machine learning code for data loading, model training, + evaluation, callbacks, and prediction outputs. +- `split/` contains utilities for train/test and k-fold dataset splitting. +- `configs/` contains Hydra configuration files for preprocessing, splitting, + logging, datasets, and machine learning experiments. +- `scripts/` contains executable submission scripts for running pipeline stages. +- `pyproject.toml` and `uv.lock` define the Python project metadata and locked + dependencies. +- `LICENSE` contains the MIT License for this repository. + +## Setup + +The project uses Python 3.12 and `uv` for dependency management. + +```bash +uv sync +``` + +## Usage + +Pipeline stages can be run through the corresponding scripts in `scripts/`. +For example: + +```bash +uv run python scripts/submit_tiling.py +uv run python scripts/submit_embeddings.py +uv run python scripts/submit_train_linear_probe.py +``` + +Configuration is managed with Hydra. Base configurations are stored in +`configs/`, with task-specific configurations grouped by pipeline stage. diff --git a/configs/data/dataset.yaml b/configs/data/dataset.yaml index f061371..766849f 100644 --- a/configs/data/dataset.yaml +++ b/configs/data/dataset.yaml @@ -16,7 +16,9 @@ dataset: filter_tiles_run_id: "4e8f5d3c82124ea5a8f871a42d3ed9ba" stratified_kfold_run_id: "850c81506684450b9af92296acfd045a" stratified_group_kfold_run_id: "382b41d2fa894514908e8067949c4326" - embedding_run_id: "c325e3a5033b4077b6febb0e3e6b0bd6" + embedding_run_ids: + virchow2: "262d680f52374614baaf19f1e4d05940" + provgigapath: "5099ce57e886411dababfc1234a88b23" tissue_masks_run_id: "52bc0924f8624b259819c480c7cf213f" tissue_stats_run_id: "16ae2d003d88471b924e5f332415232a" diff --git a/configs/experiment/ml/final_linear_provgigapath_adamw.yaml b/configs/experiment/ml/final_linear_provgigapath_adamw.yaml index 2de2584..fb54dc6 100644 --- a/configs/experiment/ml/final_linear_provgigapath_adamw.yaml +++ b/configs/experiment/ml/final_linear_provgigapath_adamw.yaml @@ -6,7 +6,7 @@ defaults: embedding_model_name: ProvGigaPath embedding_dim: 1536 -embedding_run_id: 410c8672471348ceb4c58817f70fa097 +embedding_run_id: ${dataset.mlflow_artifacts.embedding_run_ids.provgigapath} kfold_strategy: stratified_group kfold_run_id: ${dataset.mlflow_artifacts.stratified_group_kfold_run_id} mlflow_artifact_path: linear_classifier_final_provgigapath diff --git a/configs/experiment/ml/final_linear_provgigapath_lbfgs.yaml b/configs/experiment/ml/final_linear_provgigapath_lbfgs.yaml index 49c71f3..85c8171 100644 --- a/configs/experiment/ml/final_linear_provgigapath_lbfgs.yaml +++ b/configs/experiment/ml/final_linear_provgigapath_lbfgs.yaml @@ -6,7 +6,7 @@ defaults: embedding_model_name: ProvGigaPath embedding_dim: 1536 -embedding_run_id: 410c8672471348ceb4c58817f70fa097 +embedding_run_id: ${dataset.mlflow_artifacts.embedding_run_ids.provgigapath} kfold_strategy: stratified_group kfold_run_id: ${dataset.mlflow_artifacts.stratified_group_kfold_run_id} mlflow_artifact_path: linear_classifier_final_provgigapath diff --git a/configs/experiment/ml/train_linear_provgigapath_adamw_group_kfold.yaml b/configs/experiment/ml/train_linear_provgigapath_adamw_group_kfold.yaml index 797f6e0..30b700a 100644 --- a/configs/experiment/ml/train_linear_provgigapath_adamw_group_kfold.yaml +++ b/configs/experiment/ml/train_linear_provgigapath_adamw_group_kfold.yaml @@ -6,7 +6,7 @@ defaults: embedding_model_name: ProvGigaPath embedding_dim: 1536 -embedding_run_id: 410c8672471348ceb4c58817f70fa097 +embedding_run_id: ${dataset.mlflow_artifacts.embedding_run_ids.provgigapath} mlflow_artifact_path: linear_classifier_provgigapath metadata: diff --git a/configs/ml/task/final_linear_classifier.yaml b/configs/ml/task/final_linear_classifier.yaml index 5e8f3b3..a33638e 100644 --- a/configs/ml/task/final_linear_classifier.yaml +++ b/configs/ml/task/final_linear_classifier.yaml @@ -12,7 +12,7 @@ mode: fit embedding_model_name: Virchow2 embedding_dim: 2560 -embedding_run_id: ${dataset.mlflow_artifacts.embedding_run_id} +embedding_run_id: ${dataset.mlflow_artifacts.embedding_run_ids.virchow2} kfold_strategy: stratified kfold_run_id: ${dataset.mlflow_artifacts.stratified_kfold_run_id} filter_tiles_run_id: ${dataset.mlflow_artifacts.filter_tiles_run_id} diff --git a/configs/ml/task/kfold_linear_classifier.yaml b/configs/ml/task/kfold_linear_classifier.yaml index 6b6951b..dd0a05d 100644 --- a/configs/ml/task/kfold_linear_classifier.yaml +++ b/configs/ml/task/kfold_linear_classifier.yaml @@ -12,7 +12,7 @@ mode: fit embedding_model_name: Virchow2 embedding_dim: 2560 -embedding_run_id: ${dataset.mlflow_artifacts.embedding_run_id} +embedding_run_id: ${dataset.mlflow_artifacts.embedding_run_ids.virchow2} kfold_strategy: stratified kfold_run_id: ${dataset.mlflow_artifacts.stratified_kfold_run_id} filter_tiles_run_id: ${dataset.mlflow_artifacts.filter_tiles_run_id}