From d61d65b6525811209b41d2e3a0b7caee8a38e5f1 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Fri, 27 Feb 2026 10:14:53 +0100
Subject: [PATCH 01/76] initial implementation

---
 Cargo.lock                                    |   9 +
 docs/specs/rerandomization.md                 | 244 ++++++++++
 .../continuous-rerand-local.sh                | 112 +++++
 .../bin/iris-mpc-upgrade/rerandomize_db.rs    | 104 ++---
 .../iris-mpc-upgrade/run-rerand-e2e-tests.sh  |  53 +++
 iris-mpc-bins/bin/iris-mpc/server.rs          |  14 +-
 iris-mpc-common/src/helpers/sync.rs           |  20 +
 iris-mpc-store/src/lib.rs                     |   1 +
 iris-mpc-store/src/rerand.rs                  | 441 ++++++++++++++++++
 iris-mpc-upgrade/Cargo.toml                   |  12 +
 iris-mpc-upgrade/src/config.rs                |  34 ++
 iris-mpc-upgrade/src/continuous_rerand.rs     | 295 ++++++++++++
 iris-mpc-upgrade/src/epoch.rs                 | 223 +++++++++
 iris-mpc-upgrade/src/lib.rs                   |   3 +
 iris-mpc-upgrade/src/rerandomization.rs       |  59 ++-
 iris-mpc-upgrade/src/s3_coordination.rs       | 279 +++++++++++
 .../tests/continuous_rerand_e2e.rs            | 312 +++++++++++++
 iris-mpc-upgrade/tests/test_utils.rs          | 371 +++++++++++++++
 iris-mpc/src/server/mod.rs                    |  13 +
 .../20260226000001_add_rerand_epoch.down.sql  |  14 +
 .../20260226000001_add_rerand_epoch.up.sql    |  15 +
 ...0226000002_create_rerand_progress.down.sql |   1 +
 ...260226000002_create_rerand_progress.up.sql |   8 +
 23 files changed, 2566 insertions(+), 71 deletions(-)
 create mode 100644 docs/specs/rerandomization.md
 create mode 100755 iris-mpc-bins/bin/iris-mpc-upgrade/continuous-rerand-local.sh
 create mode 100755 iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
 create mode 100644 iris-mpc-store/src/rerand.rs
 create mode 100644 iris-mpc-upgrade/src/continuous_rerand.rs
 create mode 100644 iris-mpc-upgrade/src/epoch.rs
 create mode 100644 iris-mpc-upgrade/src/s3_coordination.rs
 create mode 100644 iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
 create mode 100644 iris-mpc-upgrade/tests/test_utils.rs
 create mode 100644 migrations/20260226000001_add_rerand_epoch.down.sql
 create mode 100644 migrations/20260226000001_add_rerand_epoch.up.sql
 create mode 100644 migrations/20260226000002_create_rerand_progress.down.sql
 create mode 100644 migrations/20260226000002_create_rerand_progress.up.sql

diff --git a/Cargo.lock b/Cargo.lock
index c0aff9899b..b11660e34b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3183,12 +3183,18 @@ dependencies = [
  "ark-ec",
  "ark-ff",
  "ark-serialize",
+ "aws-config",
+ "aws-sdk-s3",
+ "aws-sdk-secretsmanager",
  "axum 0.7.7",
+ "base64",
  "blake3",
  "bytemuck",
  "clap",
  "criterion",
+ "dotenvy",
  "eyre",
+ "futures",
  "iris-mpc-common",
  "iris-mpc-store",
  "itertools 0.13.0",
@@ -3198,9 +3204,12 @@ dependencies = [
  "rayon",
  "serde",
  "serde-big-array",
+ "serde_json",
  "sha2",
+ "sqlx",
  "thiserror 1.0.65",
  "tokio",
+ "tokio-util",
  "tonic",
  "tonic-build",
  "tracing",
diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
new file mode 100644
index 0000000000..d541aeef29
--- /dev/null
+++ b/docs/specs/rerandomization.md
@@ -0,0 +1,244 @@
+# Continuous Rerandomization Plan
+
+## Overview
+
+Replaces the existing, one-off rerandomization protocol by a continuous, online process that rerandomizes shares while the system is running. No downtime or restart required.
+
+Key design decision: in-memory shares are less likely to be exfiltrated, so only the DB (at-rest persistence) is rerandomized. The actor is completely unmodified. The rerand server handles everything, writing to a staging schema and then copying to live once all parties confirm.
+
+## Architecture
+
+1. **Rerand Server** (modified `iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs`, separate process, one per party) — rerandomizes shares, writes to staging, coordinates with peers via S3 markers, copies confirmed chunks to live DB. Replaces the existing one-off `RerandomizeDb` subcommand with a new `RerandomizeContinuous` subcommand. Core rerandomization logic in `iris-mpc-upgrade/src/rerandomization.rs` is reused; the new subcommand adds the continuous loop, S3 coordination, and staging management.
+2. **Main Server** (existing, minimal changes) — at startup, syncs rerand progress with peers and catches up any missing chunks from staging before loading the DB into memory.
+
+The GPU actor, batch processing, and result processor are completely untouched.
+
+## Seed & Randomness
+
+One epoch is active at a time. At the start of each epoch:
+
+1. Each rerand server generates a fresh BLS12-381 keypair
+2. Private key is saved to Secrets Manager at `{env}/iris-mpc-db-rerandomization/epoch-{E}/private-key-party-{P}`
+3. Public key is uploaded to S3 at `s3://bucket/rerand/epoch-{E}/party-{P}/public-key`
+4. Each rerand server downloads the other two parties' public keys from S3 (polling until all present)
+5. Each derives the same 32-byte `shared_secret` via the BLS12-381 pairing
+
+Only the rerand server needs access to the key. The main server never touches it.
+
+### Keygen is idempotent on restart
+
+When starting an epoch, the rerand server:
+
+1. Checks if an epoch-scoped private key already exists in Secrets Manager at `{env}/iris-mpc-db-rerandomization/epoch-{E}/private-key-party-{P}`
+2. If yes: loads it, derives the public key, and uploads the public key to S3 if not already present (covers crash-after-SM-write-before-S3-upload)
+3. If no: generates a new keypair, saves the private key to Secrets Manager first, then uploads the public key to S3
+
+Secrets Manager is checked first because the private key is written to SM before the public key is uploaded to S3. If we crash between the two writes, on restart we find the key in SM and re-upload to S3.
+
+### Epoch transition
+
+One epoch at a time, no overlap:
+
+1. All three rerand servers finish processing all chunks for epoch E
+2. Each server uploads a completion marker: `s3://bucket/rerand/epoch-{E}/party-{P}/complete`
+3. Each server polls until all three completion markers exist
+4. Keys for epoch E are deleted from Secrets Manager — old secret is destroyed, old shares (overwritten in live DB) are unrecoverable
+5. Epoch E+1 begins: create/publish `manifest.json`, keygen, derive new `shared_secret`, start processing
+
+Old S3 markers under `epoch-{E}/` are left in place (no active cleanup). Use S3 lifecycle policies to reap old epoch prefixes after a retention period.
+
+On restart mid-epoch: private key is still in SM, public keys and markers are still in S3, `rerand_progress` table tells you the current epoch and which chunk to resume from. Re-derive `shared_secret`, continue.
+
+## S3 Coordination Bus
+
+All cross-party coordination uses S3 markers in a shared bucket. Each party writes to its own prefixed paths. Marker layout:
+
+```
+s3://bucket/rerand/epoch-{E}/party-{P}/public-key            # public key for DH
+s3://bucket/rerand/epoch-{E}/party-{P}/max-id                # party P watermark for manifest (MAX(id))
+s3://bucket/rerand/epoch-{E}/party-{P}/manifest.json         # epoch chunking manifest (party 0 writes, others read)
+s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged      # chunk K staging committed
+s3://bucket/rerand/epoch-{E}/party-{P}/complete              # epoch E fully done
+```
+
+Coordination is polling-based: a rerand server checks for peer markers by listing the S3 prefix. A few seconds of polling latency is fine for background work.
+
+Authentication: the shared bucket uses IAM prefix policies to scope write access per party. Each party can only write to `s3://bucket/rerand/epoch-*/party-{P}/*`. All parties can read/list the full `s3://bucket/rerand/epoch-{E}/` prefix to observe peer markers. The manifest is written by the designated writer (party 0) under its own prefix (`party-0/manifest.json`) and is read-only for others.
+
+## Schema Changes
+
+### New column on `irises`
+
+```sql
+ALTER TABLE irises ADD COLUMN rerand_epoch INTEGER NOT NULL DEFAULT 0;
+```
+
+### Modified `increment_version_id` trigger
+
+```sql
+CREATE OR REPLACE FUNCTION increment_version_id()
+RETURNS TRIGGER AS $$
+BEGIN
+    IF (OLD.left_code IS DISTINCT FROM NEW.left_code OR
+        OLD.left_mask IS DISTINCT FROM NEW.left_mask OR
+        OLD.right_code IS DISTINCT FROM NEW.right_code OR
+        OLD.right_mask IS DISTINCT FROM NEW.right_mask)
+       AND NEW.rerand_epoch IS NOT DISTINCT FROM OLD.rerand_epoch THEN
+        NEW.version_id = COALESCE(OLD.version_id, 0) + 1;
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+```
+
+When `rerand_epoch` changes (rerandomization), share data changes but `version_id` stays the same. When `rerand_epoch` stays the same (user-facing modification), `version_id` bumps as before.
+
+### Staging schema
+
+Each party has a staging schema (e.g. `SMPC_rerand_staging`) with:
+
+```sql
+CREATE TABLE irises (
+    epoch                INTEGER NOT NULL,
+    id                   BIGINT NOT NULL,
+    chunk_id             INTEGER NOT NULL,
+    left_code            BYTEA,
+    left_mask            BYTEA,
+    right_code           BYTEA,
+    right_mask           BYTEA,
+    original_version_id  SMALLINT,
+    rerand_epoch         INTEGER,
+    PRIMARY KEY (epoch, id)
+);
+```
+
+### Coordination table
+
+A `rerand_progress` table in each party's DB:
+
+```sql
+CREATE TABLE rerand_progress (
+    epoch           INTEGER NOT NULL,
+    chunk_id        INTEGER NOT NULL,
+    staging_written BOOLEAN NOT NULL DEFAULT FALSE,
+    all_confirmed   BOOLEAN NOT NULL DEFAULT FALSE,
+    live_applied    BOOLEAN NOT NULL DEFAULT FALSE,
+    PRIMARY KEY (epoch, chunk_id)
+);
+```
+
+Chunk ranges are derived from the manifest (`chunk_size`, `max_id_inclusive`) and `chunk_id`, so they are not stored here.
+
+Lifecycle: `staging_written` → `all_confirmed` → `live_applied`.
+
+## Flow
+
+### Step 1: Rerand Server (per party, separate process)
+
+Runs continuously:
+
+1. Determine the active epoch E and load its manifest (the highest epoch with a manifest at `s3://bucket/rerand/epoch-{E}/party-0/manifest.json` but without all three completion markers). If no manifest exists for the next epoch, create it (party 0 only): collect watermarks, compute `max_id_inclusive`, write `manifest.json`.
+2. Derive `shared_secret` for epoch E (keygen or resume — see above)
+3. Pick next chunk range `[start, end)` for chunk K from the manifest
+4. Read entries from live schema, recording each entry's `version_id`
+5. Rerandomize shares using `BLAKE3(shared_secret || iris_id)` XOF
+6. Write rerandomized shares to staging schema with `epoch = E`, `original_version_id`, `chunk_id = K`, and `rerand_epoch = E + 1`
+7. Set `staging_written = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
+8. Upload S3 marker after staging commit: `s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged`
+9. Poll S3 until all 3 party markers exist for chunk K
+10. Set `all_confirmed = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
+11. Acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection, then copy from staging to live DB, delete staging, and mark applied — all in one transaction (scoped to epoch and chunk):
+    ```sql
+    SELECT pg_advisory_lock(RERAND_APPLY_LOCK);   -- on dedicated connection
+    BEGIN;
+    UPDATE irises SET
+      left_code = staging.left_code,
+      left_mask = staging.left_mask,
+      right_code = staging.right_code,
+      right_mask = staging.right_mask,
+      rerand_epoch = staging.rerand_epoch
+    FROM staging_schema.irises AS staging
+    WHERE irises.id = staging.id
+      AND staging.epoch = E
+      AND staging.chunk_id = K
+      AND irises.version_id = staging.original_version_id;
+    DELETE FROM staging_schema.irises WHERE epoch = E AND chunk_id = K;
+    UPDATE rerand_progress SET live_applied = TRUE WHERE epoch = E AND chunk_id = K;
+    COMMIT;
+    SELECT pg_advisory_unlock(RERAND_APPLY_LOCK);  -- release after commit
+    ```
+12. Proceed to next chunk (or start epoch transition if all chunks done)
+
+### Step 2: Main Server Startup (minimal changes)
+
+At startup, before `load_iris_db`:
+
+1. **Existing**: modification sync (`sync_modifications`) — all parties catch up on modifications, producing identical `version_id` values
+2. **New**: rerand sync — parties exchange a compact rerand watermark during the existing startup sync (`SyncState` exchange):
+   - Each party computes `(epoch, max_confirmed_chunk)` from its local `rerand_progress` table: the active epoch E and the highest `chunk_id` where `all_confirmed = TRUE`. Since chunks are processed in strictly increasing order, all chunks `0..max_confirmed_chunk` are implicitly confirmed.
+   - Each party sends this single `(epoch, max_confirmed_chunk)` pair as part of `SyncState`.
+   - Each party computes `safe_up_to = max(max_confirmed_chunk_party_0, max_confirmed_chunk_party_1, max_confirmed_chunk_party_2)` for the agreed epoch E, then locally applies all chunks `0..safe_up_to` where `live_applied = FALSE`.
+   - This is safe because `all_confirmed = TRUE` at any party means that party observed all three S3 `staged` markers, which means all three parties successfully committed the chunk to their staging schemas. A slower party may not have polled S3 yet, but its staging data is already there. Using `max` ensures all parties converge to the same applied set, preventing cross-party desync where one party loads rerandomized shares and another loads stale shares.
+   - Edge case: if no chunks have been confirmed yet (fresh epoch or very start), `max_confirmed_chunk` is -1 / None. `safe_up_to` becomes -1 / None and the catch-up step is skipped entirely.
+3. **New (DB-only catch-up)**: acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection. Then for every chunk K in `0..safe_up_to` where locally `live_applied = FALSE` (in increasing order): run the same apply transaction as Step 1.11. **Keep the lock held** through step 4.
+4. **Existing**: `load_iris_db` — loads from live DB into GPU memory. The advisory lock is still held, so the rerand server cannot apply new chunks while the DB is being read into memory.
+5. Release the advisory lock: `SELECT pg_advisory_unlock(RERAND_APPLY_LOCK)` on the dedicated connection, then drop the connection.
+
+### Advisory lock: startup vs rerand server concurrency
+
+Both the rerand server (Step 1.11) and the main server startup (Steps 2.3–2.4) acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` before applying chunks. This ensures:
+
+- Only one process applies chunks at a time (no interleaving).
+- The main server holds the lock from catch-up through `load_iris_db`, so the rerand server cannot sneak in applies between catch-up and memory load.
+- If either process crashes, the connection drops and Postgres automatically releases the session-level lock. No stale locks.
+
+**Implementation with connection pools (sqlx)**: session-level advisory locks are tied to a specific Postgres connection. When using a connection pool, acquire a **dedicated connection** (`pool.acquire()`) and hold it (do not drop/return it) for the entire lock window. The catch-up queries and `load_iris_db` can use the pool normally — the dedicated connection just sits idle holding the lock. Release with `pg_advisory_unlock(...)` on the same connection after `load_iris_db` completes, then drop the connection.
+
+```rust
+let mut lock_conn = pool.acquire().await?;
+sqlx::query("SELECT pg_advisory_lock($1)")
+    .bind(RERAND_APPLY_LOCK)
+    .execute(&mut *lock_conn).await?;
+
+apply_catchup_chunks(&pool).await?;  // uses pool
+load_iris_db(&pool).await?;          // uses pool
+
+sqlx::query("SELECT pg_advisory_unlock($1)")
+    .bind(RERAND_APPLY_LOCK)
+    .execute(&mut *lock_conn).await?;
+drop(lock_conn);
+```
+
+### Why modification sync before rerand sync matters
+
+Modification sync ensures all parties have the same `version_id` values before the rerand staging copy runs. This guarantees the optimistic lock (`WHERE version_id = original_version_id`) produces the same skip set on all parties — the same entries are updated, the same entries are skipped.
+
+## Conflict Resolution: Rerandomization vs Modifications
+
+### Why the optimistic lock is needed
+
+The rerand server reads entry X at time T with `version_id = V`. A modification (reauth/deletion) may happen later, bumping `version_id` to V+1. The staging still has `original_version_id = V`. The optimistic lock prevents overwriting the modification:
+
+```sql
+UPDATE irises SET ... WHERE version_id = original_version_id;
+-- V ≠ V+1 → entry X skipped
+```
+
+### Why `rerand_epoch` and the trigger are needed
+
+Without the trigger change, the staging copy would bump `version_id` (because share data changed). The trigger change keeps `version_id` as a pure "user-facing modification counter," separate from rerandomization.
+
+## Chunking
+
+Chunk boundaries must be identical across parties for chunk K to be meaningful. Define them via an epoch manifest object in S3:
+
+- `s3://bucket/rerand/epoch-{E}/party-0/manifest.json`: `{ epoch: E, chunk_size: N, max_id_inclusive: M }`
+- Party 0 writes the manifest once at epoch start under its own prefix (IAM-compliant); other parties poll until it exists and treat it as immutable.
+- **Watermark sync**: before the manifest is written, each party P uploads its local watermark `max_id_party_P = SELECT MAX(id) FROM irises` to `s3://bucket/rerand/epoch-{E}/party-{P}/max-id`.
+- The manifest writer waits until all three `max-id` markers exist, then sets `max_id_inclusive` as:
+  - `M = min(max_id_party_0, max_id_party_1, max_id_party_2) - safety_buffer_ids`
+  - `safety_buffer_ids` is configurable (default 0 or one chunk) to avoid rerandomizing the “tip” where replication/ingest lag could differ across parties.
+- New inserts with `id > M` are left for a future epoch.
+- Chunk K corresponds to `[start, end)` where `start = 1 + K * N` and `end = min(start + N, M + 1)`.
+
+A configurable delay (`--chunk-delay`, default e.g. 5s) is inserted between chunks to avoid sustained DB load. The rerand server should not stress the live DB with continuous writes — the delay spreads the I/O over time. The delay, chunk size, and number of parallel DB connections should all be configurable via CLI flags or environment variables.
\ No newline at end of file
diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/continuous-rerand-local.sh b/iris-mpc-bins/bin/iris-mpc-upgrade/continuous-rerand-local.sh
new file mode 100755
index 0000000000..aeb3a0aa28
--- /dev/null
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/continuous-rerand-local.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+rm -rf "*.log"
+
+docker-compose -f "$SCRIPT_DIR/docker-compose.rand.yaml" down --remove-orphans -v
+docker-compose -f "$SCRIPT_DIR/docker-compose.rand.yaml" up -d
+
+sleep 10
+
+aws_local() {
+    AWS_ACCESS_KEY_ID=test AWS_SECRET_ACCESS_KEY=test AWS_DEFAULT_REGION=us-east-1 \
+    aws --endpoint-url=http://${LOCALSTACK_HOST:-localhost}:4566 "$@"
+}
+
+# Create S3 bucket for rerand coordination markers
+BUCKET_NAME=wf-smpcv2-rerand-testing
+aws_local s3api create-bucket --bucket $BUCKET_NAME --region us-east-1
+
+# Build binaries
+cargo build -p iris-mpc-bins --release --bin seed-v2-dbs --bin rerandomize-db
+
+TARGET_DIR=$(cargo metadata --format-version 1 | jq ".target_directory" -r)
+
+# Set AWS env vars for localstack
+export AWS_ACCESS_KEY_ID=test
+export AWS_SECRET_ACCESS_KEY=test
+export AWS_DEFAULT_REGION=us-east-1
+export AWS_ENDPOINT_URL="http://127.0.0.1:4566"
+
+export ENVIRONMENT="testing"
+
+# Seed DBs with initial data (using first 3 new-db containers as live DBs)
+echo "=== Seeding DBs ==="
+$TARGET_DIR/release/seed-v2-dbs \
+  --db-url-party-0 postgres://postgres:postgres@localhost:6200 \
+  --db-url-party-1 postgres://postgres:postgres@localhost:6201 \
+  --db-url-party-2 postgres://postgres:postgres@localhost:6202 \
+  --schema-name-party-0 SMPC_testing_0 \
+  --schema-name-party-1 SMPC_testing_1 \
+  --schema-name-party-2 SMPC_testing_2 \
+  --fill-to 1000 \
+  --batch-size 100
+echo "Seeding complete"
+
+# Run continuous rerandomization for all 3 parties in parallel
+echo "=== Starting continuous rerandomization ==="
+COMMON_ARGS="--chunk-size 200 --chunk-delay-secs 1 --s3-poll-interval-ms 2000 --safety-buffer-ids 0"
+
+$TARGET_DIR/release/rerandomize-db rerandomize-continuous \
+  --party-id 0 \
+  --db-url postgres://postgres:postgres@localhost:6200 \
+  --schema-name SMPC_testing_0 \
+  --s3-bucket $BUCKET_NAME \
+  --healthcheck-port 3010 \
+  $COMMON_ARGS &
+PID_0=$!
+
+$TARGET_DIR/release/rerandomize-db rerandomize-continuous \
+  --party-id 1 \
+  --db-url postgres://postgres:postgres@localhost:6201 \
+  --schema-name SMPC_testing_1 \
+  --s3-bucket $BUCKET_NAME \
+  --healthcheck-port 3011 \
+  $COMMON_ARGS &
+PID_1=$!
+
+$TARGET_DIR/release/rerandomize-db rerandomize-continuous \
+  --party-id 2 \
+  --db-url postgres://postgres:postgres@localhost:6202 \
+  --schema-name SMPC_testing_2 \
+  --s3-bucket $BUCKET_NAME \
+  --healthcheck-port 3012 \
+  $COMMON_ARGS &
+PID_2=$!
+
+echo "Rerand servers started: PIDs $PID_0, $PID_1, $PID_2"
+echo "Waiting for one epoch to complete (watching for completion markers in S3)..."
+
+# Poll until epoch 0 completion markers exist for all parties
+MAX_WAIT=300
+ELAPSED=0
+while [ $ELAPSED -lt $MAX_WAIT ]; do
+    COMPLETE=true
+    for P in 0 1 2; do
+        KEY="rerand/epoch-0/party-${P}/complete"
+        if ! aws_local s3api head-object --bucket $BUCKET_NAME --key "$KEY" >/dev/null 2>&1; then
+            COMPLETE=false
+            break
+        fi
+    done
+    if [ "$COMPLETE" = true ]; then
+        echo "=== Epoch 0 completed! ==="
+        break
+    fi
+    sleep 5
+    ELAPSED=$((ELAPSED + 5))
+    echo "Waiting... ($ELAPSED s)"
+done
+
+if [ $ELAPSED -ge $MAX_WAIT ]; then
+    echo "ERROR: Epoch 0 did not complete within ${MAX_WAIT}s"
+fi
+
+# Stop the rerand servers
+kill $PID_0 $PID_1 $PID_2 2>/dev/null || true
+wait $PID_0 $PID_1 $PID_2 2>/dev/null || true
+
+echo "=== Continuous rerandomization test finished ==="
diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
index 58910d38b3..5cbe5c51e8 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
@@ -11,19 +11,17 @@ use base64::Engine;
 use clap::Parser;
 use eyre::Result;
 use futures::TryStreamExt;
-use iris_mpc_common::galois;
-use iris_mpc_common::galois::degree4::basis::Monomial;
-use iris_mpc_common::galois::degree4::GaloisRingElement;
 use iris_mpc_common::galois_engine::degree4::{
     GaloisRingIrisCodeShare, GaloisRingTrimmedMaskCodeShare,
 };
-use iris_mpc_common::id::PartyID;
 use iris_mpc_common::postgres::{AccessMode, PostgresClient};
 use iris_mpc_store::{DbStoredIris, Store, StoredIrisRef};
 use iris_mpc_upgrade::config::{
     KeyGenConfig, ReRandomizeCheckConfig, ReRandomizeConfig, ReRandomizeDbSubCommand,
+    RerandomizeContinuousConfig,
 };
-use iris_mpc_upgrade::rerandomization::randomize_iris;
+use iris_mpc_upgrade::continuous_rerand;
+use iris_mpc_upgrade::rerandomization::{randomize_iris, reconstruct_shares};
 use iris_mpc_upgrade::tripartite_dh;
 use iris_mpc_upgrade::{
     config::ReRandomizeDbConfig,
@@ -42,6 +40,9 @@ async fn main() -> Result<()> {
         ReRandomizeDbSubCommand::RerandomizeDb(config) => rerandomize_db_main(config).await,
         ReRandomizeDbSubCommand::KeyGen(config) => keygen_main(config).await,
         ReRandomizeDbSubCommand::RerandomizeCheck(config) => rerandomize_check_main(config).await,
+        ReRandomizeDbSubCommand::RerandomizeContinuous(config) => {
+            rerandomize_continuous_main(config).await
+        }
     }
 }
 
@@ -531,6 +532,35 @@ async fn rerandomize_check_main(config: ReRandomizeCheckConfig) -> Result<()> {
     Ok(())
 }
 
+async fn rerandomize_continuous_main(config: RerandomizeContinuousConfig) -> Result<()> {
+    tracing::info!("Starting continuous rerandomization for party {}", config.party_id);
+
+    let mut background_tasks = TaskMonitor::new();
+    let healthcheck_port = config.healthcheck_port;
+    let _health_check_abort = background_tasks
+        .spawn(async move { spawn_healthcheck_server(healthcheck_port).await });
+    background_tasks.check_tasks();
+
+    let sdk_config = aws_config::from_env().load().await;
+    let s3_config = aws_sdk_s3::config::Builder::from(&sdk_config);
+    let sm_config = aws_sdk_secretsmanager::config::Builder::from(&sdk_config);
+    let s3_client = S3Client::from_conf(s3_config.build());
+    let sm_client = SecretsManagerClient::from_conf(sm_config.build());
+
+    let postgres_client = PostgresClient::new(
+        &config.db_url,
+        &config.schema_name,
+        AccessMode::ReadWrite,
+    )
+    .await?;
+    let store = Store::new(&postgres_client).await?;
+
+    continuous_rerand::run_continuous_rerand(&config, &s3_client, &sm_client, &store, None).await?;
+
+    background_tasks.abort_and_wait_for_finish().await;
+    Ok(())
+}
+
 async fn download_public_key(config: &ReRandomizeConfig, party_id: u8) -> Result<String> {
     if config.env == "testing" {
         let bucket = config.public_key_bucket_name.as_ref().ok_or_else(|| {
@@ -570,70 +600,6 @@ async fn build_read_only_store(db_url: &str, schema_name: &str) -> Result<Store>
     Store::new(&postgres_client).await
 }
 
-fn reconstruct_shares(share0: &[u16], share1: &[u16], share2: &[u16]) -> Vec<u16> {
-    let lag_01 = galois::degree4::ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(
-        PartyID::ID0,
-        PartyID::ID1,
-    );
-    let lag_10 = galois::degree4::ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(
-        PartyID::ID1,
-        PartyID::ID0,
-    );
-    let lag_02 = galois::degree4::ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(
-        PartyID::ID0,
-        PartyID::ID2,
-    );
-    let lag_20 = galois::degree4::ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(
-        PartyID::ID2,
-        PartyID::ID0,
-    );
-    let lag_12 = galois::degree4::ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(
-        PartyID::ID1,
-        PartyID::ID2,
-    );
-    let lag_21 = galois::degree4::ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(
-        PartyID::ID2,
-        PartyID::ID1,
-    );
-
-    assert!(share0.len() == share1.len() && share1.len() == share2.len());
-
-    let recon01 = share0
-        .chunks_exact(4)
-        .zip_eq(share1.chunks_exact(4))
-        .flat_map(|(a, b)| {
-            let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
-            let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
-            let c = a * lag_01 + b * lag_10;
-            c.coefs
-        })
-        .collect_vec();
-    let recon12 = share1
-        .chunks_exact(4)
-        .zip_eq(share2.chunks_exact(4))
-        .flat_map(|(a, b)| {
-            let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
-            let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
-            let c = a * lag_12 + b * lag_21;
-            c.coefs
-        })
-        .collect_vec();
-    let recon02 = share0
-        .chunks_exact(4)
-        .zip_eq(share2.chunks_exact(4))
-        .flat_map(|(a, b)| {
-            let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
-            let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
-            let c = a * lag_02 + b * lag_20;
-            c.coefs
-        })
-        .collect_vec();
-
-    assert_eq!(recon01, recon12);
-    assert_eq!(recon01, recon02);
-    recon01
-}
-
 async fn download_public_key_from_localstack(bucket: &str, party_id: u8) -> Result<String> {
     let key = format!("{}-{}", PUBLIC_KEY_S3_KEY_NAME_PREFIX, party_id);
     let request_url = format!("http://localhost:4566/{}/{}", bucket, key);
diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh b/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
new file mode 100755
index 0000000000..167f2682ae
--- /dev/null
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+#
+# Run the continuous rerandomization e2e chaos tests.
+# Starts Postgres + localstack via docker-compose, runs the Rust tests, then
+# tears everything down.
+#
+# Usage:
+#   ./run-rerand-e2e-tests.sh
+#
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+COMPOSE_FILE="$SCRIPT_DIR/docker-compose.rand.yaml"
+
+cleanup() {
+    echo "=== Tearing down containers ==="
+    docker-compose -f "$COMPOSE_FILE" down --remove-orphans -v 2>/dev/null || true
+}
+trap cleanup EXIT
+
+echo "=== Starting Postgres + localstack ==="
+docker-compose -f "$COMPOSE_FILE" down --remove-orphans -v 2>/dev/null || true
+docker-compose -f "$COMPOSE_FILE" up -d
+
+echo "Waiting for services to be ready..."
+for i in $(seq 1 30); do
+    if docker exec iris-mpc-upgrade-new-db-1-1 pg_isready -U postgres -q 2>/dev/null; then
+        break
+    fi
+    sleep 1
+done
+docker exec iris-mpc-upgrade-new-db-1-1 pg_isready -U postgres || { echo "Postgres not ready"; exit 1; }
+
+for i in $(seq 1 30); do
+    STATUS=$(docker inspect --format='{{.State.Health.Status}}' iris-mpc-upgrade-localstack-1 2>/dev/null || echo "unknown")
+    if [ "$STATUS" = "healthy" ]; then
+        break
+    fi
+    sleep 1
+done
+echo "Infrastructure ready."
+
+echo "=== Running e2e chaos tests ==="
+cd "$REPO_ROOT"
+AWS_ACCESS_KEY_ID=test \
+AWS_SECRET_ACCESS_KEY=test \
+AWS_DEFAULT_REGION=us-east-1 \
+AWS_ENDPOINT_URL=http://127.0.0.1:4566 \
+ENVIRONMENT=testing \
+    cargo test -p iris-mpc-upgrade --test continuous_rerand_e2e --features db_dependent -- --nocapture
+
+echo "=== All tests passed ==="
diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 2e9e318e6a..82d49a76e7 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -58,6 +58,7 @@ use iris_mpc_common::{
 };
 use iris_mpc_gpu::server::ServerActor;
 use iris_mpc_store::loader::load_iris_db;
+use iris_mpc_store::rerand as rerand_store;
 use iris_mpc_store::{
     fetch_and_parse_chunks, last_snapshot_timestamp, DbStoredIris, ObjectStore, S3Store,
     S3StoredIris, Store, StoredIrisRef,
@@ -981,11 +982,13 @@ async fn server_main(config: Config) -> Result<()> {
     let is_ready_flag = Arc::new(AtomicBool::new(false));
     let is_ready_flag_cloned = Arc::clone(&is_ready_flag);
 
+    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool).await.ok();
     let my_state = SyncState {
         db_len: store_len as u64,
         modifications: store.last_modifications(max_modification_lookback).await?,
         next_sns_sequence_num: next_sns_seq_number_future.await?,
         common_config: CommonConfig::from(config.clone()),
+        rerand_state,
     };
 
     tracing::info!("Sync state: {:?}", my_state);
@@ -1296,7 +1299,7 @@ async fn server_main(config: Config) -> Result<()> {
             None,
             &aws_clients,
             &shares_encryption_key_pair,
-            sync_result,
+            sync_result.clone(),
         )
         .await?;
     }
@@ -1315,6 +1318,13 @@ async fn server_main(config: Config) -> Result<()> {
         }
     }
 
+    let rerand_lock_conn = rerand_store::rerand_catchup_and_lock(
+        &store.pool,
+        &store.schema_name,
+        &sync_result,
+    )
+    .await?;
+
     if download_shutdown_handler.is_shutting_down() {
         tracing::warn!("Shutting down has been triggered");
         return Ok(());
@@ -1408,6 +1418,8 @@ async fn server_main(config: Config) -> Result<()> {
 
     let (mut handle, store) = rx.await??;
 
+    rerand_store::release_rerand_lock(rerand_lock_conn).await?;
+
     background_tasks.check_tasks();
 
     // Start thread that will be responsible for communicating back the results
diff --git a/iris-mpc-common/src/helpers/sync.rs b/iris-mpc-common/src/helpers/sync.rs
index 89c6238754..9719af4f6f 100644
--- a/iris-mpc-common/src/helpers/sync.rs
+++ b/iris-mpc-common/src/helpers/sync.rs
@@ -10,6 +10,15 @@ pub struct SyncState {
     pub modifications: Vec<Modification>,
     pub next_sns_sequence_num: Option<u128>,
     pub common_config: CommonConfig,
+    #[serde(default)]
+    pub rerand_state: Option<RerandSyncState>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RerandSyncState {
+    pub epoch: i32,
+    /// Highest chunk_id where all_confirmed = TRUE. -1 if none confirmed.
+    pub max_confirmed_chunk: i32,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -405,6 +414,7 @@ mod tests {
             modifications,
             next_sns_sequence_num: None,
             common_config: CommonConfig::from(config),
+            rerand_state: None,
         }
     }
 
@@ -846,18 +856,21 @@ mod tests {
                 modifications: vec![],
                 next_sns_sequence_num: Some(100),
                 common_config: CommonConfig::default(),
+                rerand_state: None,
             },
             SyncState {
                 db_len: 20,
                 modifications: vec![],
                 next_sns_sequence_num: Some(200),
                 common_config: CommonConfig::default(),
+                rerand_state: None,
             },
             SyncState {
                 db_len: 30,
                 modifications: vec![],
                 next_sns_sequence_num: Some(150),
                 common_config: CommonConfig::default(),
+                rerand_state: None,
             },
         ];
 
@@ -870,6 +883,7 @@ mod tests {
             modifications: vec![],
             next_sns_sequence_num: None,
             common_config: CommonConfig::default(),
+            rerand_state: None,
         };
         let all_states = vec![
             state_with_none_sequence_num.clone(),
@@ -892,18 +906,21 @@ mod tests {
                 modifications: vec![],
                 next_sns_sequence_num: None, // NodeX - advanced but empty queue
                 common_config: CommonConfig::default(),
+                rerand_state: None,
             },
             SyncState {
                 db_len: 20,
                 modifications: vec![],
                 next_sns_sequence_num: Some(123), // Other nodes still have messages
                 common_config: CommonConfig::default(),
+                rerand_state: None,
             },
             SyncState {
                 db_len: 30,
                 modifications: vec![],
                 next_sns_sequence_num: Some(123),
                 common_config: CommonConfig::default(),
+                rerand_state: None,
             },
         ];
 
@@ -1023,18 +1040,21 @@ mod tests {
                 modifications: vec![],
                 next_sns_sequence_num: Some(100),
                 common_config: CommonConfig::from(config1),
+                rerand_state: None,
             },
             SyncState {
                 db_len: 20,
                 modifications: vec![],
                 next_sns_sequence_num: Some(100),
                 common_config: CommonConfig::from(config2),
+                rerand_state: None,
             },
             SyncState {
                 db_len: 20,
                 modifications: vec![],
                 next_sns_sequence_num: Some(100),
                 common_config: CommonConfig::from(config3),
+                rerand_state: None,
             },
         ];
 
diff --git a/iris-mpc-store/src/lib.rs b/iris-mpc-store/src/lib.rs
index ffddede11a..e351f2a9b9 100644
--- a/iris-mpc-store/src/lib.rs
+++ b/iris-mpc-store/src/lib.rs
@@ -1,4 +1,5 @@
 pub mod loader;
+pub mod rerand;
 mod s3_importer;
 
 use bytemuck::cast_slice;
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
new file mode 100644
index 0000000000..557304c057
--- /dev/null
+++ b/iris-mpc-store/src/rerand.rs
@@ -0,0 +1,441 @@
+use eyre::Result;
+use iris_mpc_common::helpers::sync::{RerandSyncState, SyncResult};
+use sqlx::{pool::PoolConnection, PgPool, Postgres};
+
+pub const RERAND_APPLY_LOCK: i64 = 0x5245_5241_4E44;
+
+pub struct StagingIrisEntry {
+    pub epoch: i32,
+    pub id: i64,
+    pub chunk_id: i32,
+    pub left_code: Vec<u8>,
+    pub left_mask: Vec<u8>,
+    pub right_code: Vec<u8>,
+    pub right_mask: Vec<u8>,
+    pub original_version_id: i16,
+    pub rerand_epoch: i32,
+}
+
+#[derive(sqlx::FromRow, Debug, Clone)]
+pub struct RerandProgress {
+    pub epoch: i32,
+    pub chunk_id: i32,
+    pub staging_written: bool,
+    pub all_confirmed: bool,
+    pub live_applied: bool,
+}
+
+pub fn staging_schema_name(live_schema: &str) -> String {
+    format!("{}_rerand_staging", live_schema)
+}
+
+pub async fn ensure_staging_schema(pool: &PgPool, staging_schema: &str) -> Result<()> {
+    let create_schema = format!(r#"CREATE SCHEMA IF NOT EXISTS "{}""#, staging_schema);
+    sqlx::query(&create_schema).execute(pool).await?;
+
+    let create_table = format!(
+        r#"
+        CREATE TABLE IF NOT EXISTS "{}".irises (
+            epoch               INTEGER NOT NULL,
+            id                  BIGINT NOT NULL,
+            chunk_id            INTEGER NOT NULL,
+            left_code           BYTEA,
+            left_mask           BYTEA,
+            right_code          BYTEA,
+            right_mask          BYTEA,
+            original_version_id SMALLINT,
+            rerand_epoch        INTEGER,
+            PRIMARY KEY (epoch, id)
+        )
+        "#,
+        staging_schema,
+    );
+    sqlx::query(&create_table).execute(pool).await?;
+    Ok(())
+}
+
+pub async fn insert_staging_irises(
+    pool: &PgPool,
+    staging_schema: &str,
+    entries: &[StagingIrisEntry],
+) -> Result<()> {
+    if entries.is_empty() {
+        return Ok(());
+    }
+
+    let table = format!("\"{}\".irises", staging_schema);
+    let header = format!(
+        "INSERT INTO {} (epoch, id, chunk_id, left_code, left_mask, right_code, right_mask, original_version_id, rerand_epoch)",
+        table
+    );
+
+    let mut qb = sqlx::QueryBuilder::new(header);
+    qb.push_values(entries, |mut b, e| {
+        b.push_bind(e.epoch);
+        b.push_bind(e.id);
+        b.push_bind(e.chunk_id);
+        b.push_bind(&e.left_code);
+        b.push_bind(&e.left_mask);
+        b.push_bind(&e.right_code);
+        b.push_bind(&e.right_mask);
+        b.push_bind(e.original_version_id);
+        b.push_bind(e.rerand_epoch);
+    });
+
+    qb.push(" ON CONFLICT (epoch, id) DO NOTHING");
+    qb.build().execute(pool).await?;
+    Ok(())
+}
+
+/// Apply a confirmed staging chunk to the live DB.
+///
+/// Within a single transaction:
+///   1. UPDATE live irises from staging (optimistic lock on version_id)
+///   2. DELETE staging rows for this chunk
+///   3. Mark live_applied in rerand_progress
+pub async fn apply_staging_chunk(
+    pool: &PgPool,
+    staging_schema: &str,
+    epoch: i32,
+    chunk_id: i32,
+) -> Result<u64> {
+    let mut tx = pool.begin().await?;
+
+    let update_sql = format!(
+        r#"
+        UPDATE irises SET
+            left_code    = staging.left_code,
+            left_mask    = staging.left_mask,
+            right_code   = staging.right_code,
+            right_mask   = staging.right_mask,
+            rerand_epoch = staging.rerand_epoch
+        FROM "{}".irises AS staging
+        WHERE irises.id = staging.id
+          AND staging.epoch = $1
+          AND staging.chunk_id = $2
+          AND irises.version_id = staging.original_version_id
+        "#,
+        staging_schema,
+    );
+    let result = sqlx::query(&update_sql)
+        .bind(epoch)
+        .bind(chunk_id)
+        .execute(&mut *tx)
+        .await?;
+    let rows_updated = result.rows_affected();
+
+    let delete_sql = format!(
+        r#"DELETE FROM "{}".irises WHERE epoch = $1 AND chunk_id = $2"#,
+        staging_schema,
+    );
+    sqlx::query(&delete_sql)
+        .bind(epoch)
+        .bind(chunk_id)
+        .execute(&mut *tx)
+        .await?;
+
+    sqlx::query(
+        "UPDATE rerand_progress SET live_applied = TRUE WHERE epoch = $1 AND chunk_id = $2",
+    )
+    .bind(epoch)
+    .bind(chunk_id)
+    .execute(&mut *tx)
+    .await?;
+
+    tx.commit().await?;
+    Ok(rows_updated)
+}
+
+pub async fn upsert_rerand_progress(pool: &PgPool, epoch: i32, chunk_id: i32) -> Result<()> {
+    sqlx::query(
+        r#"
+        INSERT INTO rerand_progress (epoch, chunk_id)
+        VALUES ($1, $2)
+        ON CONFLICT (epoch, chunk_id) DO NOTHING
+        "#,
+    )
+    .bind(epoch)
+    .bind(chunk_id)
+    .execute(pool)
+    .await?;
+    Ok(())
+}
+
+pub async fn set_staging_written(pool: &PgPool, epoch: i32, chunk_id: i32) -> Result<()> {
+    sqlx::query(
+        "UPDATE rerand_progress SET staging_written = TRUE WHERE epoch = $1 AND chunk_id = $2",
+    )
+    .bind(epoch)
+    .bind(chunk_id)
+    .execute(pool)
+    .await?;
+    Ok(())
+}
+
+pub async fn set_all_confirmed(pool: &PgPool, epoch: i32, chunk_id: i32) -> Result<()> {
+    sqlx::query(
+        "UPDATE rerand_progress SET all_confirmed = TRUE WHERE epoch = $1 AND chunk_id = $2",
+    )
+    .bind(epoch)
+    .bind(chunk_id)
+    .execute(pool)
+    .await?;
+    Ok(())
+}
+
+pub async fn get_rerand_progress(
+    pool: &PgPool,
+    epoch: i32,
+    chunk_id: i32,
+) -> Result<Option<RerandProgress>> {
+    let row = sqlx::query_as::<_, RerandProgress>(
+        "SELECT epoch, chunk_id, staging_written, all_confirmed, live_applied FROM rerand_progress WHERE epoch = $1 AND chunk_id = $2",
+    )
+    .bind(epoch)
+    .bind(chunk_id)
+    .fetch_optional(pool)
+    .await?;
+    Ok(row)
+}
+
+/// Returns the highest chunk_id where all_confirmed = TRUE for a given epoch,
+/// or None if no chunks are confirmed.
+pub async fn get_max_confirmed_chunk(pool: &PgPool, epoch: i32) -> Result<Option<i32>> {
+    let row: Option<(i32,)> = sqlx::query_as(
+        "SELECT MAX(chunk_id) FROM rerand_progress WHERE epoch = $1 AND all_confirmed = TRUE",
+    )
+    .bind(epoch)
+    .fetch_optional(pool)
+    .await?;
+    match row {
+        Some((max,)) => Ok(Some(max)),
+        None => Ok(None),
+    }
+}
+
+/// Returns the highest epoch that has any rerand_progress rows.
+pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
+    let row: (Option<i32>,) =
+        sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
+            .fetch_one(pool)
+            .await?;
+    Ok(row.0)
+}
+
+/// Returns chunk_ids for a given epoch where live_applied = FALSE and
+/// chunk_id <= up_to_chunk, ordered ascending.
+pub async fn get_unapplied_chunks(
+    pool: &PgPool,
+    epoch: i32,
+    up_to_chunk: i32,
+) -> Result<Vec<i32>> {
+    let rows: Vec<(i32,)> = sqlx::query_as(
+        r#"
+        SELECT chunk_id FROM rerand_progress
+        WHERE epoch = $1 AND chunk_id <= $2 AND live_applied = FALSE
+        ORDER BY chunk_id ASC
+        "#,
+    )
+    .bind(epoch)
+    .bind(up_to_chunk)
+    .fetch_all(pool)
+    .await?;
+    Ok(rows.into_iter().map(|(id,)| id).collect())
+}
+
+// ---------------------------------------------------------------------------
+// Shared startup helpers (used by both HNSW and GPU servers)
+// ---------------------------------------------------------------------------
+
+/// Build the rerand sync state from the local `rerand_progress` table.
+pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<RerandSyncState> {
+    let epoch = get_current_epoch(pool).await?.unwrap_or(0);
+    let max_confirmed = get_max_confirmed_chunk(pool, epoch)
+        .await?
+        .unwrap_or(-1);
+    Ok(RerandSyncState {
+        epoch,
+        max_confirmed_chunk: max_confirmed,
+    })
+}
+
+/// Compute the safe-to-apply watermark from all parties' rerand sync states.
+/// Returns `Some((epoch, max_chunk_id))` if there are chunks to catch up,
+/// `None` otherwise.
+pub fn compute_rerand_safe_up_to(sync_result: &SyncResult) -> Result<Option<(i32, i32)>> {
+    let my_state = match sync_result.my_state.rerand_state.as_ref() {
+        Some(s) => s,
+        None => return Ok(None),
+    };
+    let my_epoch = my_state.epoch;
+
+    let rerand_states: Vec<&RerandSyncState> = sync_result
+        .all_states
+        .iter()
+        .filter_map(|s| s.rerand_state.as_ref())
+        .collect();
+
+    if rerand_states.is_empty() {
+        return Ok(None);
+    }
+
+    let mut safe_up_to = -1;
+    for s in rerand_states {
+        let diff = s.epoch - my_epoch;
+        match diff {
+            0 => {
+                safe_up_to = safe_up_to.max(s.max_confirmed_chunk);
+            }
+            1 => {
+                safe_up_to = i32::MAX;
+            }
+            -1 => {
+                // They are behind, they contribute -1
+            }
+            _ => {
+                eyre::bail!("Fatal epoch desync: local epoch is {}, but peer is on epoch {}", my_epoch, s.epoch);
+            }
+        }
+    }
+
+    if safe_up_to < 0 {
+        return Ok(None);
+    }
+
+    Ok(Some((my_epoch, safe_up_to)))
+}
+
+/// Perform rerand catch-up and acquire the advisory lock.
+///
+/// 1. Computes the safe-to-apply watermark from `sync_result`.
+/// 2. If there are unapplied chunks, acquires `pg_advisory_lock(RERAND_APPLY_LOCK)`
+///    on a dedicated connection, then applies all unapplied chunks.
+/// 3. Returns the lock-holding connection (if the lock was acquired).
+///
+/// The caller **must** keep the returned connection alive until `load_iris_db`
+/// finishes, then call [`release_rerand_lock`] to release it.
+pub async fn rerand_catchup_and_lock(
+    pool: &PgPool,
+    schema_name: &str,
+    sync_result: &SyncResult,
+) -> Result<Option<PoolConnection<Postgres>>> {
+    let safe_up_to = match compute_rerand_safe_up_to(sync_result)? {
+        Some(v) => v,
+        None => return Ok(None),
+    };
+
+    let staging_schema = staging_schema_name(schema_name);
+    tracing::info!(
+        "Rerand catch-up: applying chunks up to {} for epoch {}",
+        safe_up_to.1,
+        safe_up_to.0
+    );
+
+    let mut conn = pool.acquire().await?;
+    sqlx::query("SELECT pg_advisory_lock($1)")
+        .bind(RERAND_APPLY_LOCK)
+        .execute(&mut *conn)
+        .await?;
+
+    let unapplied = get_unapplied_chunks(pool, safe_up_to.0, safe_up_to.1).await?;
+    for chunk_id in unapplied {
+        let rows =
+            apply_staging_chunk(pool, &staging_schema, safe_up_to.0, chunk_id).await?;
+        tracing::info!(
+            "Rerand catch-up: applied epoch {} chunk {} ({} rows)",
+            safe_up_to.0,
+            chunk_id,
+            rows
+        );
+    }
+
+    Ok(Some(conn))
+}
+
+/// Release the advisory lock acquired by [`rerand_catchup_and_lock`].
+pub async fn release_rerand_lock(
+    lock_conn: Option<PoolConnection<Postgres>>,
+) -> Result<()> {
+    if let Some(mut conn) = lock_conn {
+        sqlx::query("SELECT pg_advisory_unlock($1)")
+            .bind(RERAND_APPLY_LOCK)
+            .execute(&mut *conn)
+            .await?;
+        drop(conn);
+        tracing::info!("Rerand advisory lock released after DB load");
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use iris_mpc_common::config::CommonConfig;
+    use iris_mpc_common::helpers::sync::SyncState;
+
+    fn dummy_sync_state(epoch: i32, max_confirmed_chunk: i32) -> SyncState {
+        SyncState {
+            db_len: 100,
+            modifications: vec![],
+            next_sns_sequence_num: None,
+            common_config: CommonConfig::default(),
+            rerand_state: Some(RerandSyncState {
+                epoch,
+                max_confirmed_chunk,
+            }),
+        }
+    }
+
+    #[test]
+    fn test_compute_rerand_safe_up_to_same_epoch() {
+        let p0 = dummy_sync_state(1, 5);
+        let p1 = dummy_sync_state(1, 4);
+        let p2 = dummy_sync_state(1, 6);
+        let sync_result = SyncResult {
+            my_state: p0.clone(),
+            all_states: vec![p0, p1, p2],
+        };
+        assert_eq!(compute_rerand_safe_up_to(&sync_result).unwrap(), Some((1, 6)));
+    }
+
+    #[test]
+    fn test_compute_rerand_safe_up_to_peer_ahead() {
+        // I am on epoch 0, but peer is on epoch 1.
+        // This implies the peer has confirmed all my chunks for epoch 0.
+        let p0 = dummy_sync_state(0, 5);
+        let p1 = dummy_sync_state(1, 0); // ahead
+        let p2 = dummy_sync_state(0, 5);
+        let sync_result = SyncResult {
+            my_state: p0.clone(),
+            all_states: vec![p0, p1, p2],
+        };
+        assert_eq!(compute_rerand_safe_up_to(&sync_result).unwrap(), Some((0, i32::MAX)));
+    }
+
+    #[test]
+    fn test_compute_rerand_safe_up_to_peer_behind() {
+        // I am on epoch 1, but peer is on epoch 0.
+        // This implies the peer has not confirmed any chunks for epoch 1.
+        let p0 = dummy_sync_state(1, 2);
+        let p1 = dummy_sync_state(0, 10); // behind
+        let p2 = dummy_sync_state(1, 2);
+        let sync_result = SyncResult {
+            my_state: p0.clone(),
+            all_states: vec![p0, p1, p2],
+        };
+        assert_eq!(compute_rerand_safe_up_to(&sync_result).unwrap(), Some((1, 2)));
+    }
+    
+    #[test]
+    fn test_compute_rerand_safe_up_to_fatal_desync() {
+        // I am on epoch 1, but peer is on epoch 3 (difference > 1).
+        let p0 = dummy_sync_state(1, 2);
+        let p1 = dummy_sync_state(3, 10); // way ahead
+        let p2 = dummy_sync_state(1, 2);
+        let sync_result = SyncResult {
+            my_state: p0.clone(),
+            all_states: vec![p0, p1, p2],
+        };
+        assert!(compute_rerand_safe_up_to(&sync_result).is_err());
+    }
+}
diff --git a/iris-mpc-upgrade/Cargo.toml b/iris-mpc-upgrade/Cargo.toml
index 93eeb19589..e9f4c61b19 100644
--- a/iris-mpc-upgrade/Cargo.toml
+++ b/iris-mpc-upgrade/Cargo.toml
@@ -12,18 +12,24 @@ ark-bls12-381 = "0.5.0"
 ark-ff = "0.5.0"
 ark-ec = "0.5.0"
 ark-serialize = "0.5.0"
+aws-config.workspace = true
+aws-sdk-s3.workspace = true
+aws-sdk-secretsmanager.workspace = true
 axum.workspace = true
 iris-mpc-common = { path = "../iris-mpc-common" }
 iris-mpc-store = { path = "../iris-mpc-store" }
 clap = { workspace = true, features = ["env"] }
 eyre.workspace = true
 bytemuck.workspace = true
+base64.workspace = true
 serde.workspace = true
+serde_json.workspace = true
 serde-big-array = "0.5"
 tracing.workspace = true
 itertools.workspace = true
 rand.workspace = true
 rand_chacha = "0.3"
+sqlx.workspace = true
 tokio.workspace = true
 tracing-subscriber.workspace = true
 
@@ -36,11 +42,17 @@ prost = "0.13.3"
 sha2.workspace = true
 thiserror.workspace = true
 blake3 = "1.8.2"
+futures.workspace = true
+tokio-util.workspace = true
 
 
 [dev-dependencies]
 criterion = "0.5"
 rayon = "1.10.0"
+dotenvy.workspace = true
+
+[features]
+db_dependent = []
 
 
 [build-dependencies]
diff --git a/iris-mpc-upgrade/src/config.rs b/iris-mpc-upgrade/src/config.rs
index 3444eb3011..82ad7b347d 100644
--- a/iris-mpc-upgrade/src/config.rs
+++ b/iris-mpc-upgrade/src/config.rs
@@ -240,6 +240,7 @@ pub enum ReRandomizeDbSubCommand {
     KeyGen(KeyGenConfig),
     RerandomizeDb(ReRandomizeConfig),
     RerandomizeCheck(ReRandomizeCheckConfig),
+    RerandomizeContinuous(RerandomizeContinuousConfig),
 }
 
 #[derive(Args)]
@@ -341,3 +342,36 @@ pub struct ReRandomizeCheckConfig {
     #[clap(long, env = "NEW_SCHEMA_NAME_PARTY_2")]
     pub new_schema_name_party_2: String,
 }
+
+#[derive(Args, Debug)]
+pub struct RerandomizeContinuousConfig {
+    #[clap(long, env = "PARTY_ID")]
+    pub party_id: u8,
+
+    #[clap(long, env = "DB_URL")]
+    pub db_url: String,
+
+    #[clap(long, env = "ENVIRONMENT")]
+    pub env: String,
+
+    #[clap(long, env = "RERAND_S3_BUCKET")]
+    pub s3_bucket: String,
+
+    #[clap(long, env = "SCHEMA_NAME")]
+    pub schema_name: String,
+
+    #[clap(long, default_value = "10000", env = "CHUNK_SIZE")]
+    pub chunk_size: u64,
+
+    #[clap(long, default_value = "5", env = "CHUNK_DELAY_SECS")]
+    pub chunk_delay_secs: u64,
+
+    #[clap(long, default_value = "0", env = "SAFETY_BUFFER_IDS")]
+    pub safety_buffer_ids: u64,
+
+    #[clap(long, default_value = "5000", env = "S3_POLL_INTERVAL_MS")]
+    pub s3_poll_interval_ms: u64,
+
+    #[clap(long, default_value = "3000", env = "HEALTHCHECK_PORT")]
+    pub healthcheck_port: usize,
+}
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
new file mode 100644
index 0000000000..c948b5cd86
--- /dev/null
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -0,0 +1,295 @@
+use aws_sdk_s3::Client as S3Client;
+use aws_sdk_secretsmanager::Client as SecretsManagerClient;
+use bytemuck::cast_slice;
+use eyre::Result;
+use futures::TryStreamExt;
+use iris_mpc_store::rerand::{
+    apply_staging_chunk, ensure_staging_schema, get_rerand_progress, insert_staging_irises,
+    set_all_confirmed, set_staging_written, staging_schema_name, upsert_rerand_progress,
+    StagingIrisEntry, RERAND_APPLY_LOCK,
+};
+use iris_mpc_store::Store;
+use sqlx::PgPool;
+use std::time::Duration;
+use tokio::time::sleep;
+use tokio_util::sync::CancellationToken;
+
+use crate::config::RerandomizeContinuousConfig;
+use crate::epoch;
+use crate::rerandomization::randomize_iris;
+use crate::s3_coordination::{self, Manifest};
+
+/// Run the continuous rerandomization loop.
+///
+/// If `cancel` is provided, the loop checks for cancellation between chunk
+/// stages and exits cleanly with `Ok(())` when cancelled. Pass `None` for
+/// production use where the loop runs until the process is killed.
+pub async fn run_continuous_rerand(
+    config: &RerandomizeContinuousConfig,
+    s3: &S3Client,
+    sm: &SecretsManagerClient,
+    store: &Store,
+    cancel: Option<&CancellationToken>,
+) -> Result<()> {
+    let pool = &store.pool;
+    let staging_schema = staging_schema_name(&store.schema_name);
+    let poll_interval = Duration::from_millis(config.s3_poll_interval_ms);
+    let chunk_delay = Duration::from_secs(config.chunk_delay_secs);
+
+    ensure_staging_schema(pool, &staging_schema).await?;
+    tracing::info!("Staging schema ensured: {}", staging_schema);
+
+    loop {
+        if is_cancelled(cancel) {
+            return Ok(());
+        }
+
+        let active_epoch = epoch::determine_active_epoch(s3, &config.s3_bucket).await?;
+        tracing::info!("Active epoch: {}", active_epoch);
+
+        let shared_secret = epoch::derive_shared_secret(
+            sm,
+            s3,
+            &config.s3_bucket,
+            &config.env,
+            active_epoch,
+            config.party_id,
+            poll_interval,
+        )
+        .await?;
+
+        let manifest =
+            get_or_create_manifest(s3, store, config, active_epoch, poll_interval).await?;
+        tracing::info!(
+            "Epoch {} manifest: chunk_size={}, max_id_inclusive={}",
+            active_epoch,
+            manifest.chunk_size,
+            manifest.max_id_inclusive
+        );
+
+        let mut chunk_id: u32 = 0;
+        loop {
+            if is_cancelled(cancel) {
+                return Ok(());
+            }
+
+            if manifest.chunk_is_empty(chunk_id) {
+                break;
+            }
+
+            let progress =
+                get_rerand_progress(pool, active_epoch as i32, chunk_id as i32).await?;
+
+            if progress.as_ref().is_some_and(|p| p.live_applied) {
+                chunk_id += 1;
+                continue;
+            }
+
+            upsert_rerand_progress(pool, active_epoch as i32, chunk_id as i32).await?;
+
+            if !progress.as_ref().is_some_and(|p| p.staging_written) {
+                process_chunk_staging(
+                    pool,
+                    store,
+                    &staging_schema,
+                    &shared_secret,
+                    config.party_id,
+                    active_epoch,
+                    chunk_id,
+                    &manifest,
+                )
+                .await?;
+
+                set_staging_written(pool, active_epoch as i32, chunk_id as i32).await?;
+
+                s3_coordination::upload_chunk_staged(
+                    s3,
+                    &config.s3_bucket,
+                    active_epoch,
+                    config.party_id,
+                    chunk_id,
+                )
+                .await?;
+                tracing::info!(
+                    "Epoch {} chunk {}: staging written, S3 marker uploaded",
+                    active_epoch,
+                    chunk_id
+                );
+            }
+
+            if is_cancelled(cancel) {
+                return Ok(());
+            }
+
+            if !progress.as_ref().is_some_and(|p| p.all_confirmed) {
+                s3_coordination::poll_chunk_staged_all(
+                    s3,
+                    &config.s3_bucket,
+                    active_epoch,
+                    chunk_id,
+                    poll_interval,
+                )
+                .await?;
+
+                set_all_confirmed(pool, active_epoch as i32, chunk_id as i32).await?;
+                tracing::info!(
+                    "Epoch {} chunk {}: all parties confirmed",
+                    active_epoch,
+                    chunk_id
+                );
+            }
+
+            if is_cancelled(cancel) {
+                return Ok(());
+            }
+
+            let mut lock_conn = pool.acquire().await?;
+            sqlx::query("SELECT pg_advisory_lock($1)")
+                .bind(RERAND_APPLY_LOCK)
+                .execute(&mut *lock_conn)
+                .await?;
+
+            let rows =
+                apply_staging_chunk(pool, &staging_schema, active_epoch as i32, chunk_id as i32)
+                    .await?;
+            tracing::info!(
+                "Epoch {} chunk {}: applied to live DB ({} rows updated)",
+                active_epoch,
+                chunk_id,
+                rows
+            );
+
+            sqlx::query("SELECT pg_advisory_unlock($1)")
+                .bind(RERAND_APPLY_LOCK)
+                .execute(&mut *lock_conn)
+                .await?;
+            drop(lock_conn);
+
+            chunk_id += 1;
+
+            if chunk_delay > Duration::ZERO {
+                sleep(chunk_delay).await;
+            }
+        }
+
+        if chunk_id == 0 && chunk_delay > Duration::ZERO {
+            tracing::info!(
+                "Epoch {} is empty, sleeping to avoid spinning",
+                active_epoch
+            );
+            sleep(chunk_delay).await;
+        }
+
+        epoch::complete_epoch(
+            sm,
+            s3,
+            &config.s3_bucket,
+            &config.env,
+            active_epoch,
+            config.party_id,
+            poll_interval,
+        )
+        .await?;
+        tracing::info!("Epoch {} completed, moving to next epoch", active_epoch);
+    }
+}
+
+fn is_cancelled(cancel: Option<&CancellationToken>) -> bool {
+    cancel.is_some_and(|c| c.is_cancelled())
+}
+
+async fn get_or_create_manifest(
+    s3: &S3Client,
+    store: &Store,
+    config: &RerandomizeContinuousConfig,
+    epoch: u32,
+    poll_interval: Duration,
+) -> Result<Manifest> {
+    if s3_coordination::manifest_exists(s3, &config.s3_bucket, epoch).await? {
+        return s3_coordination::download_manifest(s3, &config.s3_bucket, epoch, poll_interval)
+            .await;
+    }
+
+    if config.party_id == 0 {
+        let local_max = store.get_max_serial_id().await? as u64;
+        s3_coordination::upload_max_id(s3, &config.s3_bucket, epoch, 0, local_max).await?;
+
+        let all_max_ids =
+            s3_coordination::download_all_max_ids(s3, &config.s3_bucket, epoch, poll_interval)
+                .await?;
+        let min_max = *all_max_ids.iter().min().unwrap();
+        let max_id_inclusive = min_max.saturating_sub(config.safety_buffer_ids);
+
+        let manifest = Manifest {
+            epoch,
+            chunk_size: config.chunk_size,
+            max_id_inclusive,
+        };
+        s3_coordination::upload_manifest(s3, &config.s3_bucket, epoch, &manifest).await?;
+        tracing::info!(
+            "Epoch {}: manifest created (max_id_inclusive={}, chunk_size={})",
+            epoch,
+            max_id_inclusive,
+            config.chunk_size
+        );
+        Ok(manifest)
+    } else {
+        let local_max = store.get_max_serial_id().await? as u64;
+        s3_coordination::upload_max_id(
+            s3,
+            &config.s3_bucket,
+            epoch,
+            config.party_id,
+            local_max,
+        )
+        .await?;
+
+        s3_coordination::download_manifest(s3, &config.s3_bucket, epoch, poll_interval).await
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+async fn process_chunk_staging(
+    pool: &PgPool,
+    store: &Store,
+    staging_schema: &str,
+    shared_secret: &[u8; 32],
+    party_id: u8,
+    epoch: u32,
+    chunk_id: u32,
+    manifest: &Manifest,
+) -> Result<()> {
+    let (start, end) = manifest.chunk_range(chunk_id);
+
+    let entries: Vec<_> = store
+        .stream_irises_in_range(start..end)
+        .try_collect()
+        .await?;
+
+    let staging_entries: Vec<StagingIrisEntry> = entries
+        .into_iter()
+        .map(|iris| {
+            let version_id = iris.version_id();
+            let iris_id = iris.id();
+            let (_, lc, lm, rc, rm) = randomize_iris(iris, shared_secret, party_id as usize);
+            StagingIrisEntry {
+                epoch: epoch as i32,
+                id: iris_id,
+                chunk_id: chunk_id as i32,
+                left_code: cast_slice::<u16, u8>(&lc.coefs).to_vec(),
+                left_mask: cast_slice::<u16, u8>(&lm.coefs).to_vec(),
+                right_code: cast_slice::<u16, u8>(&rc.coefs).to_vec(),
+                right_mask: cast_slice::<u16, u8>(&rm.coefs).to_vec(),
+                original_version_id: version_id,
+                rerand_epoch: (epoch + 1) as i32,
+            }
+        })
+        .collect();
+
+    const BATCH_SIZE: usize = 500;
+    for batch in staging_entries.chunks(BATCH_SIZE) {
+        insert_staging_irises(pool, staging_schema, batch).await?;
+    }
+
+    Ok(())
+}
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
new file mode 100644
index 0000000000..1cd79da13d
--- /dev/null
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -0,0 +1,223 @@
+use aws_sdk_s3::Client as S3Client;
+use aws_sdk_secretsmanager::Client as SecretsManagerClient;
+use base64::engine::general_purpose::STANDARD;
+use base64::Engine;
+use eyre::{eyre, Result};
+use std::time::Duration;
+
+use crate::s3_coordination;
+use crate::tripartite_dh;
+
+fn secret_id(env: &str, epoch: u32, party_id: u8) -> String {
+    format!(
+        "{}/iris-mpc-db-rerandomization/epoch-{}/private-key-party-{}",
+        env, epoch, party_id
+    )
+}
+
+/// Check if a private key for this epoch already exists in Secrets Manager.
+async fn load_private_key_from_sm(
+    sm: &SecretsManagerClient,
+    env: &str,
+    epoch: u32,
+    party_id: u8,
+) -> Result<Option<tripartite_dh::PrivateKey>> {
+    let sid = secret_id(env, epoch, party_id);
+    match sm
+        .get_secret_value()
+        .secret_id(&sid)
+        .version_stage("AWSCURRENT")
+        .send()
+        .await
+    {
+        Ok(output) => {
+            let b64 = output
+                .secret_string()
+                .ok_or_else(|| eyre!("Secret {} has no string value", sid))?;
+            let bytes = STANDARD.decode(b64)?;
+            let key = tripartite_dh::PrivateKey::deserialize(&bytes)
+                .map_err(|e| eyre!("Failed to deserialize private key from SM: {:?}", e))?;
+            Ok(Some(key))
+        }
+        Err(e) => {
+            let svc = e.into_service_error();
+            if svc.is_resource_not_found_exception() {
+                Ok(None)
+            } else {
+                Err(eyre!("SM GetSecretValue failed for {}: {}", sid, svc))
+            }
+        }
+    }
+}
+
+async fn save_private_key_to_sm(
+    sm: &SecretsManagerClient,
+    env: &str,
+    epoch: u32,
+    party_id: u8,
+    key: &tripartite_dh::PrivateKey,
+) -> Result<()> {
+    let sid = secret_id(env, epoch, party_id);
+    let b64 = STANDARD.encode(key.serialize());
+
+    match sm
+        .create_secret()
+        .name(&sid)
+        .secret_string(&b64)
+        .send()
+        .await
+    {
+        Ok(_) => Ok(()),
+        Err(e) => {
+            let svc = e.into_service_error();
+            if svc.is_resource_exists_exception() {
+                sm.put_secret_value()
+                    .secret_id(&sid)
+                    .secret_string(&b64)
+                    .send()
+                    .await
+                    .map_err(|e| eyre!("SM PutSecretValue failed for {}: {}", sid, e))?;
+                Ok(())
+            } else {
+                Err(eyre!("SM CreateSecret failed for {}: {}", sid, svc))
+            }
+        }
+    }
+}
+
+async fn delete_private_key_from_sm(
+    sm: &SecretsManagerClient,
+    env: &str,
+    epoch: u32,
+    party_id: u8,
+) -> Result<()> {
+    let sid = secret_id(env, epoch, party_id);
+    sm.delete_secret()
+        .secret_id(&sid)
+        .force_delete_without_recovery(true)
+        .send()
+        .await
+        .map_err(|e| eyre!("SM DeleteSecret failed for {}: {}", sid, e))?;
+    tracing::info!("Deleted epoch {} private key from SM", epoch);
+    Ok(())
+}
+
+/// Idempotent key generation for an epoch.
+///
+/// 1. Check SM for existing private key
+/// 2. If found: load it, derive public key, re-upload to S3 (covers crash between SM write and S3 upload)
+/// 3. If not found: generate new keypair, write to SM first, then upload public key to S3
+pub async fn idempotent_keygen(
+    sm: &SecretsManagerClient,
+    s3: &S3Client,
+    bucket: &str,
+    env: &str,
+    epoch: u32,
+    party_id: u8,
+) -> Result<tripartite_dh::PrivateKey> {
+    if let Some(existing) = load_private_key_from_sm(sm, env, epoch, party_id).await? {
+        tracing::info!(
+            "Epoch {}: private key found in SM, re-uploading public key to S3",
+            epoch
+        );
+        let public_key = existing.public_key();
+        let pk_b64 = STANDARD.encode(public_key.serialize());
+        s3_coordination::upload_public_key(s3, bucket, epoch, party_id, &pk_b64).await?;
+        return Ok(existing);
+    }
+
+    tracing::info!(
+        "Epoch {}: generating fresh BLS12-381 keypair for party {}",
+        epoch,
+        party_id
+    );
+    let mut rng = rand::rngs::OsRng;
+    let private_key = tripartite_dh::PrivateKey::random(&mut rng);
+
+    save_private_key_to_sm(sm, env, epoch, party_id, &private_key).await?;
+
+    let public_key = private_key.public_key();
+    let pk_b64 = STANDARD.encode(public_key.serialize());
+    s3_coordination::upload_public_key(s3, bucket, epoch, party_id, &pk_b64).await?;
+
+    Ok(private_key)
+}
+
+/// Derive the shared secret for an epoch: keygen + download peer keys + BLS pairing.
+pub async fn derive_shared_secret(
+    sm: &SecretsManagerClient,
+    s3: &S3Client,
+    bucket: &str,
+    env: &str,
+    epoch: u32,
+    party_id: u8,
+    poll_interval: Duration,
+) -> Result<[u8; 32]> {
+    let private_key = idempotent_keygen(sm, s3, bucket, env, epoch, party_id).await?;
+
+    let next_id = (party_id + 1) % 3;
+    let prev_id = (party_id + 2) % 3;
+
+    let pk_next_b64 =
+        s3_coordination::download_public_key_for_party(s3, bucket, epoch, next_id, poll_interval)
+            .await?;
+    let pk_next = tripartite_dh::PublicKeys::deserialize(&STANDARD.decode(&pk_next_b64)?)
+        .map_err(|e| eyre!("Failed to deserialize public key for party {}: {:?}", next_id, e))?;
+
+    let pk_prev_b64 =
+        s3_coordination::download_public_key_for_party(s3, bucket, epoch, prev_id, poll_interval)
+            .await?;
+    let pk_prev = tripartite_dh::PublicKeys::deserialize(&STANDARD.decode(&pk_prev_b64)?)
+        .map_err(|e| eyre!("Failed to deserialize public key for party {}: {:?}", prev_id, e))?;
+
+    let shared_secret = private_key.derive_shared_secret(&pk_next, &pk_prev);
+    let hash = blake3::hash(&shared_secret);
+    tracing::info!(
+        "Epoch {}: derived shared secret (blake3 fingerprint: {})",
+        epoch,
+        hash.to_hex()
+    );
+    Ok(shared_secret)
+}
+
+/// Determine the active epoch by scanning S3 for the highest epoch with a
+/// manifest but without all three `complete` markers. Falls back to 0 if
+/// no epochs exist.
+pub async fn determine_active_epoch(s3: &S3Client, bucket: &str) -> Result<u32> {
+    let mut epoch: u32 = 0;
+    loop {
+        if !s3_coordination::manifest_exists(s3, bucket, epoch).await? {
+            break;
+        }
+        if s3_coordination::all_parties_complete(s3, bucket, epoch).await? {
+            epoch += 1;
+            continue;
+        }
+        return Ok(epoch);
+    }
+    Ok(epoch)
+}
+
+/// Upload completion marker, poll for all three, then delete the epoch key from SM.
+pub async fn complete_epoch(
+    sm: &SecretsManagerClient,
+    s3: &S3Client,
+    bucket: &str,
+    env: &str,
+    epoch: u32,
+    party_id: u8,
+    poll_interval: Duration,
+) -> Result<()> {
+    s3_coordination::upload_epoch_complete(s3, bucket, epoch, party_id).await?;
+    tracing::info!(
+        "Epoch {}: uploaded completion marker for party {}",
+        epoch,
+        party_id
+    );
+
+    s3_coordination::poll_epoch_complete_all(s3, bucket, epoch, poll_interval).await?;
+    tracing::info!("Epoch {}: all parties completed", epoch);
+
+    delete_private_key_from_sm(sm, env, epoch, party_id).await?;
+    Ok(())
+}
diff --git a/iris-mpc-upgrade/src/lib.rs b/iris-mpc-upgrade/src/lib.rs
index 2ab2e2018d..7d32241d74 100644
--- a/iris-mpc-upgrade/src/lib.rs
+++ b/iris-mpc-upgrade/src/lib.rs
@@ -8,10 +8,13 @@ use std::{
 };
 
 pub mod config;
+pub mod continuous_rerand;
+pub mod epoch;
 pub mod packets;
 pub mod proto;
 pub mod rerandomization;
 pub mod reshare;
+pub mod s3_coordination;
 pub mod tripartite_dh;
 pub mod utils;
 
diff --git a/iris-mpc-upgrade/src/rerandomization.rs b/iris-mpc-upgrade/src/rerandomization.rs
index 3b1e93ea3a..40db34136c 100644
--- a/iris-mpc-upgrade/src/rerandomization.rs
+++ b/iris-mpc-upgrade/src/rerandomization.rs
@@ -1,10 +1,12 @@
 use std::io::Read;
 
 use iris_mpc_common::{
-    galois::degree4::{basis::Monomial, GaloisRingElement},
+    galois::degree4::{basis::Monomial, GaloisRingElement, ShamirGaloisRingShare},
     galois_engine::degree4::{GaloisRingIrisCodeShare, GaloisRingTrimmedMaskCodeShare},
+    id::PartyID,
 };
 use iris_mpc_store::DbStoredIris;
+use itertools::Itertools;
 
 pub fn randomize_iris(
     iris: DbStoredIris,
@@ -87,6 +89,61 @@ fn randomize_galois_ring_coefs(coefs: &mut [u16], xof: &mut blake3::OutputReader
     }
 }
 
+/// Reconstruct the plaintext from 3 Shamir shares using Lagrange interpolation.
+/// Verifies consistency by reconstructing from all 3 pairs (0-1, 1-2, 0-2) and
+/// asserting they agree.
+pub fn reconstruct_shares(share0: &[u16], share1: &[u16], share2: &[u16]) -> Vec<u16> {
+    let lag_01 =
+        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID1);
+    let lag_10 =
+        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID0);
+    let lag_02 =
+        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID2);
+    let lag_20 =
+        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID0);
+    let lag_12 =
+        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID2);
+    let lag_21 =
+        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID1);
+
+    assert!(share0.len() == share1.len() && share1.len() == share2.len());
+
+    let recon01 = share0
+        .chunks_exact(4)
+        .zip_eq(share1.chunks_exact(4))
+        .flat_map(|(a, b)| {
+            let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
+            let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
+            let c = a * lag_01 + b * lag_10;
+            c.coefs
+        })
+        .collect_vec();
+    let recon12 = share1
+        .chunks_exact(4)
+        .zip_eq(share2.chunks_exact(4))
+        .flat_map(|(a, b)| {
+            let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
+            let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
+            let c = a * lag_12 + b * lag_21;
+            c.coefs
+        })
+        .collect_vec();
+    let recon02 = share0
+        .chunks_exact(4)
+        .zip_eq(share2.chunks_exact(4))
+        .flat_map(|(a, b)| {
+            let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
+            let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
+            let c = a * lag_02 + b * lag_20;
+            c.coefs
+        })
+        .collect_vec();
+
+    assert_eq!(recon01, recon12);
+    assert_eq!(recon01, recon02);
+    recon01
+}
+
 #[cfg(test)]
 mod tests {
     use iris_mpc_common::{
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
new file mode 100644
index 0000000000..f8ccfe4713
--- /dev/null
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -0,0 +1,279 @@
+use aws_sdk_s3::Client as S3Client;
+use eyre::{eyre, Result};
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+use tokio::time::sleep;
+
+const NUM_PARTIES: u8 = 3;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Manifest {
+    pub epoch: u32,
+    pub chunk_size: u64,
+    pub max_id_inclusive: u64,
+}
+
+impl Manifest {
+    pub fn num_chunks(&self) -> u32 {
+        if self.max_id_inclusive == 0 {
+            return 0;
+        }
+        self.max_id_inclusive.div_ceil(self.chunk_size) as u32
+    }
+
+    /// Returns (start_id_inclusive, end_id_exclusive) for a given chunk_id.
+    /// IDs are 1-based.
+    pub fn chunk_range(&self, chunk_id: u32) -> (u64, u64) {
+        let start = 1 + (chunk_id as u64) * self.chunk_size;
+        let end = std::cmp::min(start + self.chunk_size, self.max_id_inclusive + 1);
+        (start, end)
+    }
+
+    pub fn chunk_is_empty(&self, chunk_id: u32) -> bool {
+        let (start, end) = self.chunk_range(chunk_id);
+        start >= end
+    }
+}
+
+fn epoch_party_prefix(epoch: u32, party: u8) -> String {
+    format!("rerand/epoch-{}/party-{}", epoch, party)
+}
+
+pub async fn upload_marker(
+    s3: &S3Client,
+    bucket: &str,
+    key: &str,
+    body: Vec<u8>,
+) -> Result<()> {
+    s3.put_object()
+        .bucket(bucket)
+        .key(key)
+        .body(body.into())
+        .send()
+        .await
+        .map_err(|e| eyre!("S3 PutObject failed for key {}: {}", key, e))?;
+    Ok(())
+}
+
+pub async fn marker_exists(s3: &S3Client, bucket: &str, key: &str) -> Result<bool> {
+    match s3.head_object().bucket(bucket).key(key).send().await {
+        Ok(_) => Ok(true),
+        Err(e) => {
+            let svc_err = e.into_service_error();
+            if svc_err.is_not_found() {
+                Ok(false)
+            } else {
+                Err(eyre!("S3 HeadObject failed for key {}: {}", key, svc_err))
+            }
+        }
+    }
+}
+
+pub async fn download_marker(s3: &S3Client, bucket: &str, key: &str) -> Result<Vec<u8>> {
+    let resp = s3
+        .get_object()
+        .bucket(bucket)
+        .key(key)
+        .send()
+        .await
+        .map_err(|e| eyre!("S3 GetObject failed for key {}: {}", key, e))?;
+    let bytes = resp
+        .body
+        .collect()
+        .await
+        .map_err(|e| eyre!("Failed to read S3 body for key {}: {}", key, e))?;
+    Ok(bytes.to_vec())
+}
+
+pub async fn poll_until_marker_exists(
+    s3: &S3Client,
+    bucket: &str,
+    key: &str,
+    poll_interval: Duration,
+) -> Result<()> {
+    loop {
+        if marker_exists(s3, bucket, key).await? {
+            return Ok(());
+        }
+        tracing::debug!("Waiting for S3 marker: {}", key);
+        sleep(poll_interval).await;
+    }
+}
+
+/// Polls until all three parties have uploaded a given marker suffix for an epoch.
+pub async fn poll_until_all_parties_marker(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    marker_suffix: &str,
+    poll_interval: Duration,
+) -> Result<()> {
+    loop {
+        let mut all_present = true;
+        for party in 0..NUM_PARTIES {
+            let key = format!("{}/{}", epoch_party_prefix(epoch, party), marker_suffix);
+            if !marker_exists(s3, bucket, &key).await? {
+                all_present = false;
+                break;
+            }
+        }
+        if all_present {
+            return Ok(());
+        }
+        tracing::debug!(
+            "Waiting for all parties' {} markers for epoch {}",
+            marker_suffix,
+            epoch
+        );
+        sleep(poll_interval).await;
+    }
+}
+
+// ---- Public key ----
+
+pub async fn upload_public_key(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+    key_b64: &str,
+) -> Result<()> {
+    let key = format!("{}/public-key", epoch_party_prefix(epoch, party));
+    upload_marker(s3, bucket, &key, key_b64.as_bytes().to_vec()).await
+}
+
+pub async fn download_public_key_for_party(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+    poll_interval: Duration,
+) -> Result<String> {
+    let key = format!("{}/public-key", epoch_party_prefix(epoch, party));
+    poll_until_marker_exists(s3, bucket, &key, poll_interval).await?;
+    let bytes = download_marker(s3, bucket, &key).await?;
+    Ok(String::from_utf8(bytes)?)
+}
+
+// ---- Max ID watermark ----
+
+pub async fn upload_max_id(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+    max_id: u64,
+) -> Result<()> {
+    let key = format!("{}/max-id", epoch_party_prefix(epoch, party));
+    upload_marker(s3, bucket, &key, max_id.to_string().into_bytes()).await
+}
+
+pub async fn download_all_max_ids(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    poll_interval: Duration,
+) -> Result<[u64; 3]> {
+    let mut ids = [0u64; 3];
+    for party in 0..NUM_PARTIES {
+        let key = format!("{}/max-id", epoch_party_prefix(epoch, party));
+        poll_until_marker_exists(s3, bucket, &key, poll_interval).await?;
+        let bytes = download_marker(s3, bucket, &key).await?;
+        let s = String::from_utf8(bytes)?;
+        ids[party as usize] = s
+            .trim()
+            .parse()
+            .map_err(|e| eyre!("Failed to parse max-id from party {}: {}", party, e))?;
+    }
+    Ok(ids)
+}
+
+// ---- Manifest ----
+
+pub async fn upload_manifest(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    manifest: &Manifest,
+) -> Result<()> {
+    let key = format!("{}/manifest.json", epoch_party_prefix(epoch, 0));
+    let body = serde_json::to_vec(manifest)?;
+    upload_marker(s3, bucket, &key, body).await
+}
+
+pub async fn download_manifest(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    poll_interval: Duration,
+) -> Result<Manifest> {
+    let key = format!("{}/manifest.json", epoch_party_prefix(epoch, 0));
+    poll_until_marker_exists(s3, bucket, &key, poll_interval).await?;
+    let bytes = download_marker(s3, bucket, &key).await?;
+    let manifest: Manifest = serde_json::from_slice(&bytes)?;
+    Ok(manifest)
+}
+
+pub async fn manifest_exists(s3: &S3Client, bucket: &str, epoch: u32) -> Result<bool> {
+    let key = format!("{}/manifest.json", epoch_party_prefix(epoch, 0));
+    marker_exists(s3, bucket, &key).await
+}
+
+// ---- Chunk staged markers ----
+
+pub async fn upload_chunk_staged(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+    chunk_id: u32,
+) -> Result<()> {
+    let key = format!(
+        "{}/chunk-{}/staged",
+        epoch_party_prefix(epoch, party),
+        chunk_id
+    );
+    upload_marker(s3, bucket, &key, b"ok".to_vec()).await
+}
+
+pub async fn poll_chunk_staged_all(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    chunk_id: u32,
+    poll_interval: Duration,
+) -> Result<()> {
+    let suffix = format!("chunk-{}/staged", chunk_id);
+    poll_until_all_parties_marker(s3, bucket, epoch, &suffix, poll_interval).await
+}
+
+// ---- Epoch completion ----
+
+pub async fn upload_epoch_complete(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+) -> Result<()> {
+    let key = format!("{}/complete", epoch_party_prefix(epoch, party));
+    upload_marker(s3, bucket, &key, b"done".to_vec()).await
+}
+
+pub async fn all_parties_complete(s3: &S3Client, bucket: &str, epoch: u32) -> Result<bool> {
+    for party in 0..NUM_PARTIES {
+        let key = format!("{}/complete", epoch_party_prefix(epoch, party));
+        if !marker_exists(s3, bucket, &key).await? {
+            return Ok(false);
+        }
+    }
+    Ok(true)
+}
+
+pub async fn poll_epoch_complete_all(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    poll_interval: Duration,
+) -> Result<()> {
+    poll_until_all_parties_marker(s3, bucket, epoch, "complete", poll_interval).await
+}
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
new file mode 100644
index 0000000000..1b92b96571
--- /dev/null
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -0,0 +1,312 @@
+#![cfg(feature = "db_dependent")]
+
+mod test_utils;
+
+use eyre::Result;
+use std::sync::Mutex;
+use test_utils::*;
+
+const STACK_SIZE: usize = 16 * 1024 * 1024;
+
+/// Tests share 3 Postgres instances and a global advisory lock constant, so
+/// they must run sequentially. This mutex enforces that even without
+/// `--test-threads=1`.
+static SERIAL: Mutex<()> = Mutex::new(());
+
+fn run_async(f: impl std::future::Future<Output = Result<()>> + Send + 'static) {
+    let _guard = SERIAL.lock().unwrap_or_else(|e| e.into_inner());
+    let result = std::thread::Builder::new()
+        .stack_size(STACK_SIZE)
+        .name("e2e".into())
+        .spawn(move || {
+            tokio::runtime::Builder::new_multi_thread()
+                .worker_threads(4)
+                .thread_stack_size(STACK_SIZE)
+                .enable_all()
+                .build()
+                .unwrap()
+                .block_on(f)
+        })
+        .unwrap()
+        .join()
+        .unwrap();
+    result.unwrap();
+}
+
+// ============================================================================
+// Phase 1: Clean epoch -- run one full epoch, verify crypto correctness
+// ============================================================================
+
+#[test]
+fn phase1_clean_epoch() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 1] Clean epoch...");
+
+        let (h, t) = env.spawn_all();
+        wait_epoch_done(&env.harness, 0).await?;
+        stop_all(t, h).await;
+
+        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        assert!(ep >= 1, "Expected rerand_epoch >= 1, got {}", ep);
+        verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+        println!("[phase 1] PASSED (epoch={})", ep);
+
+        env.teardown().await
+    });
+}
+
+// ============================================================================
+// Phase 2: Kill-and-resume -- kill mid-epoch, restart, verify recovery
+// ============================================================================
+
+#[test]
+fn phase2_kill_and_resume() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 2] Kill-and-resume...");
+
+        // Run epoch 0, let 2 chunks stage, then kill
+        let (h, t) = env.spawn_all();
+        wait_chunks_staged(&env.harness, 0, 2).await?;
+        println!("[phase 2]   killing after 2 chunks staged");
+        stop_all(t, h).await;
+
+        // Restart -- should resume from where it left off
+        println!("[phase 2]   restarting...");
+        let (h, t) = env.spawn_all();
+        wait_epoch_done(&env.harness, 0).await?;
+        stop_all(t, h).await;
+
+        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        assert!(ep >= 1);
+        verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+        println!("[phase 2] PASSED (epoch={})", ep);
+
+        env.teardown().await
+    });
+}
+
+// ============================================================================
+// Phase 3: Concurrent modifications -- bump version_id mid-epoch, verify
+//           optimistic lock skips those rows
+// ============================================================================
+
+#[test]
+fn phase3_concurrent_modifications() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        let modified_ids: Vec<i64> = vec![5, 10, 15];
+        println!("[phase 3] Concurrent modifications...");
+
+        let (h, t) = env.spawn_all();
+        wait_chunks_staged(&env.harness, 0, 1).await?;
+
+        // Bump version_id on a few rows (simulates a reauth)
+        for &id in &modified_ids {
+            for party in &env.harness.parties {
+                sqlx::query("UPDATE irises SET left_code = left_code WHERE id = $1")
+                    .bind(id)
+                    .execute(&party.store.pool)
+                    .await?;
+            }
+        }
+        println!("[phase 3]   bumped version_id on {:?}", modified_ids);
+
+        wait_epoch_done(&env.harness, 0).await?;
+        stop_all(t, h).await;
+
+        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        assert!(ep >= 1);
+        verify_fingerprints(&env.harness, &env.fingerprints, &modified_ids).await?;
+        println!("[phase 3] PASSED (epoch={})", ep);
+
+        env.teardown().await
+    });
+}
+
+// ============================================================================
+// Phase 4: Server restart during rerand -- simulate main server startup while
+//           rerand is running, verify advisory lock serializes access
+// ============================================================================
+
+#[test]
+fn phase4_server_restart_during_rerand() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 4] Server restart during rerand...");
+
+        let (h, t) = env.spawn_all();
+        wait_chunks_staged(&env.harness, 0, 1).await?;
+
+        for p in 0..NUM_PARTIES {
+            let r = simulate_server_startup(&env.harness, p).await;
+            println!("[phase 4]   party {} server startup: {:?}", p, r.is_ok());
+        }
+
+        wait_epoch_done(&env.harness, 0).await?;
+        stop_all(t, h).await;
+
+        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        assert!(ep >= 1);
+        verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+        println!("[phase 4] PASSED (epoch={})", ep);
+
+        env.teardown().await
+    });
+}
+
+// ============================================================================
+// Phase 5: Staggered restart -- kill one party mid-epoch, restart it, verify
+//           it catches up and the epoch completes
+// ============================================================================
+
+#[test]
+fn phase5_staggered_restart() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 5] Staggered restart...");
+
+        let (h, t) = env.spawn_all();
+        wait_chunks_staged(&env.harness, 0, 2).await?;
+
+        // Kill party 0
+        println!("[phase 5]   killing party 0 after 2 chunks");
+        t[0].cancel();
+        h[0].abort();
+
+        // Immediately restart party 0
+        println!("[phase 5]   restarting party 0");
+        let (h0, t0) = env.spawn_rerand(0);
+
+        wait_epoch_done(&env.harness, 0).await?;
+
+        t0.cancel();
+        h0.abort();
+        let _ = h0.await;
+        stop_all(t, h).await;
+
+        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        assert!(ep >= 1);
+        verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+        println!("[phase 5] PASSED (epoch={})", ep);
+
+        env.teardown().await
+    });
+}
+
+// ============================================================================
+// Phase 6: Multiple Epochs -- let the system run continuously across multiple
+//           epochs, verify seamless transition and correct rerandomization
+// ============================================================================
+
+#[test]
+fn phase6_multiple_epochs() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 6] Multiple epochs...");
+
+        let (h, t) = env.spawn_all();
+        
+        // Wait for epoch 0 to finish
+        wait_epoch_done(&env.harness, 0).await?;
+        println!("[phase 6]   epoch 0 completed");
+
+        // The continuous rerand servers should automatically move to epoch 1
+        wait_epoch_done(&env.harness, 1).await?;
+        println!("[phase 6]   epoch 1 completed");
+
+        stop_all(t, h).await;
+
+        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        assert!(ep >= 2, "Expected rerand_epoch >= 2, got {}", ep);
+        verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+        println!("[phase 6] PASSED (epoch={})", ep);
+
+        env.teardown().await
+    });
+}
+
+// ============================================================================
+// Phase 7: Epoch boundary desync -- simulate epoch mismatch
+// ============================================================================
+
+#[test]
+fn phase7_epoch_boundary_desync() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 7] Epoch boundary desync...");
+
+        // Setup the exact boundary desync state in DB manually to test catch-up logic
+        // P1 is on Epoch 0 (has max epoch 0)
+        // P0 and P2 are on Epoch 1 (have max epoch 1)
+        for p in 0..NUM_PARTIES {
+            let pool = &env.harness.parties[p].store.pool;
+            // Everyone completes Epoch 0
+            sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (0, 0, TRUE, TRUE, TRUE)")
+                .execute(pool).await.unwrap();
+        }
+
+        // P0 and P2 move to Epoch 1
+        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (1, 0, TRUE, TRUE, FALSE)")
+            .execute(&env.harness.parties[0].store.pool).await.unwrap();
+        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (1, 0, TRUE, TRUE, FALSE)")
+            .execute(&env.harness.parties[2].store.pool).await.unwrap();
+
+        // Now simulate P1 main server startup (P1 is behind on Epoch 0)
+        // Should catch up using safe_up_to = i32::MAX
+        let r1 = simulate_server_startup(&env.harness, 1).await;
+        assert!(r1.is_ok(), "P1 startup failed during epoch mismatch");
+
+        // Now simulate P0 main server startup (P0 is ahead on Epoch 1)
+        // Should catch up using safe_up_to = -1 (nobody confirmed Epoch 1 yet since P1 hasn't started it)
+        let r0 = simulate_server_startup(&env.harness, 0).await;
+        assert!(r0.is_ok(), "P0 startup failed during epoch mismatch");
+
+        println!("[phase 7] PASSED");
+
+        env.teardown().await
+    });
+}
+
+// ============================================================================
+// Phase 8: Disallow loading mismatched peers
+// ============================================================================
+
+#[test]
+fn phase8_reject_desync() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 8] Reject desync...");
+
+        // Setup the exact boundary desync state in DB manually
+        // P1 is on Epoch 0 (has max epoch 0)
+        // P0 and P2 are on Epoch 2 (have max epoch 2)
+        // If a peer is *more than 1 epoch ahead*, we should panic/reject
+        for p in 0..NUM_PARTIES {
+            let pool = &env.harness.parties[p].store.pool;
+            sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (0, 0, TRUE, TRUE, TRUE)")
+                .execute(pool).await.unwrap();
+        }
+
+        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (2, 0, TRUE, TRUE, FALSE)")
+            .execute(&env.harness.parties[0].store.pool).await.unwrap();
+        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (2, 0, TRUE, TRUE, FALSE)")
+            .execute(&env.harness.parties[2].store.pool).await.unwrap();
+
+        let r1 = simulate_server_startup(&env.harness, 1).await;
+        assert!(r1.is_err(), "P1 startup should have failed due to large epoch gap");
+
+        println!("[phase 8] PASSED");
+
+        env.teardown().await
+    });
+}
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
new file mode 100644
index 0000000000..e23cd3869b
--- /dev/null
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -0,0 +1,371 @@
+#![allow(dead_code)]
+
+use eyre::Result;
+use iris_mpc_common::{
+    config::CommonConfig,
+    galois_engine::degree4::FullGaloisRingIrisCodeShare,
+    helpers::sync::{SyncResult, SyncState},
+    iris_db::iris::IrisCode,
+    postgres::{AccessMode, PostgresClient},
+};
+use iris_mpc_store::rerand::{self as rerand_store};
+use iris_mpc_store::{Store, StoredIrisRef};
+use iris_mpc_upgrade::config::RerandomizeContinuousConfig;
+use iris_mpc_upgrade::continuous_rerand::run_continuous_rerand;
+use iris_mpc_upgrade::rerandomization::reconstruct_shares;
+use rand::{rngs::StdRng, SeedableRng};
+use std::collections::HashMap;
+use std::time::Duration;
+use tokio_util::sync::CancellationToken;
+
+pub const NUM_PARTIES: usize = 3;
+pub const DB_SIZE: usize = 50;
+pub const CHUNK_SIZE: u64 = 25;
+
+fn db_urls() -> Vec<String> {
+    (0..3)
+        .map(|i| format!("postgres://postgres:postgres@localhost:{}", 6200 + i))
+        .collect()
+}
+
+pub struct PartyDb {
+    pub store: Store,
+    pub schema_name: String,
+}
+
+pub struct TestHarness {
+    pub parties: Vec<PartyDb>,
+}
+
+impl TestHarness {
+    pub async fn new(db_urls: &[&str], schema_prefix: &str) -> Result<Self> {
+        let mut parties = Vec::new();
+        for (i, url) in db_urls.iter().enumerate() {
+            let schema = format!("{}_{}", schema_prefix, i);
+            let pg = PostgresClient::new(url, &schema, AccessMode::ReadWrite).await?;
+            let store = Store::new(&pg).await?;
+            rerand_store::ensure_staging_schema(
+                &store.pool,
+                &rerand_store::staging_schema_name(&schema),
+            )
+            .await?;
+            parties.push(PartyDb {
+                store,
+                schema_name: schema,
+            });
+        }
+        Ok(Self { parties })
+    }
+
+    pub fn store(&self, party: usize) -> &Store {
+        &self.parties[party].store
+    }
+}
+
+/// Full test environment: harness + AWS clients + unique prefix + unique S3 bucket.
+pub struct TestEnv {
+    pub harness: TestHarness,
+    pub s3: aws_sdk_s3::Client,
+    pub sm: aws_sdk_secretsmanager::Client,
+    pub prefix: String,
+    pub bucket: String,
+    pub fingerprints: PlaintextFingerprints,
+}
+
+impl TestEnv {
+    pub async fn setup() -> Result<Self> {
+        let id = rand::random::<u32>();
+        let prefix = format!("SMPC_e2e_{}", id);
+        let bucket = format!("rerand-e2e-{}", id);
+        let urls = db_urls();
+        let url_refs: Vec<&str> = urls.iter().map(|s| s.as_str()).collect();
+        let harness = TestHarness::new(&url_refs, &prefix).await?;
+
+        let sdk = aws_config::from_env().load().await;
+        let s3 = aws_sdk_s3::Client::new(&sdk);
+        let sm = aws_sdk_secretsmanager::Client::new(&sdk);
+
+        s3.create_bucket().bucket(&bucket).send().await
+            .map_err(|e| eyre::eyre!("Failed to create bucket {}: {}", bucket, e))?;
+
+        println!("  [setup] Seeding {} irises (prefix={}, bucket={})", DB_SIZE, prefix, bucket);
+        seed_three_party_db(&harness, DB_SIZE).await?;
+        let fingerprints = snapshot_all_fingerprints(&harness).await?;
+
+        Ok(Self { harness, s3, sm, prefix, bucket, fingerprints })
+    }
+
+    pub async fn teardown(&self) -> Result<()> {
+        cleanup(&self.harness).await?;
+        // Delete all objects in the bucket then delete the bucket
+        let mut token = None;
+        loop {
+            let mut req = self.s3.list_objects_v2().bucket(&self.bucket);
+            if let Some(t) = &token { req = req.continuation_token(t); }
+            let resp = req.send().await?;
+            for obj in resp.contents() {
+                if let Some(key) = obj.key() {
+                    self.s3.delete_object().bucket(&self.bucket).key(key).send().await?;
+                }
+            }
+            if resp.is_truncated() == Some(true) {
+                token = resp.next_continuation_token().map(|s| s.to_string());
+            } else { break; }
+        }
+        let _ = self.s3.delete_bucket().bucket(&self.bucket).send().await;
+        Ok(())
+    }
+
+    pub fn make_config(&self, party_id: u8) -> RerandomizeContinuousConfig {
+        RerandomizeContinuousConfig {
+            party_id,
+            db_url: format!(
+                "postgres://postgres:postgres@localhost:{}",
+                6200 + party_id as u16
+            ),
+            env: "testing".to_string(),
+            s3_bucket: self.bucket.clone(),
+            schema_name: format!("{}_{}", self.prefix, party_id),
+            chunk_size: CHUNK_SIZE,
+            chunk_delay_secs: 0,
+            safety_buffer_ids: 0,
+            s3_poll_interval_ms: 200,
+            healthcheck_port: 3020 + party_id as usize,
+        }
+    }
+
+    pub fn spawn_rerand(&self, party_id: u8) -> (tokio::task::JoinHandle<Result<()>>, CancellationToken) {
+        let config = self.make_config(party_id);
+        let s3 = self.s3.clone();
+        let sm = self.sm.clone();
+        let store = self.harness.store(party_id as usize).clone();
+        let token = CancellationToken::new();
+        let tc = token.clone();
+        let h = tokio::spawn(async move {
+            run_continuous_rerand(&config, &s3, &sm, &store, Some(&tc)).await
+        });
+        (h, token)
+    }
+
+    pub fn spawn_all(&self) -> (Vec<tokio::task::JoinHandle<Result<()>>>, Vec<CancellationToken>) {
+        let mut handles = Vec::new();
+        let mut tokens = Vec::new();
+        for p in 0u8..3 {
+            let (h, t) = self.spawn_rerand(p);
+            handles.push(h);
+            tokens.push(t);
+        }
+        (handles, tokens)
+    }
+}
+
+pub async fn stop_all(
+    tokens: Vec<CancellationToken>,
+    handles: Vec<tokio::task::JoinHandle<Result<()>>>,
+) {
+    for t in &tokens { t.cancel(); }
+    for h in &handles { h.abort(); }
+    for h in handles { let _ = h.await; }
+}
+
+// ---- DB seeding ----
+
+pub async fn seed_three_party_db(harness: &TestHarness, count: usize) -> Result<()> {
+    let mut rng = StdRng::seed_from_u64(42);
+    for chunk_start in (1..=count).step_by(100) {
+        let chunk_end = std::cmp::min(chunk_start + 100, count + 1);
+
+        struct S { id: i64, lc: Vec<u16>, lm: Vec<u16>, rc: Vec<u16>, rm: Vec<u16> }
+
+        let mut party_data: Vec<Vec<S>> = (0..NUM_PARTIES).map(|_| Vec::new()).collect();
+        for serial_id in chunk_start..chunk_end {
+            let il = IrisCode::random_rng(&mut rng);
+            let ir = IrisCode::random_rng(&mut rng);
+            let [l0, l1, l2] = FullGaloisRingIrisCodeShare::encode_iris_code(&il, &mut rng);
+            let [r0, r1, r2] = FullGaloisRingIrisCodeShare::encode_iris_code(&ir, &mut rng);
+            for (pi, (left, right)) in [(l0, r0), (l1, r1), (l2, r2)].into_iter().enumerate() {
+                party_data[pi].push(S {
+                    id: serial_id as i64,
+                    lc: left.code.coefs.to_vec(), lm: left.mask.coefs.to_vec(),
+                    rc: right.code.coefs.to_vec(), rm: right.mask.coefs.to_vec(),
+                });
+            }
+        }
+        for (pi, shares) in party_data.iter().enumerate() {
+            let refs: Vec<StoredIrisRef> = shares.iter().map(|s| StoredIrisRef {
+                id: s.id, left_code: &s.lc, left_mask: &s.lm,
+                right_code: &s.rc, right_mask: &s.rm,
+            }).collect();
+            let store = harness.store(pi);
+            let mut tx = store.tx().await?;
+            store.insert_irises_overriding(&mut tx, &refs).await?;
+            tx.commit().await?;
+        }
+    }
+    Ok(())
+}
+
+// ---- Fingerprint verification ----
+
+/// blake3 hash of the concatenated reconstructed plaintext (left_code ++ left_mask
+/// ++ right_code ++ right_mask) for every iris ID.
+pub type PlaintextFingerprints = HashMap<i64, [u8; 32]>;
+
+/// Compute a fingerprint for every iris in the DB by reconstructing shares
+/// from all 3 parties.
+pub async fn snapshot_all_fingerprints(harness: &TestHarness) -> Result<PlaintextFingerprints> {
+    let ids: Vec<(i64,)> = sqlx::query_as("SELECT id FROM irises ORDER BY id")
+        .fetch_all(&harness.store(0).pool)
+        .await?;
+
+    let mut fps = PlaintextFingerprints::new();
+    for (id,) in ids {
+        let mut shares = Vec::new();
+        for party in 0..NUM_PARTIES {
+            shares.push(harness.store(party).get_iris_data_by_id(id).await?);
+        }
+        let mut hasher = blake3::Hasher::new();
+        let fields: Vec<[&[u16]; 3]> = vec![
+            [shares[0].left_code(), shares[1].left_code(), shares[2].left_code()],
+            [shares[0].left_mask(), shares[1].left_mask(), shares[2].left_mask()],
+            [shares[0].right_code(), shares[1].right_code(), shares[2].right_code()],
+            [shares[0].right_mask(), shares[1].right_mask(), shares[2].right_mask()],
+        ];
+        for [s0, s1, s2] in &fields {
+            let recon = reconstruct_shares(s0, s1, s2);
+            hasher.update(bytemuck::cast_slice::<u16, u8>(&recon));
+        }
+        fps.insert(id, *hasher.finalize().as_bytes());
+    }
+    Ok(fps)
+}
+
+/// Verify that current shares reconstruct to the same plaintexts as the
+/// snapshot. `skip_ids` are excluded (modified during test).
+pub async fn verify_fingerprints(
+    harness: &TestHarness,
+    expected: &PlaintextFingerprints,
+    skip_ids: &[i64],
+) -> Result<()> {
+    let current = snapshot_all_fingerprints(harness).await?;
+    let mut checked = 0;
+    for (id, exp) in expected {
+        if skip_ids.contains(id) {
+            continue;
+        }
+        let cur = current
+            .get(id)
+            .unwrap_or_else(|| panic!("ID {} missing from current DB", id));
+        assert_eq!(exp, cur, "Plaintext fingerprint mismatch for id {}", id);
+        checked += 1;
+    }
+    println!("  verified {}/{} iris fingerprints", checked, expected.len());
+    Ok(())
+}
+
+// ---- Polling helpers ----
+
+pub async fn wait_epoch_done(harness: &TestHarness, epoch: i32) -> Result<()> {
+    let deadline = tokio::time::Instant::now() + Duration::from_secs(120);
+    let start = std::time::Instant::now();
+    let mut last_print = start;
+    loop {
+        if tokio::time::Instant::now() > deadline {
+            eyre::bail!("Timeout waiting for epoch {}", epoch);
+        }
+        let mut done = true;
+        let mut applied = [0usize; 3];
+        for (i, party) in harness.parties.iter().enumerate() {
+            let rows: Vec<(bool,)> = sqlx::query_as(
+                "SELECT live_applied FROM rerand_progress WHERE epoch = $1",
+            ).bind(epoch).fetch_all(&party.store.pool).await?;
+            applied[i] = rows.iter().filter(|(a,)| *a).count();
+            if rows.is_empty() || !rows.iter().all(|(a,)| *a) { done = false; }
+        }
+        if done {
+            println!("  epoch {} done in {:.1}s", epoch, start.elapsed().as_secs_f64());
+            return Ok(());
+        }
+        if last_print.elapsed() > Duration::from_secs(5) {
+            println!("  waiting epoch {}: applied {:?} ({:.0}s)", epoch, applied, start.elapsed().as_secs_f64());
+            last_print = std::time::Instant::now();
+        }
+        tokio::time::sleep(Duration::from_millis(500)).await;
+    }
+}
+
+pub async fn wait_chunks_staged(harness: &TestHarness, epoch: i32, n: i32) -> Result<()> {
+    let deadline = tokio::time::Instant::now() + Duration::from_secs(60);
+    let start = std::time::Instant::now();
+    loop {
+        if tokio::time::Instant::now() > deadline {
+            eyre::bail!("Timeout waiting for {} chunks staged in epoch {}", n, epoch);
+        }
+        let mut max_count = 0i64;
+        for party in &harness.parties {
+            let (count,): (i64,) = sqlx::query_as(
+                "SELECT COUNT(*) FROM rerand_progress WHERE epoch = $1 AND staging_written = TRUE",
+            ).bind(epoch).fetch_one(&party.store.pool).await?;
+            max_count = max_count.max(count);
+        }
+        if max_count >= n as i64 {
+            println!("  {} chunks staged for epoch {} in {:.1}s", max_count, epoch, start.elapsed().as_secs_f64());
+            return Ok(());
+        }
+        tokio::time::sleep(Duration::from_millis(200)).await;
+    }
+}
+
+// ---- Server simulation ----
+
+pub async fn simulate_server_startup(harness: &TestHarness, party: usize) -> Result<()> {
+    let sync_result = build_test_sync_result(harness, party).await?;
+    let pool = &harness.parties[party].store.pool;
+    let schema = &harness.parties[party].schema_name;
+    let lock_conn = rerand_store::rerand_catchup_and_lock(pool, schema, &sync_result).await?;
+    let _count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM irises").fetch_one(pool).await?;
+    rerand_store::release_rerand_lock(lock_conn).await?;
+    Ok(())
+}
+
+async fn build_test_sync_result(harness: &TestHarness, party: usize) -> Result<SyncResult> {
+    let mut all_states = Vec::new();
+    for p in &harness.parties {
+        let rerand_state = rerand_store::build_rerand_sync_state(&p.store.pool).await.ok();
+        all_states.push(SyncState {
+            db_len: p.store.count_irises().await? as u64,
+            modifications: vec![],
+            next_sns_sequence_num: None,
+            common_config: CommonConfig::default(),
+            rerand_state,
+        });
+    }
+    let my_state = all_states[party].clone();
+    Ok(SyncResult { my_state, all_states })
+}
+
+pub async fn assert_consistent_rerand_epoch(harness: &TestHarness) -> Result<i32> {
+    let mut all: Vec<Vec<(i64, i32)>> = Vec::new();
+    for party in &harness.parties {
+        all.push(sqlx::query_as("SELECT id, rerand_epoch FROM irises ORDER BY id")
+            .fetch_all(&party.store.pool).await?);
+    }
+    assert_eq!(all[0].len(), all[1].len());
+    assert_eq!(all[1].len(), all[2].len());
+    for i in 0..all[0].len() {
+        assert_eq!(all[0][i].1, all[1][i].1, "epoch mismatch id {} p0 vs p1", all[0][i].0);
+        assert_eq!(all[0][i].1, all[2][i].1, "epoch mismatch id {} p0 vs p2", all[0][i].0);
+    }
+    Ok(all[0].first().map(|(_, e)| *e).unwrap_or(0))
+}
+
+async fn cleanup(harness: &TestHarness) -> Result<()> {
+    for party in &harness.parties {
+        let staging = rerand_store::staging_schema_name(&party.schema_name);
+        let _ = sqlx::query(&format!(r#"DROP SCHEMA IF EXISTS "{}" CASCADE"#, staging))
+            .execute(&party.store.pool).await;
+        let _ = sqlx::query(&format!(r#"DROP SCHEMA IF EXISTS "{}" CASCADE"#, party.schema_name))
+            .execute(&party.store.pool).await;
+    }
+    Ok(())
+}
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index b33e7637f8..1a28435eec 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -37,6 +37,7 @@ use iris_mpc_cpu::execution::hawk_main::{
 use iris_mpc_cpu::hawkers::aby3::aby3_store::Aby3Store;
 use iris_mpc_cpu::hnsw::graph::graph_store::GraphPg;
 use iris_mpc_store::loader::load_iris_db;
+use iris_mpc_store::rerand::{self as rerand_store};
 use iris_mpc_store::Store;
 use pprof::protos::Message;
 use pprof::ProfilerGuardBuilder;
@@ -138,6 +139,13 @@ pub async fn server_main(config: Config) -> Result<()> {
 
     sync_sqs_queues(&config, &sync_result, &aws_clients).await?;
 
+    let rerand_lock_conn = rerand_store::rerand_catchup_and_lock(
+        &iris_store.pool,
+        &iris_store.schema_name,
+        &sync_result,
+    )
+    .await?;
+
     if shutdown_handler.is_shutting_down() {
         tracing::warn!("Shutting down has been triggered");
         return Ok(());
@@ -166,6 +174,8 @@ pub async fn server_main(config: Config) -> Result<()> {
     )
     .await?;
 
+    rerand_store::release_rerand_lock(rerand_lock_conn).await?;
+
     background_tasks.check_tasks();
 
     let tx_results = start_results_thread(
@@ -387,11 +397,14 @@ async fn build_sync_state(
 
     tracing::info!("Database store length is: {}", db_len);
 
+    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool).await.ok();
+
     Ok(SyncState {
         db_len,
         modifications,
         next_sns_sequence_num,
         common_config,
+        rerand_state,
     })
 }
 
diff --git a/migrations/20260226000001_add_rerand_epoch.down.sql b/migrations/20260226000001_add_rerand_epoch.down.sql
new file mode 100644
index 0000000000..97fde78bcd
--- /dev/null
+++ b/migrations/20260226000001_add_rerand_epoch.down.sql
@@ -0,0 +1,14 @@
+ALTER TABLE irises DROP COLUMN IF EXISTS rerand_epoch;
+
+CREATE OR REPLACE FUNCTION increment_version_id()
+RETURNS TRIGGER AS $$
+BEGIN
+    IF (OLD.left_code IS DISTINCT FROM NEW.left_code OR
+        OLD.left_mask IS DISTINCT FROM NEW.left_mask OR
+        OLD.right_code IS DISTINCT FROM NEW.right_code OR
+        OLD.right_mask IS DISTINCT FROM NEW.right_mask) THEN
+        NEW.version_id = COALESCE(OLD.version_id, 0) + 1;
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/migrations/20260226000001_add_rerand_epoch.up.sql b/migrations/20260226000001_add_rerand_epoch.up.sql
new file mode 100644
index 0000000000..ac3822e7e3
--- /dev/null
+++ b/migrations/20260226000001_add_rerand_epoch.up.sql
@@ -0,0 +1,15 @@
+ALTER TABLE irises ADD COLUMN IF NOT EXISTS rerand_epoch INTEGER NOT NULL DEFAULT 0;
+
+CREATE OR REPLACE FUNCTION increment_version_id()
+RETURNS TRIGGER AS $$
+BEGIN
+    IF (OLD.left_code IS DISTINCT FROM NEW.left_code OR
+        OLD.left_mask IS DISTINCT FROM NEW.left_mask OR
+        OLD.right_code IS DISTINCT FROM NEW.right_code OR
+        OLD.right_mask IS DISTINCT FROM NEW.right_mask)
+       AND NEW.rerand_epoch IS NOT DISTINCT FROM OLD.rerand_epoch THEN
+        NEW.version_id = COALESCE(OLD.version_id, 0) + 1;
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/migrations/20260226000002_create_rerand_progress.down.sql b/migrations/20260226000002_create_rerand_progress.down.sql
new file mode 100644
index 0000000000..791f86c6c2
--- /dev/null
+++ b/migrations/20260226000002_create_rerand_progress.down.sql
@@ -0,0 +1 @@
+DROP TABLE IF EXISTS rerand_progress;
diff --git a/migrations/20260226000002_create_rerand_progress.up.sql b/migrations/20260226000002_create_rerand_progress.up.sql
new file mode 100644
index 0000000000..214c4da9c9
--- /dev/null
+++ b/migrations/20260226000002_create_rerand_progress.up.sql
@@ -0,0 +1,8 @@
+CREATE TABLE IF NOT EXISTS rerand_progress (
+    epoch           INTEGER NOT NULL,
+    chunk_id        INTEGER NOT NULL,
+    staging_written BOOLEAN NOT NULL DEFAULT FALSE,
+    all_confirmed   BOOLEAN NOT NULL DEFAULT FALSE,
+    live_applied    BOOLEAN NOT NULL DEFAULT FALSE,
+    PRIMARY KEY (epoch, chunk_id)
+);

From 03557fd4944a700de4182a366a7d089b5bd0c438 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Fri, 27 Feb 2026 11:05:25 +0100
Subject: [PATCH 02/76] simplfy and fixes

---
 docs/specs/rerandomization.md                 |  26 ++-
 iris-mpc-store/src/rerand.rs                  | 196 ++++++++++--------
 iris-mpc-upgrade/src/continuous_rerand.rs     |  18 +-
 .../tests/continuous_rerand_e2e.rs            |  16 ++
 4 files changed, 151 insertions(+), 105 deletions(-)

diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index d541aeef29..2ab365614a 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -175,15 +175,25 @@ At startup, before `load_iris_db`:
 
 1. **Existing**: modification sync (`sync_modifications`) — all parties catch up on modifications, producing identical `version_id` values
 2. **New**: rerand sync — parties exchange a compact rerand watermark during the existing startup sync (`SyncState` exchange):
-   - Each party computes `(epoch, max_confirmed_chunk)` from its local `rerand_progress` table: the active epoch E and the highest `chunk_id` where `all_confirmed = TRUE`. Since chunks are processed in strictly increasing order, all chunks `0..max_confirmed_chunk` are implicitly confirmed.
+   - Each party computes `(epoch, max_confirmed_chunk)` from its local `rerand_progress` table: the active epoch E and the highest `chunk_id` where `all_confirmed = TRUE`.
    - Each party sends this single `(epoch, max_confirmed_chunk)` pair as part of `SyncState`.
-   - Each party computes `safe_up_to = max(max_confirmed_chunk_party_0, max_confirmed_chunk_party_1, max_confirmed_chunk_party_2)` for the agreed epoch E, then locally applies all chunks `0..safe_up_to` where `live_applied = FALSE`.
-   - This is safe because `all_confirmed = TRUE` at any party means that party observed all three S3 `staged` markers, which means all three parties successfully committed the chunk to their staging schemas. A slower party may not have polled S3 yet, but its staging data is already there. Using `max` ensures all parties converge to the same applied set, preventing cross-party desync where one party loads rerandomized shares and another loads stale shares.
-   - Edge case: if no chunks have been confirmed yet (fresh epoch or very start), `max_confirmed_chunk` is -1 / None. `safe_up_to` becomes -1 / None and the catch-up step is skipped entirely.
-3. **New (DB-only catch-up)**: acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection. Then for every chunk K in `0..safe_up_to` where locally `live_applied = FALSE` (in increasing order): run the same apply transaction as Step 1.11. **Keep the lock held** through step 4.
+   - Each party checks whether any peer is exactly 1 confirmed chunk ahead (within the same epoch, or has moved to the next epoch). If so, it applies that single chunk (`my_max_confirmed + 1`) from staging to the live DB.
+   - **Why at most 1 chunk**: the rerand loop has a strict per-chunk synchronization barrier — a node cannot stage chunk K+1 until all three parties have confirmed chunk K via S3 markers. Therefore it is impossible for any peer to be more than 1 confirmed chunk ahead. The implementation enforces this with a fatal bail if the gap exceeds 1 (indicates DB corruption).
+   - **Why `max` across peers**: `all_confirmed = TRUE` at any party means that party observed all three S3 `staged` markers, which means all three parties successfully committed the chunk to their staging schemas. A slower party may not have polled S3 yet, but its staging data is already there.
+   - Edge case: if all parties report the same `max_confirmed_chunk`, there is nothing to catch up and the step is skipped.
+3. **New (DB-only catch-up)**: acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection. If step 2 identified a chunk to apply, run the same apply transaction as Step 1.11. **Keep the lock held** through step 4.
 4. **Existing**: `load_iris_db` — loads from live DB into GPU memory. The advisory lock is still held, so the rerand server cannot apply new chunks while the DB is being read into memory.
 5. Release the advisory lock: `SELECT pg_advisory_unlock(RERAND_APPLY_LOCK)` on the dedicated connection, then drop the connection.
 
+### Epoch and chunk desync safety checks
+
+The startup sync validates two invariants derived from the protocol's synchronization barriers:
+
+- **Epoch gap ≤ 1**: epochs transition via a 3-party S3 barrier (`complete` markers), so no peer can be more than 1 epoch ahead. A gap > 1 is fatal.
+- **Chunk gap ≤ 1** (within the same epoch): the per-chunk S3 barrier (`staged` markers) prevents any peer from confirming more than 1 chunk ahead. A gap > 1 is fatal.
+
+If either check fails, the main server refuses to start. This catches DB corruption, manual interference, or bugs in the rerand server early, before any data is loaded into memory.
+
 ### Advisory lock: startup vs rerand server concurrency
 
 Both the rerand server (Step 1.11) and the main server startup (Steps 2.3–2.4) acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` before applying chunks. This ensures:
@@ -200,8 +210,10 @@ sqlx::query("SELECT pg_advisory_lock($1)")
     .bind(RERAND_APPLY_LOCK)
     .execute(&mut *lock_conn).await?;
 
-apply_catchup_chunks(&pool).await?;  // uses pool
-load_iris_db(&pool).await?;          // uses pool
+if let Some((epoch, chunk_id)) = catchup_chunk {
+    apply_staging_chunk(&pool, epoch, chunk_id).await?;
+}
+load_iris_db(&pool).await?;
 
 sqlx::query("SELECT pg_advisory_unlock($1)")
     .bind(RERAND_APPLY_LOCK)
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 557304c057..d03f231cfe 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -1,6 +1,6 @@
 use eyre::Result;
 use iris_mpc_common::helpers::sync::{RerandSyncState, SyncResult};
-use sqlx::{pool::PoolConnection, PgPool, Postgres};
+use sqlx::{PgPool};
 
 pub const RERAND_APPLY_LOCK: i64 = 0x5245_5241_4E44;
 
@@ -222,27 +222,6 @@ pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
     Ok(row.0)
 }
 
-/// Returns chunk_ids for a given epoch where live_applied = FALSE and
-/// chunk_id <= up_to_chunk, ordered ascending.
-pub async fn get_unapplied_chunks(
-    pool: &PgPool,
-    epoch: i32,
-    up_to_chunk: i32,
-) -> Result<Vec<i32>> {
-    let rows: Vec<(i32,)> = sqlx::query_as(
-        r#"
-        SELECT chunk_id FROM rerand_progress
-        WHERE epoch = $1 AND chunk_id <= $2 AND live_applied = FALSE
-        ORDER BY chunk_id ASC
-        "#,
-    )
-    .bind(epoch)
-    .bind(up_to_chunk)
-    .fetch_all(pool)
-    .await?;
-    Ok(rows.into_iter().map(|(id,)| id).collect())
-}
-
 // ---------------------------------------------------------------------------
 // Shared startup helpers (used by both HNSW and GPU servers)
 // ---------------------------------------------------------------------------
@@ -259,76 +238,85 @@ pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<RerandSyncState> {
     })
 }
 
-/// Compute the safe-to-apply watermark from all parties' rerand sync states.
-/// Returns `Some((epoch, max_chunk_id))` if there are chunks to catch up,
+/// Compute the single chunk (if any) that needs to be applied during startup catch-up.
+///
+/// Because the rerand loop has a strict per-chunk synchronization barrier (all 3 parties
+/// must confirm chunk K before any party can stage chunk K+1), peers can be at most
+/// 1 confirmed chunk ahead. Therefore, catch-up is always 0 or 1 chunks.
+///
+/// Returns `Some((epoch, chunk_id))` if there is exactly one chunk to catch up,
 /// `None` otherwise.
-pub fn compute_rerand_safe_up_to(sync_result: &SyncResult) -> Result<Option<(i32, i32)>> {
+pub fn compute_rerand_catchup_chunk(sync_result: &SyncResult) -> Result<Option<(i32, i32)>> {
     let my_state = match sync_result.my_state.rerand_state.as_ref() {
         Some(s) => s,
         None => return Ok(None),
     };
     let my_epoch = my_state.epoch;
+    let my_chunk = my_state.max_confirmed_chunk;
 
-    let rerand_states: Vec<&RerandSyncState> = sync_result
-        .all_states
-        .iter()
-        .filter_map(|s| s.rerand_state.as_ref())
-        .collect();
+    let mut any_peer_ahead = false;
 
-    if rerand_states.is_empty() {
-        return Ok(None);
-    }
-
-    let mut safe_up_to = -1;
-    for s in rerand_states {
-        let diff = s.epoch - my_epoch;
-        match diff {
+    for s in sync_result.all_states.iter().filter_map(|s| s.rerand_state.as_ref()) {
+        let epoch_diff = s.epoch - my_epoch;
+        match epoch_diff {
             0 => {
-                safe_up_to = safe_up_to.max(s.max_confirmed_chunk);
+                let chunk_diff = s.max_confirmed_chunk - my_chunk;
+                if chunk_diff > 1 {
+                    eyre::bail!(
+                        "Fatal chunk desync: peer confirmed chunk {} but local is at {} \
+                         (max possible difference is 1)",
+                        s.max_confirmed_chunk,
+                        my_chunk
+                    );
+                }
+                if chunk_diff == 1 {
+                    any_peer_ahead = true;
+                }
             }
             1 => {
-                safe_up_to = i32::MAX;
-            }
-            -1 => {
-                // They are behind, they contribute -1
+                any_peer_ahead = true;
             }
+            -1 => {}
             _ => {
-                eyre::bail!("Fatal epoch desync: local epoch is {}, but peer is on epoch {}", my_epoch, s.epoch);
+                eyre::bail!(
+                    "Fatal epoch desync: local epoch is {}, but peer is on epoch {}",
+                    my_epoch,
+                    s.epoch
+                );
             }
         }
     }
 
-    if safe_up_to < 0 {
+    if !any_peer_ahead {
         return Ok(None);
     }
 
-    Ok(Some((my_epoch, safe_up_to)))
+    let catchup_chunk = my_chunk + 1;
+    Ok(Some((my_epoch, catchup_chunk)))
 }
 
 /// Perform rerand catch-up and acquire the advisory lock.
 ///
-/// 1. Computes the safe-to-apply watermark from `sync_result`.
-/// 2. If there are unapplied chunks, acquires `pg_advisory_lock(RERAND_APPLY_LOCK)`
-///    on a dedicated connection, then applies all unapplied chunks.
-/// 3. Returns the lock-holding connection (if the lock was acquired).
-///
-/// The caller **must** keep the returned connection alive until `load_iris_db`
-/// finishes, then call [`release_rerand_lock`] to release it.
+/// 1. Determines whether this node is 1 chunk behind a peer.
+/// 2. If so, acquires `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated
+///    connection and applies the single missing chunk.
+/// 3. Returns the lock-holding connection (caller keeps it alive through
+///    `load_iris_db`, then calls [`release_rerand_lock`]).
 pub async fn rerand_catchup_and_lock(
     pool: &PgPool,
     schema_name: &str,
     sync_result: &SyncResult,
-) -> Result<Option<PoolConnection<Postgres>>> {
-    let safe_up_to = match compute_rerand_safe_up_to(sync_result)? {
+) -> Result<Option<sqlx::pool::PoolConnection<sqlx::Postgres>>> {
+    let (epoch, chunk_id) = match compute_rerand_catchup_chunk(sync_result)? {
         Some(v) => v,
         None => return Ok(None),
     };
 
     let staging_schema = staging_schema_name(schema_name);
     tracing::info!(
-        "Rerand catch-up: applying chunks up to {} for epoch {}",
-        safe_up_to.1,
-        safe_up_to.0
+        "Rerand catch-up: applying epoch {} chunk {}",
+        epoch,
+        chunk_id,
     );
 
     let mut conn = pool.acquire().await?;
@@ -337,24 +325,29 @@ pub async fn rerand_catchup_and_lock(
         .execute(&mut *conn)
         .await?;
 
-    let unapplied = get_unapplied_chunks(pool, safe_up_to.0, safe_up_to.1).await?;
-    for chunk_id in unapplied {
-        let rows =
-            apply_staging_chunk(pool, &staging_schema, safe_up_to.0, chunk_id).await?;
-        tracing::info!(
-            "Rerand catch-up: applied epoch {} chunk {} ({} rows)",
-            safe_up_to.0,
-            chunk_id,
-            rows
-        );
-    }
+    let rows = match apply_staging_chunk(pool, &staging_schema, epoch, chunk_id).await {
+        Ok(r) => r,
+        Err(e) => {
+            let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
+                .bind(RERAND_APPLY_LOCK)
+                .execute(&mut *conn)
+                .await;
+            return Err(e);
+        }
+    };
+    tracing::info!(
+        "Rerand catch-up: applied epoch {} chunk {} ({} rows)",
+        epoch,
+        chunk_id,
+        rows
+    );
 
     Ok(Some(conn))
 }
 
 /// Release the advisory lock acquired by [`rerand_catchup_and_lock`].
 pub async fn release_rerand_lock(
-    lock_conn: Option<PoolConnection<Postgres>>,
+    lock_conn: Option<sqlx::pool::PoolConnection<sqlx::Postgres>>,
 ) -> Result<()> {
     if let Some(mut conn) = lock_conn {
         sqlx::query("SELECT pg_advisory_unlock($1)")
@@ -387,55 +380,80 @@ mod tests {
     }
 
     #[test]
-    fn test_compute_rerand_safe_up_to_same_epoch() {
-        let p0 = dummy_sync_state(1, 5);
+    fn test_catchup_peer_one_chunk_ahead() {
+        let p0 = dummy_sync_state(1, 4);
         let p1 = dummy_sync_state(1, 4);
-        let p2 = dummy_sync_state(1, 6);
+        let p2 = dummy_sync_state(1, 5);
+        let sync_result = SyncResult {
+            my_state: p0.clone(),
+            all_states: vec![p0, p1, p2],
+        };
+        assert_eq!(
+            compute_rerand_catchup_chunk(&sync_result).unwrap(),
+            Some((1, 5))
+        );
+    }
+
+    #[test]
+    fn test_catchup_all_same() {
+        let p0 = dummy_sync_state(1, 5);
+        let p1 = dummy_sync_state(1, 5);
+        let p2 = dummy_sync_state(1, 5);
         let sync_result = SyncResult {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert_eq!(compute_rerand_safe_up_to(&sync_result).unwrap(), Some((1, 6)));
+        assert_eq!(compute_rerand_catchup_chunk(&sync_result).unwrap(), None);
     }
 
     #[test]
-    fn test_compute_rerand_safe_up_to_peer_ahead() {
-        // I am on epoch 0, but peer is on epoch 1.
-        // This implies the peer has confirmed all my chunks for epoch 0.
+    fn test_catchup_peer_epoch_ahead() {
         let p0 = dummy_sync_state(0, 5);
-        let p1 = dummy_sync_state(1, 0); // ahead
+        let p1 = dummy_sync_state(1, 0);
         let p2 = dummy_sync_state(0, 5);
         let sync_result = SyncResult {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert_eq!(compute_rerand_safe_up_to(&sync_result).unwrap(), Some((0, i32::MAX)));
+        assert_eq!(
+            compute_rerand_catchup_chunk(&sync_result).unwrap(),
+            Some((0, 6))
+        );
     }
 
     #[test]
-    fn test_compute_rerand_safe_up_to_peer_behind() {
-        // I am on epoch 1, but peer is on epoch 0.
-        // This implies the peer has not confirmed any chunks for epoch 1.
+    fn test_catchup_peer_epoch_behind() {
         let p0 = dummy_sync_state(1, 2);
-        let p1 = dummy_sync_state(0, 10); // behind
+        let p1 = dummy_sync_state(0, 10);
         let p2 = dummy_sync_state(1, 2);
         let sync_result = SyncResult {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert_eq!(compute_rerand_safe_up_to(&sync_result).unwrap(), Some((1, 2)));
+        assert_eq!(compute_rerand_catchup_chunk(&sync_result).unwrap(), None);
     }
-    
+
+    #[test]
+    fn test_catchup_fatal_chunk_desync() {
+        let p0 = dummy_sync_state(1, 2);
+        let p1 = dummy_sync_state(1, 4);
+        let p2 = dummy_sync_state(1, 2);
+        let sync_result = SyncResult {
+            my_state: p0.clone(),
+            all_states: vec![p0, p1, p2],
+        };
+        assert!(compute_rerand_catchup_chunk(&sync_result).is_err());
+    }
+
     #[test]
-    fn test_compute_rerand_safe_up_to_fatal_desync() {
-        // I am on epoch 1, but peer is on epoch 3 (difference > 1).
+    fn test_catchup_fatal_epoch_desync() {
         let p0 = dummy_sync_state(1, 2);
-        let p1 = dummy_sync_state(3, 10); // way ahead
+        let p1 = dummy_sync_state(3, 10);
         let p2 = dummy_sync_state(1, 2);
         let sync_result = SyncResult {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert!(compute_rerand_safe_up_to(&sync_result).is_err());
+        assert!(compute_rerand_catchup_chunk(&sync_result).is_err());
     }
 }
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index c948b5cd86..abaf9afa73 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -149,15 +149,7 @@ pub async fn run_continuous_rerand(
                 .execute(&mut *lock_conn)
                 .await?;
 
-            let rows =
-                apply_staging_chunk(pool, &staging_schema, active_epoch as i32, chunk_id as i32)
-                    .await?;
-            tracing::info!(
-                "Epoch {} chunk {}: applied to live DB ({} rows updated)",
-                active_epoch,
-                chunk_id,
-                rows
-            );
+            let apply_res = apply_staging_chunk(pool, &staging_schema, active_epoch as i32, chunk_id as i32).await;
 
             sqlx::query("SELECT pg_advisory_unlock($1)")
                 .bind(RERAND_APPLY_LOCK)
@@ -165,6 +157,14 @@ pub async fn run_continuous_rerand(
                 .await?;
             drop(lock_conn);
 
+            let rows = apply_res?;
+            tracing::info!(
+                "Epoch {} chunk {}: applied to live DB ({} rows updated)",
+                active_epoch,
+                chunk_id,
+                rows
+            );
+
             chunk_id += 1;
 
             if chunk_delay > Duration::ZERO {
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index 1b92b96571..4d5f1f9239 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -305,6 +305,22 @@ fn phase8_reject_desync() {
         let r1 = simulate_server_startup(&env.harness, 1).await;
         assert!(r1.is_err(), "P1 startup should have failed due to large epoch gap");
 
+        // Now test the new chunk gap logic
+        // P1 has chunk 0 confirmed, P0 has chunk 2 confirmed (gap > 1) in the same epoch
+        for p in 0..NUM_PARTIES {
+            let pool = &env.harness.parties[p].store.pool;
+            sqlx::query("DELETE FROM rerand_progress").execute(pool).await.unwrap();
+        }
+        
+        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (3, 0, TRUE, TRUE, TRUE)")
+            .execute(&env.harness.parties[1].store.pool).await.unwrap();
+            
+        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (3, 2, TRUE, TRUE, FALSE)")
+            .execute(&env.harness.parties[0].store.pool).await.unwrap();
+            
+        let r1_chunk_desync = simulate_server_startup(&env.harness, 1).await;
+        assert!(r1_chunk_desync.is_err(), "P1 startup should have failed due to large chunk gap");
+
         println!("[phase 8] PASSED");
 
         env.teardown().await

From 262af1a889bd9da267e0c63e55b35e1f8f1d44e8 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Fri, 27 Feb 2026 11:11:06 +0100
Subject: [PATCH 03/76] clippy and semgrep

---
 .../bin/iris-mpc-upgrade/rerandomize_db.rs    |  17 +-
 iris-mpc-bins/bin/iris-mpc/server.rs          |  13 +-
 iris-mpc-store/src/rerand.rs                  |  35 +++-
 iris-mpc-upgrade/src/continuous_rerand.rs     |  17 +-
 iris-mpc-upgrade/src/epoch.rs                 |  20 +-
 iris-mpc-upgrade/src/rerandomization.rs       |  18 +-
 iris-mpc-upgrade/src/s3_coordination.rs       |   7 +-
 .../tests/continuous_rerand_e2e.rs            |  23 +-
 iris-mpc-upgrade/tests/test_utils.rs          | 196 ++++++++++++++----
 iris-mpc/src/server/mod.rs                    |   4 +-
 10 files changed, 243 insertions(+), 107 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
index 5cbe5c51e8..da522dfa56 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
@@ -533,12 +533,15 @@ async fn rerandomize_check_main(config: ReRandomizeCheckConfig) -> Result<()> {
 }
 
 async fn rerandomize_continuous_main(config: RerandomizeContinuousConfig) -> Result<()> {
-    tracing::info!("Starting continuous rerandomization for party {}", config.party_id);
+    tracing::info!(
+        "Starting continuous rerandomization for party {}",
+        config.party_id
+    );
 
     let mut background_tasks = TaskMonitor::new();
     let healthcheck_port = config.healthcheck_port;
-    let _health_check_abort = background_tasks
-        .spawn(async move { spawn_healthcheck_server(healthcheck_port).await });
+    let _health_check_abort =
+        background_tasks.spawn(async move { spawn_healthcheck_server(healthcheck_port).await });
     background_tasks.check_tasks();
 
     let sdk_config = aws_config::from_env().load().await;
@@ -547,12 +550,8 @@ async fn rerandomize_continuous_main(config: RerandomizeContinuousConfig) -> Res
     let s3_client = S3Client::from_conf(s3_config.build());
     let sm_client = SecretsManagerClient::from_conf(sm_config.build());
 
-    let postgres_client = PostgresClient::new(
-        &config.db_url,
-        &config.schema_name,
-        AccessMode::ReadWrite,
-    )
-    .await?;
+    let postgres_client =
+        PostgresClient::new(&config.db_url, &config.schema_name, AccessMode::ReadWrite).await?;
     let store = Store::new(&postgres_client).await?;
 
     continuous_rerand::run_continuous_rerand(&config, &s3_client, &sm_client, &store, None).await?;
diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 82d49a76e7..6d68b5e0b4 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -982,7 +982,9 @@ async fn server_main(config: Config) -> Result<()> {
     let is_ready_flag = Arc::new(AtomicBool::new(false));
     let is_ready_flag_cloned = Arc::clone(&is_ready_flag);
 
-    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool).await.ok();
+    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool)
+        .await
+        .ok();
     let my_state = SyncState {
         db_len: store_len as u64,
         modifications: store.last_modifications(max_modification_lookback).await?,
@@ -1318,12 +1320,9 @@ async fn server_main(config: Config) -> Result<()> {
         }
     }
 
-    let rerand_lock_conn = rerand_store::rerand_catchup_and_lock(
-        &store.pool,
-        &store.schema_name,
-        &sync_result,
-    )
-    .await?;
+    let rerand_lock_conn =
+        rerand_store::rerand_catchup_and_lock(&store.pool, &store.schema_name, &sync_result)
+            .await?;
 
     if download_shutdown_handler.is_shutting_down() {
         tracing::warn!("Shutting down has been triggered");
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index d03f231cfe..a52372d268 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -1,6 +1,6 @@
 use eyre::Result;
 use iris_mpc_common::helpers::sync::{RerandSyncState, SyncResult};
-use sqlx::{PgPool};
+use sqlx::PgPool;
 
 pub const RERAND_APPLY_LOCK: i64 = 0x5245_5241_4E44;
 
@@ -29,7 +29,21 @@ pub fn staging_schema_name(live_schema: &str) -> String {
     format!("{}_rerand_staging", live_schema)
 }
 
+fn validate_identifier(name: &str) -> Result<()> {
+    if name.is_empty() {
+        eyre::bail!("SQL identifier must not be empty");
+    }
+    if !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') {
+        eyre::bail!(
+            "SQL identifier contains invalid characters (only ASCII alphanumeric and _ allowed): {:?}",
+            name
+        );
+    }
+    Ok(())
+}
+
 pub async fn ensure_staging_schema(pool: &PgPool, staging_schema: &str) -> Result<()> {
+    validate_identifier(staging_schema)?;
     let create_schema = format!(r#"CREATE SCHEMA IF NOT EXISTS "{}""#, staging_schema);
     sqlx::query(&create_schema).execute(pool).await?;
 
@@ -62,6 +76,7 @@ pub async fn insert_staging_irises(
     if entries.is_empty() {
         return Ok(());
     }
+    validate_identifier(staging_schema)?;
 
     let table = format!("\"{}\".irises", staging_schema);
     let header = format!(
@@ -99,6 +114,7 @@ pub async fn apply_staging_chunk(
     epoch: i32,
     chunk_id: i32,
 ) -> Result<u64> {
+    validate_identifier(staging_schema)?;
     let mut tx = pool.begin().await?;
 
     let update_sql = format!(
@@ -215,10 +231,9 @@ pub async fn get_max_confirmed_chunk(pool: &PgPool, epoch: i32) -> Result<Option
 
 /// Returns the highest epoch that has any rerand_progress rows.
 pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
-    let row: (Option<i32>,) =
-        sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
-            .fetch_one(pool)
-            .await?;
+    let row: (Option<i32>,) = sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
+        .fetch_one(pool)
+        .await?;
     Ok(row.0)
 }
 
@@ -229,9 +244,7 @@ pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
 /// Build the rerand sync state from the local `rerand_progress` table.
 pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<RerandSyncState> {
     let epoch = get_current_epoch(pool).await?.unwrap_or(0);
-    let max_confirmed = get_max_confirmed_chunk(pool, epoch)
-        .await?
-        .unwrap_or(-1);
+    let max_confirmed = get_max_confirmed_chunk(pool, epoch).await?.unwrap_or(-1);
     Ok(RerandSyncState {
         epoch,
         max_confirmed_chunk: max_confirmed,
@@ -256,7 +269,11 @@ pub fn compute_rerand_catchup_chunk(sync_result: &SyncResult) -> Result<Option<(
 
     let mut any_peer_ahead = false;
 
-    for s in sync_result.all_states.iter().filter_map(|s| s.rerand_state.as_ref()) {
+    for s in sync_result
+        .all_states
+        .iter()
+        .filter_map(|s| s.rerand_state.as_ref())
+    {
         let epoch_diff = s.epoch - my_epoch;
         match epoch_diff {
             0 => {
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index abaf9afa73..b6152948e8 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -77,8 +77,7 @@ pub async fn run_continuous_rerand(
                 break;
             }
 
-            let progress =
-                get_rerand_progress(pool, active_epoch as i32, chunk_id as i32).await?;
+            let progress = get_rerand_progress(pool, active_epoch as i32, chunk_id as i32).await?;
 
             if progress.as_ref().is_some_and(|p| p.live_applied) {
                 chunk_id += 1;
@@ -149,7 +148,9 @@ pub async fn run_continuous_rerand(
                 .execute(&mut *lock_conn)
                 .await?;
 
-            let apply_res = apply_staging_chunk(pool, &staging_schema, active_epoch as i32, chunk_id as i32).await;
+            let apply_res =
+                apply_staging_chunk(pool, &staging_schema, active_epoch as i32, chunk_id as i32)
+                    .await;
 
             sqlx::query("SELECT pg_advisory_unlock($1)")
                 .bind(RERAND_APPLY_LOCK)
@@ -235,14 +236,8 @@ async fn get_or_create_manifest(
         Ok(manifest)
     } else {
         let local_max = store.get_max_serial_id().await? as u64;
-        s3_coordination::upload_max_id(
-            s3,
-            &config.s3_bucket,
-            epoch,
-            config.party_id,
-            local_max,
-        )
-        .await?;
+        s3_coordination::upload_max_id(s3, &config.s3_bucket, epoch, config.party_id, local_max)
+            .await?;
 
         s3_coordination::download_manifest(s3, &config.s3_bucket, epoch, poll_interval).await
     }
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index 1cd79da13d..00bca5c1bd 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -161,14 +161,26 @@ pub async fn derive_shared_secret(
     let pk_next_b64 =
         s3_coordination::download_public_key_for_party(s3, bucket, epoch, next_id, poll_interval)
             .await?;
-    let pk_next = tripartite_dh::PublicKeys::deserialize(&STANDARD.decode(&pk_next_b64)?)
-        .map_err(|e| eyre!("Failed to deserialize public key for party {}: {:?}", next_id, e))?;
+    let pk_next =
+        tripartite_dh::PublicKeys::deserialize(&STANDARD.decode(&pk_next_b64)?).map_err(|e| {
+            eyre!(
+                "Failed to deserialize public key for party {}: {:?}",
+                next_id,
+                e
+            )
+        })?;
 
     let pk_prev_b64 =
         s3_coordination::download_public_key_for_party(s3, bucket, epoch, prev_id, poll_interval)
             .await?;
-    let pk_prev = tripartite_dh::PublicKeys::deserialize(&STANDARD.decode(&pk_prev_b64)?)
-        .map_err(|e| eyre!("Failed to deserialize public key for party {}: {:?}", prev_id, e))?;
+    let pk_prev =
+        tripartite_dh::PublicKeys::deserialize(&STANDARD.decode(&pk_prev_b64)?).map_err(|e| {
+            eyre!(
+                "Failed to deserialize public key for party {}: {:?}",
+                prev_id,
+                e
+            )
+        })?;
 
     let shared_secret = private_key.derive_shared_secret(&pk_next, &pk_prev);
     let hash = blake3::hash(&shared_secret);
diff --git a/iris-mpc-upgrade/src/rerandomization.rs b/iris-mpc-upgrade/src/rerandomization.rs
index 40db34136c..4038309361 100644
--- a/iris-mpc-upgrade/src/rerandomization.rs
+++ b/iris-mpc-upgrade/src/rerandomization.rs
@@ -93,18 +93,12 @@ fn randomize_galois_ring_coefs(coefs: &mut [u16], xof: &mut blake3::OutputReader
 /// Verifies consistency by reconstructing from all 3 pairs (0-1, 1-2, 0-2) and
 /// asserting they agree.
 pub fn reconstruct_shares(share0: &[u16], share1: &[u16], share2: &[u16]) -> Vec<u16> {
-    let lag_01 =
-        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID1);
-    let lag_10 =
-        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID0);
-    let lag_02 =
-        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID2);
-    let lag_20 =
-        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID0);
-    let lag_12 =
-        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID2);
-    let lag_21 =
-        ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID1);
+    let lag_01 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID1);
+    let lag_10 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID0);
+    let lag_02 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID2);
+    let lag_20 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID0);
+    let lag_12 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID2);
+    let lag_21 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID1);
 
     assert!(share0.len() == share1.len() && share1.len() == share2.len());
 
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
index f8ccfe4713..b391a654d7 100644
--- a/iris-mpc-upgrade/src/s3_coordination.rs
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -39,12 +39,7 @@ fn epoch_party_prefix(epoch: u32, party: u8) -> String {
     format!("rerand/epoch-{}/party-{}", epoch, party)
 }
 
-pub async fn upload_marker(
-    s3: &S3Client,
-    bucket: &str,
-    key: &str,
-    body: Vec<u8>,
-) -> Result<()> {
+pub async fn upload_marker(s3: &S3Client, bucket: &str, key: &str, body: Vec<u8>) -> Result<()> {
     s3.put_object()
         .bucket(bucket)
         .key(key)
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index 4d5f1f9239..b2a1082dec 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -213,7 +213,7 @@ fn phase6_multiple_epochs() {
         println!("[phase 6] Multiple epochs...");
 
         let (h, t) = env.spawn_all();
-        
+
         // Wait for epoch 0 to finish
         wait_epoch_done(&env.harness, 0).await?;
         println!("[phase 6]   epoch 0 completed");
@@ -303,23 +303,32 @@ fn phase8_reject_desync() {
             .execute(&env.harness.parties[2].store.pool).await.unwrap();
 
         let r1 = simulate_server_startup(&env.harness, 1).await;
-        assert!(r1.is_err(), "P1 startup should have failed due to large epoch gap");
+        assert!(
+            r1.is_err(),
+            "P1 startup should have failed due to large epoch gap"
+        );
 
         // Now test the new chunk gap logic
         // P1 has chunk 0 confirmed, P0 has chunk 2 confirmed (gap > 1) in the same epoch
         for p in 0..NUM_PARTIES {
             let pool = &env.harness.parties[p].store.pool;
-            sqlx::query("DELETE FROM rerand_progress").execute(pool).await.unwrap();
+            sqlx::query("DELETE FROM rerand_progress")
+                .execute(pool)
+                .await
+                .unwrap();
         }
-        
+
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (3, 0, TRUE, TRUE, TRUE)")
             .execute(&env.harness.parties[1].store.pool).await.unwrap();
-            
+
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (3, 2, TRUE, TRUE, FALSE)")
             .execute(&env.harness.parties[0].store.pool).await.unwrap();
-            
+
         let r1_chunk_desync = simulate_server_startup(&env.harness, 1).await;
-        assert!(r1_chunk_desync.is_err(), "P1 startup should have failed due to large chunk gap");
+        assert!(
+            r1_chunk_desync.is_err(),
+            "P1 startup should have failed due to large chunk gap"
+        );
 
         println!("[phase 8] PASSED");
 
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index e23cd3869b..6a6280eafa 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -85,14 +85,27 @@ impl TestEnv {
         let s3 = aws_sdk_s3::Client::new(&sdk);
         let sm = aws_sdk_secretsmanager::Client::new(&sdk);
 
-        s3.create_bucket().bucket(&bucket).send().await
+        s3.create_bucket()
+            .bucket(&bucket)
+            .send()
+            .await
             .map_err(|e| eyre::eyre!("Failed to create bucket {}: {}", bucket, e))?;
 
-        println!("  [setup] Seeding {} irises (prefix={}, bucket={})", DB_SIZE, prefix, bucket);
+        println!(
+            "  [setup] Seeding {} irises (prefix={}, bucket={})",
+            DB_SIZE, prefix, bucket
+        );
         seed_three_party_db(&harness, DB_SIZE).await?;
         let fingerprints = snapshot_all_fingerprints(&harness).await?;
 
-        Ok(Self { harness, s3, sm, prefix, bucket, fingerprints })
+        Ok(Self {
+            harness,
+            s3,
+            sm,
+            prefix,
+            bucket,
+            fingerprints,
+        })
     }
 
     pub async fn teardown(&self) -> Result<()> {
@@ -101,16 +114,25 @@ impl TestEnv {
         let mut token = None;
         loop {
             let mut req = self.s3.list_objects_v2().bucket(&self.bucket);
-            if let Some(t) = &token { req = req.continuation_token(t); }
+            if let Some(t) = &token {
+                req = req.continuation_token(t);
+            }
             let resp = req.send().await?;
             for obj in resp.contents() {
                 if let Some(key) = obj.key() {
-                    self.s3.delete_object().bucket(&self.bucket).key(key).send().await?;
+                    self.s3
+                        .delete_object()
+                        .bucket(&self.bucket)
+                        .key(key)
+                        .send()
+                        .await?;
                 }
             }
             if resp.is_truncated() == Some(true) {
                 token = resp.next_continuation_token().map(|s| s.to_string());
-            } else { break; }
+            } else {
+                break;
+            }
         }
         let _ = self.s3.delete_bucket().bucket(&self.bucket).send().await;
         Ok(())
@@ -134,7 +156,10 @@ impl TestEnv {
         }
     }
 
-    pub fn spawn_rerand(&self, party_id: u8) -> (tokio::task::JoinHandle<Result<()>>, CancellationToken) {
+    pub fn spawn_rerand(
+        &self,
+        party_id: u8,
+    ) -> (tokio::task::JoinHandle<Result<()>>, CancellationToken) {
         let config = self.make_config(party_id);
         let s3 = self.s3.clone();
         let sm = self.sm.clone();
@@ -147,7 +172,12 @@ impl TestEnv {
         (h, token)
     }
 
-    pub fn spawn_all(&self) -> (Vec<tokio::task::JoinHandle<Result<()>>>, Vec<CancellationToken>) {
+    pub fn spawn_all(
+        &self,
+    ) -> (
+        Vec<tokio::task::JoinHandle<Result<()>>>,
+        Vec<CancellationToken>,
+    ) {
         let mut handles = Vec::new();
         let mut tokens = Vec::new();
         for p in 0u8..3 {
@@ -163,9 +193,15 @@ pub async fn stop_all(
     tokens: Vec<CancellationToken>,
     handles: Vec<tokio::task::JoinHandle<Result<()>>>,
 ) {
-    for t in &tokens { t.cancel(); }
-    for h in &handles { h.abort(); }
-    for h in handles { let _ = h.await; }
+    for t in &tokens {
+        t.cancel();
+    }
+    for h in &handles {
+        h.abort();
+    }
+    for h in handles {
+        let _ = h.await;
+    }
 }
 
 // ---- DB seeding ----
@@ -175,7 +211,13 @@ pub async fn seed_three_party_db(harness: &TestHarness, count: usize) -> Result<
     for chunk_start in (1..=count).step_by(100) {
         let chunk_end = std::cmp::min(chunk_start + 100, count + 1);
 
-        struct S { id: i64, lc: Vec<u16>, lm: Vec<u16>, rc: Vec<u16>, rm: Vec<u16> }
+        struct S {
+            id: i64,
+            lc: Vec<u16>,
+            lm: Vec<u16>,
+            rc: Vec<u16>,
+            rm: Vec<u16>,
+        }
 
         let mut party_data: Vec<Vec<S>> = (0..NUM_PARTIES).map(|_| Vec::new()).collect();
         for serial_id in chunk_start..chunk_end {
@@ -186,16 +228,24 @@ pub async fn seed_three_party_db(harness: &TestHarness, count: usize) -> Result<
             for (pi, (left, right)) in [(l0, r0), (l1, r1), (l2, r2)].into_iter().enumerate() {
                 party_data[pi].push(S {
                     id: serial_id as i64,
-                    lc: left.code.coefs.to_vec(), lm: left.mask.coefs.to_vec(),
-                    rc: right.code.coefs.to_vec(), rm: right.mask.coefs.to_vec(),
+                    lc: left.code.coefs.to_vec(),
+                    lm: left.mask.coefs.to_vec(),
+                    rc: right.code.coefs.to_vec(),
+                    rm: right.mask.coefs.to_vec(),
                 });
             }
         }
         for (pi, shares) in party_data.iter().enumerate() {
-            let refs: Vec<StoredIrisRef> = shares.iter().map(|s| StoredIrisRef {
-                id: s.id, left_code: &s.lc, left_mask: &s.lm,
-                right_code: &s.rc, right_mask: &s.rm,
-            }).collect();
+            let refs: Vec<StoredIrisRef> = shares
+                .iter()
+                .map(|s| StoredIrisRef {
+                    id: s.id,
+                    left_code: &s.lc,
+                    left_mask: &s.lm,
+                    right_code: &s.rc,
+                    right_mask: &s.rm,
+                })
+                .collect();
             let store = harness.store(pi);
             let mut tx = store.tx().await?;
             store.insert_irises_overriding(&mut tx, &refs).await?;
@@ -226,10 +276,26 @@ pub async fn snapshot_all_fingerprints(harness: &TestHarness) -> Result<Plaintex
         }
         let mut hasher = blake3::Hasher::new();
         let fields: Vec<[&[u16]; 3]> = vec![
-            [shares[0].left_code(), shares[1].left_code(), shares[2].left_code()],
-            [shares[0].left_mask(), shares[1].left_mask(), shares[2].left_mask()],
-            [shares[0].right_code(), shares[1].right_code(), shares[2].right_code()],
-            [shares[0].right_mask(), shares[1].right_mask(), shares[2].right_mask()],
+            [
+                shares[0].left_code(),
+                shares[1].left_code(),
+                shares[2].left_code(),
+            ],
+            [
+                shares[0].left_mask(),
+                shares[1].left_mask(),
+                shares[2].left_mask(),
+            ],
+            [
+                shares[0].right_code(),
+                shares[1].right_code(),
+                shares[2].right_code(),
+            ],
+            [
+                shares[0].right_mask(),
+                shares[1].right_mask(),
+                shares[2].right_mask(),
+            ],
         ];
         for [s0, s1, s2] in &fields {
             let recon = reconstruct_shares(s0, s1, s2);
@@ -259,7 +325,11 @@ pub async fn verify_fingerprints(
         assert_eq!(exp, cur, "Plaintext fingerprint mismatch for id {}", id);
         checked += 1;
     }
-    println!("  verified {}/{} iris fingerprints", checked, expected.len());
+    println!(
+        "  verified {}/{} iris fingerprints",
+        checked,
+        expected.len()
+    );
     Ok(())
 }
 
@@ -276,18 +346,31 @@ pub async fn wait_epoch_done(harness: &TestHarness, epoch: i32) -> Result<()> {
         let mut done = true;
         let mut applied = [0usize; 3];
         for (i, party) in harness.parties.iter().enumerate() {
-            let rows: Vec<(bool,)> = sqlx::query_as(
-                "SELECT live_applied FROM rerand_progress WHERE epoch = $1",
-            ).bind(epoch).fetch_all(&party.store.pool).await?;
+            let rows: Vec<(bool,)> =
+                sqlx::query_as("SELECT live_applied FROM rerand_progress WHERE epoch = $1")
+                    .bind(epoch)
+                    .fetch_all(&party.store.pool)
+                    .await?;
             applied[i] = rows.iter().filter(|(a,)| *a).count();
-            if rows.is_empty() || !rows.iter().all(|(a,)| *a) { done = false; }
+            if rows.is_empty() || !rows.iter().all(|(a,)| *a) {
+                done = false;
+            }
         }
         if done {
-            println!("  epoch {} done in {:.1}s", epoch, start.elapsed().as_secs_f64());
+            println!(
+                "  epoch {} done in {:.1}s",
+                epoch,
+                start.elapsed().as_secs_f64()
+            );
             return Ok(());
         }
         if last_print.elapsed() > Duration::from_secs(5) {
-            println!("  waiting epoch {}: applied {:?} ({:.0}s)", epoch, applied, start.elapsed().as_secs_f64());
+            println!(
+                "  waiting epoch {}: applied {:?} ({:.0}s)",
+                epoch,
+                applied,
+                start.elapsed().as_secs_f64()
+            );
             last_print = std::time::Instant::now();
         }
         tokio::time::sleep(Duration::from_millis(500)).await;
@@ -305,11 +388,19 @@ pub async fn wait_chunks_staged(harness: &TestHarness, epoch: i32, n: i32) -> Re
         for party in &harness.parties {
             let (count,): (i64,) = sqlx::query_as(
                 "SELECT COUNT(*) FROM rerand_progress WHERE epoch = $1 AND staging_written = TRUE",
-            ).bind(epoch).fetch_one(&party.store.pool).await?;
+            )
+            .bind(epoch)
+            .fetch_one(&party.store.pool)
+            .await?;
             max_count = max_count.max(count);
         }
         if max_count >= n as i64 {
-            println!("  {} chunks staged for epoch {} in {:.1}s", max_count, epoch, start.elapsed().as_secs_f64());
+            println!(
+                "  {} chunks staged for epoch {} in {:.1}s",
+                max_count,
+                epoch,
+                start.elapsed().as_secs_f64()
+            );
             return Ok(());
         }
         tokio::time::sleep(Duration::from_millis(200)).await;
@@ -323,7 +414,9 @@ pub async fn simulate_server_startup(harness: &TestHarness, party: usize) -> Res
     let pool = &harness.parties[party].store.pool;
     let schema = &harness.parties[party].schema_name;
     let lock_conn = rerand_store::rerand_catchup_and_lock(pool, schema, &sync_result).await?;
-    let _count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM irises").fetch_one(pool).await?;
+    let _count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM irises")
+        .fetch_one(pool)
+        .await?;
     rerand_store::release_rerand_lock(lock_conn).await?;
     Ok(())
 }
@@ -331,7 +424,9 @@ pub async fn simulate_server_startup(harness: &TestHarness, party: usize) -> Res
 async fn build_test_sync_result(harness: &TestHarness, party: usize) -> Result<SyncResult> {
     let mut all_states = Vec::new();
     for p in &harness.parties {
-        let rerand_state = rerand_store::build_rerand_sync_state(&p.store.pool).await.ok();
+        let rerand_state = rerand_store::build_rerand_sync_state(&p.store.pool)
+            .await
+            .ok();
         all_states.push(SyncState {
             db_len: p.store.count_irises().await? as u64,
             modifications: vec![],
@@ -341,20 +436,34 @@ async fn build_test_sync_result(harness: &TestHarness, party: usize) -> Result<S
         });
     }
     let my_state = all_states[party].clone();
-    Ok(SyncResult { my_state, all_states })
+    Ok(SyncResult {
+        my_state,
+        all_states,
+    })
 }
 
 pub async fn assert_consistent_rerand_epoch(harness: &TestHarness) -> Result<i32> {
     let mut all: Vec<Vec<(i64, i32)>> = Vec::new();
     for party in &harness.parties {
-        all.push(sqlx::query_as("SELECT id, rerand_epoch FROM irises ORDER BY id")
-            .fetch_all(&party.store.pool).await?);
+        all.push(
+            sqlx::query_as("SELECT id, rerand_epoch FROM irises ORDER BY id")
+                .fetch_all(&party.store.pool)
+                .await?,
+        );
     }
     assert_eq!(all[0].len(), all[1].len());
     assert_eq!(all[1].len(), all[2].len());
     for i in 0..all[0].len() {
-        assert_eq!(all[0][i].1, all[1][i].1, "epoch mismatch id {} p0 vs p1", all[0][i].0);
-        assert_eq!(all[0][i].1, all[2][i].1, "epoch mismatch id {} p0 vs p2", all[0][i].0);
+        assert_eq!(
+            all[0][i].1, all[1][i].1,
+            "epoch mismatch id {} p0 vs p1",
+            all[0][i].0
+        );
+        assert_eq!(
+            all[0][i].1, all[2][i].1,
+            "epoch mismatch id {} p0 vs p2",
+            all[0][i].0
+        );
     }
     Ok(all[0].first().map(|(_, e)| *e).unwrap_or(0))
 }
@@ -363,9 +472,14 @@ async fn cleanup(harness: &TestHarness) -> Result<()> {
     for party in &harness.parties {
         let staging = rerand_store::staging_schema_name(&party.schema_name);
         let _ = sqlx::query(&format!(r#"DROP SCHEMA IF EXISTS "{}" CASCADE"#, staging))
-            .execute(&party.store.pool).await;
-        let _ = sqlx::query(&format!(r#"DROP SCHEMA IF EXISTS "{}" CASCADE"#, party.schema_name))
-            .execute(&party.store.pool).await;
+            .execute(&party.store.pool)
+            .await;
+        let _ = sqlx::query(&format!(
+            r#"DROP SCHEMA IF EXISTS "{}" CASCADE"#,
+            party.schema_name
+        ))
+        .execute(&party.store.pool)
+        .await;
     }
     Ok(())
 }
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index 1a28435eec..334b1747b5 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -397,7 +397,9 @@ async fn build_sync_state(
 
     tracing::info!("Database store length is: {}", db_len);
 
-    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool).await.ok();
+    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool)
+        .await
+        .ok();
 
     Ok(SyncState {
         db_len,

From b8985bf48d2bac13186ccf570008f69fb0336183 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Fri, 27 Feb 2026 12:54:07 +0100
Subject: [PATCH 04/76] fixes

---
 iris-mpc-store/src/rerand.rs                  | 64 +++++++++----------
 iris-mpc-upgrade/src/continuous_rerand.rs     | 56 ++++++++--------
 iris-mpc-upgrade/src/epoch.rs                 | 30 ++++++---
 .../tests/continuous_rerand_e2e.rs            | 43 ++++++++++---
 iris-mpc-upgrade/tests/test_utils.rs          | 29 +++++++--
 5 files changed, 135 insertions(+), 87 deletions(-)

diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index a52372d268..318358f2c0 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -217,16 +217,13 @@ pub async fn get_rerand_progress(
 /// Returns the highest chunk_id where all_confirmed = TRUE for a given epoch,
 /// or None if no chunks are confirmed.
 pub async fn get_max_confirmed_chunk(pool: &PgPool, epoch: i32) -> Result<Option<i32>> {
-    let row: Option<(i32,)> = sqlx::query_as(
+    let row: (Option<i32>,) = sqlx::query_as(
         "SELECT MAX(chunk_id) FROM rerand_progress WHERE epoch = $1 AND all_confirmed = TRUE",
     )
     .bind(epoch)
-    .fetch_optional(pool)
+    .fetch_one(pool)
     .await?;
-    match row {
-        Some((max,)) => Ok(Some(max)),
-        None => Ok(None),
-    }
+    Ok(row.0)
 }
 
 /// Returns the highest epoch that has any rerand_progress rows.
@@ -324,40 +321,39 @@ pub async fn rerand_catchup_and_lock(
     schema_name: &str,
     sync_result: &SyncResult,
 ) -> Result<Option<sqlx::pool::PoolConnection<sqlx::Postgres>>> {
-    let (epoch, chunk_id) = match compute_rerand_catchup_chunk(sync_result)? {
-        Some(v) => v,
-        None => return Ok(None),
-    };
-
-    let staging_schema = staging_schema_name(schema_name);
-    tracing::info!(
-        "Rerand catch-up: applying epoch {} chunk {}",
-        epoch,
-        chunk_id,
-    );
-
     let mut conn = pool.acquire().await?;
     sqlx::query("SELECT pg_advisory_lock($1)")
         .bind(RERAND_APPLY_LOCK)
         .execute(&mut *conn)
         .await?;
 
-    let rows = match apply_staging_chunk(pool, &staging_schema, epoch, chunk_id).await {
-        Ok(r) => r,
-        Err(e) => {
-            let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
-                .bind(RERAND_APPLY_LOCK)
-                .execute(&mut *conn)
-                .await;
-            return Err(e);
-        }
-    };
-    tracing::info!(
-        "Rerand catch-up: applied epoch {} chunk {} ({} rows)",
-        epoch,
-        chunk_id,
-        rows
-    );
+    if let Some((epoch, chunk_id)) = compute_rerand_catchup_chunk(sync_result)? {
+        let staging_schema = staging_schema_name(schema_name);
+        tracing::info!(
+            "Rerand catch-up: applying epoch {} chunk {}",
+            epoch,
+            chunk_id,
+        );
+
+        let rows = match apply_staging_chunk(pool, &staging_schema, epoch, chunk_id).await {
+            Ok(r) => r,
+            Err(e) => {
+                let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
+                    .bind(RERAND_APPLY_LOCK)
+                    .execute(&mut *conn)
+                    .await;
+                return Err(e);
+            }
+        };
+        tracing::info!(
+            "Rerand catch-up: applied epoch {} chunk {} ({} rows)",
+            epoch,
+            chunk_id,
+            rows
+        );
+    } else {
+        tracing::info!("Rerand catch-up: no chunk to apply");
+    }
 
     Ok(Some(conn))
 }
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index b6152948e8..12984e5577 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -2,7 +2,7 @@ use aws_sdk_s3::Client as S3Client;
 use aws_sdk_secretsmanager::Client as SecretsManagerClient;
 use bytemuck::cast_slice;
 use eyre::Result;
-use futures::TryStreamExt;
+use futures::StreamExt;
 use iris_mpc_store::rerand::{
     apply_staging_chunk, ensure_staging_schema, get_rerand_progress, insert_staging_irises,
     set_all_confirmed, set_staging_written, staging_schema_name, upsert_rerand_progress,
@@ -256,34 +256,36 @@ async fn process_chunk_staging(
 ) -> Result<()> {
     let (start, end) = manifest.chunk_range(chunk_id);
 
-    let entries: Vec<_> = store
-        .stream_irises_in_range(start..end)
-        .try_collect()
-        .await?;
+    const BATCH_SIZE: usize = 500;
 
-    let staging_entries: Vec<StagingIrisEntry> = entries
-        .into_iter()
-        .map(|iris| {
-            let version_id = iris.version_id();
-            let iris_id = iris.id();
-            let (_, lc, lm, rc, rm) = randomize_iris(iris, shared_secret, party_id as usize);
-            StagingIrisEntry {
-                epoch: epoch as i32,
-                id: iris_id,
-                chunk_id: chunk_id as i32,
-                left_code: cast_slice::<u16, u8>(&lc.coefs).to_vec(),
-                left_mask: cast_slice::<u16, u8>(&lm.coefs).to_vec(),
-                right_code: cast_slice::<u16, u8>(&rc.coefs).to_vec(),
-                right_mask: cast_slice::<u16, u8>(&rm.coefs).to_vec(),
-                original_version_id: version_id,
-                rerand_epoch: (epoch + 1) as i32,
-            }
-        })
-        .collect();
+    let mut stream = store.stream_irises_in_range(start..end);
+    let mut batch: Vec<StagingIrisEntry> = Vec::with_capacity(BATCH_SIZE);
+
+    while let Some(iris) = stream.next().await.transpose()? {
+        let version_id = iris.version_id();
+        let iris_id = iris.id();
+        let (_, lc, lm, rc, rm) = randomize_iris(iris, shared_secret, party_id as usize);
+
+        batch.push(StagingIrisEntry {
+            epoch: epoch as i32,
+            id: iris_id,
+            chunk_id: chunk_id as i32,
+            left_code: cast_slice::<u16, u8>(&lc.coefs).to_vec(),
+            left_mask: cast_slice::<u16, u8>(&lm.coefs).to_vec(),
+            right_code: cast_slice::<u16, u8>(&rc.coefs).to_vec(),
+            right_mask: cast_slice::<u16, u8>(&rm.coefs).to_vec(),
+            original_version_id: version_id,
+            rerand_epoch: (epoch + 1) as i32,
+        });
+
+        if batch.len() >= BATCH_SIZE {
+            insert_staging_irises(pool, staging_schema, &batch).await?;
+            batch.clear();
+        }
+    }
 
-    const BATCH_SIZE: usize = 500;
-    for batch in staging_entries.chunks(BATCH_SIZE) {
-        insert_staging_irises(pool, staging_schema, batch).await?;
+    if !batch.is_empty() {
+        insert_staging_irises(pool, staging_schema, &batch).await?;
     }
 
     Ok(())
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index 00bca5c1bd..daddb68e98 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -56,7 +56,7 @@ async fn save_private_key_to_sm(
     epoch: u32,
     party_id: u8,
     key: &tripartite_dh::PrivateKey,
-) -> Result<()> {
+) -> Result<bool> {
     let sid = secret_id(env, epoch, party_id);
     let b64 = STANDARD.encode(key.serialize());
 
@@ -67,17 +67,11 @@ async fn save_private_key_to_sm(
         .send()
         .await
     {
-        Ok(_) => Ok(()),
+        Ok(_) => Ok(true),
         Err(e) => {
             let svc = e.into_service_error();
             if svc.is_resource_exists_exception() {
-                sm.put_secret_value()
-                    .secret_id(&sid)
-                    .secret_string(&b64)
-                    .send()
-                    .await
-                    .map_err(|e| eyre!("SM PutSecretValue failed for {}: {}", sid, e))?;
-                Ok(())
+                Ok(false)
             } else {
                 Err(eyre!("SM CreateSecret failed for {}: {}", sid, svc))
             }
@@ -134,7 +128,23 @@ pub async fn idempotent_keygen(
     let mut rng = rand::rngs::OsRng;
     let private_key = tripartite_dh::PrivateKey::random(&mut rng);
 
-    save_private_key_to_sm(sm, env, epoch, party_id, &private_key).await?;
+    let saved = save_private_key_to_sm(sm, env, epoch, party_id, &private_key).await?;
+    let private_key = if saved {
+        private_key
+    } else {
+        tracing::warn!(
+            "Epoch {}: private key already exists in SM (likely concurrent start); reloading it",
+            epoch
+        );
+        load_private_key_from_sm(sm, env, epoch, party_id)
+            .await?
+            .ok_or_else(|| {
+                eyre!(
+                    "Secret existed but could not be loaded: {}",
+                    secret_id(env, epoch, party_id)
+                )
+            })?
+    };
 
     let public_key = private_key.public_key();
     let pk_b64 = STANDARD.encode(public_key.serialize());
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index b2a1082dec..5e0126a299 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -48,7 +48,7 @@ fn phase1_clean_epoch() {
         wait_epoch_done(&env.harness, 0).await?;
         stop_all(t, h).await;
 
-        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1, "Expected rerand_epoch >= 1, got {}", ep);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
         println!("[phase 1] PASSED (epoch={})", ep);
@@ -80,7 +80,7 @@ fn phase2_kill_and_resume() {
         wait_epoch_done(&env.harness, 0).await?;
         stop_all(t, h).await;
 
-        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
         println!("[phase 2] PASSED (epoch={})", ep);
@@ -108,10 +108,33 @@ fn phase3_concurrent_modifications() {
         // Bump version_id on a few rows (simulates a reauth)
         for &id in &modified_ids {
             for party in &env.harness.parties {
-                sqlx::query("UPDATE irises SET left_code = left_code WHERE id = $1")
-                    .bind(id)
-                    .execute(&party.store.pool)
-                    .await?;
+                let (before,): (i16,) =
+                    sqlx::query_as("SELECT version_id FROM irises WHERE id = $1")
+                        .bind(id)
+                        .fetch_one(&party.store.pool)
+                        .await?;
+
+                // Flip one byte so the `increment_version_id` trigger fires.
+                sqlx::query(
+                    r#"
+                    UPDATE irises
+                    SET left_code = set_byte(left_code, 0, get_byte(left_code, 0) # 1)
+                    WHERE id = $1
+                    "#,
+                )
+                .bind(id)
+                .execute(&party.store.pool)
+                .await?;
+
+                let (after,): (i16,) =
+                    sqlx::query_as("SELECT version_id FROM irises WHERE id = $1")
+                        .bind(id)
+                        .fetch_one(&party.store.pool)
+                        .await?;
+                eyre::ensure!(
+                    after > before,
+                    "Expected version_id to increase for id={id}"
+                );
             }
         }
         println!("[phase 3]   bumped version_id on {:?}", modified_ids);
@@ -119,7 +142,7 @@ fn phase3_concurrent_modifications() {
         wait_epoch_done(&env.harness, 0).await?;
         stop_all(t, h).await;
 
-        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        let ep = assert_consistent_rerand_epoch(&env.harness, &modified_ids).await?;
         assert!(ep >= 1);
         verify_fingerprints(&env.harness, &env.fingerprints, &modified_ids).await?;
         println!("[phase 3] PASSED (epoch={})", ep);
@@ -151,7 +174,7 @@ fn phase4_server_restart_during_rerand() {
         wait_epoch_done(&env.harness, 0).await?;
         stop_all(t, h).await;
 
-        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
         println!("[phase 4] PASSED (epoch={})", ep);
@@ -191,7 +214,7 @@ fn phase5_staggered_restart() {
         let _ = h0.await;
         stop_all(t, h).await;
 
-        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
         println!("[phase 5] PASSED (epoch={})", ep);
@@ -224,7 +247,7 @@ fn phase6_multiple_epochs() {
 
         stop_all(t, h).await;
 
-        let ep = assert_consistent_rerand_epoch(&env.harness).await?;
+        let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 2, "Expected rerand_epoch >= 2, got {}", ep);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
         println!("[phase 6] PASSED (epoch={})", ep);
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index 6a6280eafa..ca98990ba7 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -96,7 +96,7 @@ impl TestEnv {
             DB_SIZE, prefix, bucket
         );
         seed_three_party_db(&harness, DB_SIZE).await?;
-        let fingerprints = snapshot_all_fingerprints(&harness).await?;
+        let fingerprints = snapshot_all_fingerprints(&harness, &[]).await?;
 
         Ok(Self {
             harness,
@@ -262,14 +262,21 @@ pub async fn seed_three_party_db(harness: &TestHarness, count: usize) -> Result<
 pub type PlaintextFingerprints = HashMap<i64, [u8; 32]>;
 
 /// Compute a fingerprint for every iris in the DB by reconstructing shares
-/// from all 3 parties.
-pub async fn snapshot_all_fingerprints(harness: &TestHarness) -> Result<PlaintextFingerprints> {
+/// from all 3 parties. IDs in `skip_ids` are excluded (their shares may be
+/// inconsistent across parties due to concurrent modifications).
+pub async fn snapshot_all_fingerprints(
+    harness: &TestHarness,
+    skip_ids: &[i64],
+) -> Result<PlaintextFingerprints> {
     let ids: Vec<(i64,)> = sqlx::query_as("SELECT id FROM irises ORDER BY id")
         .fetch_all(&harness.store(0).pool)
         .await?;
 
     let mut fps = PlaintextFingerprints::new();
     for (id,) in ids {
+        if skip_ids.contains(&id) {
+            continue;
+        }
         let mut shares = Vec::new();
         for party in 0..NUM_PARTIES {
             shares.push(harness.store(party).get_iris_data_by_id(id).await?);
@@ -313,7 +320,7 @@ pub async fn verify_fingerprints(
     expected: &PlaintextFingerprints,
     skip_ids: &[i64],
 ) -> Result<()> {
-    let current = snapshot_all_fingerprints(harness).await?;
+    let current = snapshot_all_fingerprints(harness, skip_ids).await?;
     let mut checked = 0;
     for (id, exp) in expected {
         if skip_ids.contains(id) {
@@ -442,7 +449,10 @@ async fn build_test_sync_result(harness: &TestHarness, party: usize) -> Result<S
     })
 }
 
-pub async fn assert_consistent_rerand_epoch(harness: &TestHarness) -> Result<i32> {
+pub async fn assert_consistent_rerand_epoch(
+    harness: &TestHarness,
+    skip_ids: &[i64],
+) -> Result<i32> {
     let mut all: Vec<Vec<(i64, i32)>> = Vec::new();
     for party in &harness.parties {
         all.push(
@@ -454,6 +464,9 @@ pub async fn assert_consistent_rerand_epoch(harness: &TestHarness) -> Result<i32
     assert_eq!(all[0].len(), all[1].len());
     assert_eq!(all[1].len(), all[2].len());
     for i in 0..all[0].len() {
+        if skip_ids.contains(&all[0][i].0) {
+            continue;
+        }
         assert_eq!(
             all[0][i].1, all[1][i].1,
             "epoch mismatch id {} p0 vs p1",
@@ -465,7 +478,11 @@ pub async fn assert_consistent_rerand_epoch(harness: &TestHarness) -> Result<i32
             all[0][i].0
         );
     }
-    Ok(all[0].first().map(|(_, e)| *e).unwrap_or(0))
+    Ok(all[0]
+        .iter()
+        .find(|(id, _)| !skip_ids.contains(id))
+        .map(|(_, e)| *e)
+        .unwrap_or(0))
 }
 
 async fn cleanup(harness: &TestHarness) -> Result<()> {

From 8a017c9d4d889d2056fb3538b00c810e9017e002 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Fri, 27 Feb 2026 13:06:17 +0100
Subject: [PATCH 05/76] migrations for schema

---
 iris-mpc-store/src/rerand.rs                  | 26 -------------------
 iris-mpc-upgrade/src/continuous_rerand.rs     |  9 +++----
 iris-mpc-upgrade/tests/test_utils.rs          |  7 +----
 ...60226000003_create_rerand_staging.down.sql |  7 +++++
 ...0260226000003_create_rerand_staging.up.sql | 19 ++++++++++++++
 5 files changed, 30 insertions(+), 38 deletions(-)
 create mode 100644 migrations/20260226000003_create_rerand_staging.down.sql
 create mode 100644 migrations/20260226000003_create_rerand_staging.up.sql

diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 318358f2c0..3c5dedd334 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -42,32 +42,6 @@ fn validate_identifier(name: &str) -> Result<()> {
     Ok(())
 }
 
-pub async fn ensure_staging_schema(pool: &PgPool, staging_schema: &str) -> Result<()> {
-    validate_identifier(staging_schema)?;
-    let create_schema = format!(r#"CREATE SCHEMA IF NOT EXISTS "{}""#, staging_schema);
-    sqlx::query(&create_schema).execute(pool).await?;
-
-    let create_table = format!(
-        r#"
-        CREATE TABLE IF NOT EXISTS "{}".irises (
-            epoch               INTEGER NOT NULL,
-            id                  BIGINT NOT NULL,
-            chunk_id            INTEGER NOT NULL,
-            left_code           BYTEA,
-            left_mask           BYTEA,
-            right_code          BYTEA,
-            right_mask          BYTEA,
-            original_version_id SMALLINT,
-            rerand_epoch        INTEGER,
-            PRIMARY KEY (epoch, id)
-        )
-        "#,
-        staging_schema,
-    );
-    sqlx::query(&create_table).execute(pool).await?;
-    Ok(())
-}
-
 pub async fn insert_staging_irises(
     pool: &PgPool,
     staging_schema: &str,
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 12984e5577..74cf433eea 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -4,9 +4,9 @@ use bytemuck::cast_slice;
 use eyre::Result;
 use futures::StreamExt;
 use iris_mpc_store::rerand::{
-    apply_staging_chunk, ensure_staging_schema, get_rerand_progress, insert_staging_irises,
-    set_all_confirmed, set_staging_written, staging_schema_name, upsert_rerand_progress,
-    StagingIrisEntry, RERAND_APPLY_LOCK,
+    apply_staging_chunk, get_rerand_progress, insert_staging_irises, set_all_confirmed,
+    set_staging_written, staging_schema_name, upsert_rerand_progress, StagingIrisEntry,
+    RERAND_APPLY_LOCK,
 };
 use iris_mpc_store::Store;
 use sqlx::PgPool;
@@ -36,9 +36,6 @@ pub async fn run_continuous_rerand(
     let poll_interval = Duration::from_millis(config.s3_poll_interval_ms);
     let chunk_delay = Duration::from_secs(config.chunk_delay_secs);
 
-    ensure_staging_schema(pool, &staging_schema).await?;
-    tracing::info!("Staging schema ensured: {}", staging_schema);
-
     loop {
         if is_cancelled(cancel) {
             return Ok(());
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index ca98990ba7..b344e0f965 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -8,7 +8,7 @@ use iris_mpc_common::{
     iris_db::iris::IrisCode,
     postgres::{AccessMode, PostgresClient},
 };
-use iris_mpc_store::rerand::{self as rerand_store};
+use iris_mpc_store::rerand as rerand_store;
 use iris_mpc_store::{Store, StoredIrisRef};
 use iris_mpc_upgrade::config::RerandomizeContinuousConfig;
 use iris_mpc_upgrade::continuous_rerand::run_continuous_rerand;
@@ -44,11 +44,6 @@ impl TestHarness {
             let schema = format!("{}_{}", schema_prefix, i);
             let pg = PostgresClient::new(url, &schema, AccessMode::ReadWrite).await?;
             let store = Store::new(&pg).await?;
-            rerand_store::ensure_staging_schema(
-                &store.pool,
-                &rerand_store::staging_schema_name(&schema),
-            )
-            .await?;
             parties.push(PartyDb {
                 store,
                 schema_name: schema,
diff --git a/migrations/20260226000003_create_rerand_staging.down.sql b/migrations/20260226000003_create_rerand_staging.down.sql
new file mode 100644
index 0000000000..f5df9a0640
--- /dev/null
+++ b/migrations/20260226000003_create_rerand_staging.down.sql
@@ -0,0 +1,7 @@
+DO $$
+DECLARE
+    staging_schema TEXT;
+BEGIN
+    staging_schema := current_schema() || '_rerand_staging';
+    EXECUTE format('DROP SCHEMA IF EXISTS %I CASCADE', staging_schema);
+END $$;
diff --git a/migrations/20260226000003_create_rerand_staging.up.sql b/migrations/20260226000003_create_rerand_staging.up.sql
new file mode 100644
index 0000000000..c036da5154
--- /dev/null
+++ b/migrations/20260226000003_create_rerand_staging.up.sql
@@ -0,0 +1,19 @@
+DO $$
+DECLARE
+    staging_schema TEXT;
+BEGIN
+    staging_schema := current_schema() || '_rerand_staging';
+    EXECUTE format('CREATE SCHEMA IF NOT EXISTS %I', staging_schema);
+    EXECUTE format('CREATE TABLE IF NOT EXISTS %I.irises (
+        epoch               INTEGER NOT NULL,
+        id                  BIGINT NOT NULL,
+        chunk_id            INTEGER NOT NULL,
+        left_code           BYTEA,
+        left_mask           BYTEA,
+        right_code          BYTEA,
+        right_mask          BYTEA,
+        original_version_id SMALLINT,
+        rerand_epoch        INTEGER,
+        PRIMARY KEY (epoch, id)
+    )', staging_schema);
+END $$;

From ed710adb33613fa546f5f10e34382f53ff261a2d Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Fri, 27 Feb 2026 13:39:38 +0100
Subject: [PATCH 06/76] fix chunk sync

---
 docs/specs/rerandomization.md             |  36 ++++---
 iris-mpc-store/src/rerand.rs              | 115 +++++++++++++++++-----
 iris-mpc-upgrade/src/continuous_rerand.rs |  19 +---
 3 files changed, 112 insertions(+), 58 deletions(-)

diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index 2ab365614a..6ad5ae7bd2 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -95,7 +95,7 @@ When `rerand_epoch` changes (rerandomization), share data changes but `version_i
 
 ### Staging schema
 
-Each party has a staging schema (e.g. `SMPC_rerand_staging`) with:
+Each party has a staging schema (`{live_schema}_rerand_staging`), created automatically by a migration that derives the name from `current_schema()`:
 
 ```sql
 CREATE TABLE irises (
@@ -147,10 +147,10 @@ Runs continuously:
 8. Upload S3 marker after staging commit: `s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged`
 9. Poll S3 until all 3 party markers exist for chunk K
 10. Set `all_confirmed = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
-11. Acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection, then copy from staging to live DB, delete staging, and mark applied — all in one transaction (scoped to epoch and chunk):
+11. Copy from staging to live DB, delete staging, and mark applied — all in one transaction that holds `pg_advisory_xact_lock(RERAND_APPLY_LOCK)` (automatically released on commit, rollback, or connection drop):
     ```sql
-    SELECT pg_advisory_lock(RERAND_APPLY_LOCK);   -- on dedicated connection
     BEGIN;
+    SELECT pg_advisory_xact_lock(RERAND_APPLY_LOCK);
     UPDATE irises SET
       left_code = staging.left_code,
       left_mask = staging.left_mask,
@@ -164,8 +164,7 @@ Runs continuously:
       AND irises.version_id = staging.original_version_id;
     DELETE FROM staging_schema.irises WHERE epoch = E AND chunk_id = K;
     UPDATE rerand_progress SET live_applied = TRUE WHERE epoch = E AND chunk_id = K;
-    COMMIT;
-    SELECT pg_advisory_unlock(RERAND_APPLY_LOCK);  -- release after commit
+    COMMIT;  -- xact lock released here
     ```
 12. Proceed to next chunk (or start epoch transition if all chunks done)
 
@@ -181,7 +180,10 @@ At startup, before `load_iris_db`:
    - **Why at most 1 chunk**: the rerand loop has a strict per-chunk synchronization barrier — a node cannot stage chunk K+1 until all three parties have confirmed chunk K via S3 markers. Therefore it is impossible for any peer to be more than 1 confirmed chunk ahead. The implementation enforces this with a fatal bail if the gap exceeds 1 (indicates DB corruption).
    - **Why `max` across peers**: `all_confirmed = TRUE` at any party means that party observed all three S3 `staged` markers, which means all three parties successfully committed the chunk to their staging schemas. A slower party may not have polled S3 yet, but its staging data is already there.
    - Edge case: if all parties report the same `max_confirmed_chunk`, there is nothing to catch up and the step is skipped.
-3. **New (DB-only catch-up)**: acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection. If step 2 identified a chunk to apply, run the same apply transaction as Step 1.11. **Keep the lock held** through step 4.
+3. **New (DB-only catch-up)**: two phases, in order:
+   - **Phase A — apply locally pending chunks**: query `rerand_progress` for any chunks where `all_confirmed = TRUE AND live_applied = FALSE`. Apply each one via the same apply transaction as Step 1.11 (each acquires its own `pg_advisory_xact_lock`). This handles the crash window where the rerand server set `all_confirmed` but crashed before applying to live — without this, the node would advertise itself as caught up (watermark based on `all_confirmed`) while its live DB is stale.
+   - **Phase B — apply peer-ahead chunk**: if step 2 identified a chunk where a peer is strictly ahead, apply it (skipped if already handled in Phase A).
+   - **Phase C — hold session lock**: acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection. **Keep the lock held** through step 4. This prevents the rerand loop from applying new chunks while the DB snapshot is being read.
 4. **Existing**: `load_iris_db` — loads from live DB into GPU memory. The advisory lock is still held, so the rerand server cannot apply new chunks while the DB is being read into memory.
 5. Release the advisory lock: `SELECT pg_advisory_unlock(RERAND_APPLY_LOCK)` on the dedicated connection, then drop the connection.
 
@@ -196,23 +198,29 @@ If either check fails, the main server refuses to start. This catches DB corrupt
 
 ### Advisory lock: startup vs rerand server concurrency
 
-Both the rerand server (Step 1.11) and the main server startup (Steps 2.3–2.4) acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` before applying chunks. This ensures:
+Two lock scopes are used, both keyed on `RERAND_APPLY_LOCK`:
 
-- Only one process applies chunks at a time (no interleaving).
-- The main server holds the lock from catch-up through `load_iris_db`, so the rerand server cannot sneak in applies between catch-up and memory load.
-- If either process crashes, the connection drops and Postgres automatically releases the session-level lock. No stale locks.
+- **Transaction-level (`pg_advisory_xact_lock`)** — acquired inside `apply_staging_chunk`'s transaction. Automatically released on commit, rollback, or connection drop (including task abort). Used by both the rerand server loop (Step 1.11) and startup catch-up (Step 2.3 Phases A/B).
+- **Session-level (`pg_advisory_lock`)** — acquired on a dedicated connection during startup (Step 2.3 Phase C) and held through `load_iris_db` (Step 2.4). Prevents the rerand loop from applying new chunks while the DB snapshot is being read. Released explicitly after load.
 
-**Implementation with connection pools (sqlx)**: session-level advisory locks are tied to a specific Postgres connection. When using a connection pool, acquire a **dedicated connection** (`pool.acquire()`) and hold it (do not drop/return it) for the entire lock window. The catch-up queries and `load_iris_db` can use the pool normally — the dedicated connection just sits idle holding the lock. Release with `pg_advisory_unlock(...)` on the same connection after `load_iris_db` completes, then drop the connection.
+These two scopes serialize correctly: `pg_advisory_xact_lock` and `pg_advisory_lock` on the same key conflict across sessions, so the rerand loop's apply transaction blocks while the main server holds the session lock, and vice versa.
+
+**Why `pg_advisory_xact_lock` for applies**: session-level locks are tied to a connection. If a process is killed while holding a session-level lock on a pooled connection, the connection may be returned to the pool with the lock still held, blocking future acquirers indefinitely. Transaction-level locks avoid this: when the connection is dropped, the transaction rolls back and the lock is released automatically.
+
+**Implementation with connection pools (sqlx)**:
 
 ```rust
+// Catch-up applies (each self-protected by xact lock inside apply_staging_chunk):
+for (epoch, chunk_id) in pending_chunks {
+    apply_staging_chunk(&pool, epoch, chunk_id).await?;
+}
+
+// Session lock for load_iris_db window:
 let mut lock_conn = pool.acquire().await?;
 sqlx::query("SELECT pg_advisory_lock($1)")
     .bind(RERAND_APPLY_LOCK)
     .execute(&mut *lock_conn).await?;
 
-if let Some((epoch, chunk_id)) = catchup_chunk {
-    apply_staging_chunk(&pool, epoch, chunk_id).await?;
-}
 load_iris_db(&pool).await?;
 
 sqlx::query("SELECT pg_advisory_unlock($1)")
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 3c5dedd334..ba1dda8828 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -79,9 +79,11 @@ pub async fn insert_staging_irises(
 /// Apply a confirmed staging chunk to the live DB.
 ///
 /// Within a single transaction:
-///   1. UPDATE live irises from staging (optimistic lock on version_id)
-///   2. DELETE staging rows for this chunk
-///   3. Mark live_applied in rerand_progress
+///   1. Acquire `pg_advisory_xact_lock(RERAND_APPLY_LOCK)` (released
+///      automatically on commit/rollback/connection-drop).
+///   2. UPDATE live irises from staging (optimistic lock on version_id)
+///   3. DELETE staging rows for this chunk
+///   4. Mark live_applied in rerand_progress
 pub async fn apply_staging_chunk(
     pool: &PgPool,
     staging_schema: &str,
@@ -91,6 +93,11 @@ pub async fn apply_staging_chunk(
     validate_identifier(staging_schema)?;
     let mut tx = pool.begin().await?;
 
+    sqlx::query("SELECT pg_advisory_xact_lock($1)")
+        .bind(RERAND_APPLY_LOCK)
+        .execute(&mut *tx)
+        .await?;
+
     let update_sql = format!(
         r#"
         UPDATE irises SET
@@ -200,6 +207,20 @@ pub async fn get_max_confirmed_chunk(pool: &PgPool, epoch: i32) -> Result<Option
     Ok(row.0)
 }
 
+/// Returns chunks that are confirmed but not yet applied to the live DB.
+/// In normal operation there is at most 1 such chunk (the crash window
+/// between `set_all_confirmed` and `apply_staging_chunk`).
+pub async fn get_confirmed_unapplied_chunks(pool: &PgPool) -> Result<Vec<(i32, i32)>> {
+    let rows: Vec<(i32, i32)> = sqlx::query_as(
+        "SELECT epoch, chunk_id FROM rerand_progress \
+         WHERE all_confirmed = TRUE AND live_applied = FALSE \
+         ORDER BY epoch, chunk_id",
+    )
+    .fetch_all(pool)
+    .await?;
+    Ok(rows)
+}
+
 /// Returns the highest epoch that has any rerand_progress rows.
 pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
     let row: (Option<i32>,) = sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
@@ -285,51 +306,91 @@ pub fn compute_rerand_catchup_chunk(sync_result: &SyncResult) -> Result<Option<(
 
 /// Perform rerand catch-up and acquire the advisory lock.
 ///
-/// 1. Determines whether this node is 1 chunk behind a peer.
-/// 2. If so, acquires `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated
-///    connection and applies the single missing chunk.
-/// 3. Returns the lock-holding connection (caller keeps it alive through
-///    `load_iris_db`, then calls [`release_rerand_lock`]).
+/// 1. Applies any locally confirmed-but-unapplied chunks (covers the crash
+///    window between `set_all_confirmed` and `apply_staging_chunk`).
+///    Each apply is self-protected by `pg_advisory_xact_lock` inside its
+///    transaction.
+/// 2. If a peer advertises a strictly higher watermark, applies that one
+///    additional chunk (same xact-lock protection).
+/// 3. Acquires a session-level `pg_advisory_lock(RERAND_APPLY_LOCK)` that
+///    the caller holds through `load_iris_db`, preventing the rerand loop
+///    from applying new chunks while the DB snapshot is being read.
+/// 4. Returns the lock-holding connection (caller calls
+///    [`release_rerand_lock`] when done).
 pub async fn rerand_catchup_and_lock(
     pool: &PgPool,
     schema_name: &str,
     sync_result: &SyncResult,
 ) -> Result<Option<sqlx::pool::PoolConnection<sqlx::Postgres>>> {
+    let staging_schema = staging_schema_name(schema_name);
+
+    // Steps 1+2: apply pending chunks. Each `apply_staging_chunk` acquires
+    // pg_advisory_xact_lock inside its own transaction; we must not hold
+    // the session-level lock yet (it would deadlock across connections).
+    rerand_catchup_inner(pool, &staging_schema, sync_result).await?;
+
+    // Step 3: hold the session-level lock through load_iris_db.
     let mut conn = pool.acquire().await?;
     sqlx::query("SELECT pg_advisory_lock($1)")
         .bind(RERAND_APPLY_LOCK)
         .execute(&mut *conn)
         .await?;
 
-    if let Some((epoch, chunk_id)) = compute_rerand_catchup_chunk(sync_result)? {
-        let staging_schema = staging_schema_name(schema_name);
+    Ok(Some(conn))
+}
+
+async fn rerand_catchup_inner(
+    pool: &PgPool,
+    staging_schema: &str,
+    sync_result: &SyncResult,
+) -> Result<()> {
+    // If the rerand tables haven't been migrated yet, the sync state will
+    // be None and there is nothing to catch up.  Skip unconditionally so
+    // a rolling deploy doesn't crash before migrations run.
+    if sync_result.my_state.rerand_state.is_none() {
+        return Ok(());
+    }
+
+    // Step 1: apply any locally confirmed-but-unapplied chunks.
+    // This closes the crash window where all_confirmed was persisted but
+    // apply_staging_chunk had not yet run.
+    let pending = get_confirmed_unapplied_chunks(pool).await?;
+    for (epoch, chunk_id) in &pending {
         tracing::info!(
-            "Rerand catch-up: applying epoch {} chunk {}",
+            "Rerand catch-up: applying locally pending epoch {} chunk {}",
             epoch,
             chunk_id,
         );
-
-        let rows = match apply_staging_chunk(pool, &staging_schema, epoch, chunk_id).await {
-            Ok(r) => r,
-            Err(e) => {
-                let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
-                    .bind(RERAND_APPLY_LOCK)
-                    .execute(&mut *conn)
-                    .await;
-                return Err(e);
-            }
-        };
+        let rows = apply_staging_chunk(pool, staging_schema, *epoch, *chunk_id).await?;
         tracing::info!(
-            "Rerand catch-up: applied epoch {} chunk {} ({} rows)",
+            "Rerand catch-up: applied locally pending epoch {} chunk {} ({} rows)",
             epoch,
             chunk_id,
-            rows
+            rows,
         );
-    } else {
-        tracing::info!("Rerand catch-up: no chunk to apply");
     }
 
-    Ok(Some(conn))
+    // Step 2: if a peer is one chunk ahead, apply that chunk too.
+    if let Some((epoch, chunk_id)) = compute_rerand_catchup_chunk(sync_result)? {
+        if !pending.contains(&(epoch, chunk_id)) {
+            tracing::info!(
+                "Rerand catch-up: applying peer-ahead epoch {} chunk {}",
+                epoch,
+                chunk_id,
+            );
+            let rows = apply_staging_chunk(pool, staging_schema, epoch, chunk_id).await?;
+            tracing::info!(
+                "Rerand catch-up: applied peer-ahead epoch {} chunk {} ({} rows)",
+                epoch,
+                chunk_id,
+                rows,
+            );
+        }
+    } else if pending.is_empty() {
+        tracing::info!("Rerand catch-up: no chunks to apply");
+    }
+
+    Ok(())
 }
 
 /// Release the advisory lock acquired by [`rerand_catchup_and_lock`].
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 74cf433eea..d8356cfe35 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -6,7 +6,6 @@ use futures::StreamExt;
 use iris_mpc_store::rerand::{
     apply_staging_chunk, get_rerand_progress, insert_staging_irises, set_all_confirmed,
     set_staging_written, staging_schema_name, upsert_rerand_progress, StagingIrisEntry,
-    RERAND_APPLY_LOCK,
 };
 use iris_mpc_store::Store;
 use sqlx::PgPool;
@@ -139,23 +138,9 @@ pub async fn run_continuous_rerand(
                 return Ok(());
             }
 
-            let mut lock_conn = pool.acquire().await?;
-            sqlx::query("SELECT pg_advisory_lock($1)")
-                .bind(RERAND_APPLY_LOCK)
-                .execute(&mut *lock_conn)
-                .await?;
-
-            let apply_res =
+            let rows =
                 apply_staging_chunk(pool, &staging_schema, active_epoch as i32, chunk_id as i32)
-                    .await;
-
-            sqlx::query("SELECT pg_advisory_unlock($1)")
-                .bind(RERAND_APPLY_LOCK)
-                .execute(&mut *lock_conn)
-                .await?;
-            drop(lock_conn);
-
-            let rows = apply_res?;
+                    .await?;
             tracing::info!(
                 "Epoch {} chunk {}: applied to live DB ({} rows updated)",
                 active_epoch,

From b608592e89e7dfe2480dfc5dcbe398471f201bce Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Fri, 27 Feb 2026 14:11:55 +0100
Subject: [PATCH 07/76] more fixes

---
 iris-mpc-store/src/rerand.rs                  |  8 ++++-
 iris-mpc-upgrade/src/continuous_rerand.rs     | 16 +++++++---
 iris-mpc-upgrade/src/epoch.rs                 | 32 +++++++++++++++----
 iris-mpc-upgrade/src/s3_coordination.rs       | 20 +++++++++++-
 ...0260226000003_create_rerand_staging.up.sql |  4 +++
 5 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index ba1dda8828..0eb35cc4a8 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -371,8 +371,14 @@ async fn rerand_catchup_inner(
     }
 
     // Step 2: if a peer is one chunk ahead, apply that chunk too.
+    // Skip if already handled in step 1, or if no progress row exists
+    // (ghost chunk at epoch boundary — the chunk doesn't actually exist).
     if let Some((epoch, chunk_id)) = compute_rerand_catchup_chunk(sync_result)? {
-        if !pending.contains(&(epoch, chunk_id)) {
+        let dominated_by_step1 = pending.contains(&(epoch, chunk_id));
+        let has_progress = get_rerand_progress(pool, epoch, chunk_id)
+            .await?
+            .is_some();
+        if !dominated_by_step1 && has_progress {
             tracing::info!(
                 "Rerand catch-up: applying peer-ahead epoch {} chunk {}",
                 epoch,
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index d8356cfe35..4ddde00ad5 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -4,8 +4,9 @@ use bytemuck::cast_slice;
 use eyre::Result;
 use futures::StreamExt;
 use iris_mpc_store::rerand::{
-    apply_staging_chunk, get_rerand_progress, insert_staging_irises, set_all_confirmed,
-    set_staging_written, staging_schema_name, upsert_rerand_progress, StagingIrisEntry,
+    apply_staging_chunk, get_current_epoch, get_rerand_progress, insert_staging_irises,
+    set_all_confirmed, set_staging_written, staging_schema_name, upsert_rerand_progress,
+    StagingIrisEntry,
 };
 use iris_mpc_store::Store;
 use sqlx::PgPool;
@@ -40,7 +41,9 @@ pub async fn run_continuous_rerand(
             return Ok(());
         }
 
-        let active_epoch = epoch::determine_active_epoch(s3, &config.s3_bucket).await?;
+        let epoch_hint = get_current_epoch(pool).await?.map(|e| e as u32);
+        let active_epoch =
+            epoch::determine_active_epoch(s3, &config.s3_bucket, epoch_hint).await?;
         tracing::info!("Active epoch: {}", active_epoch);
 
         let shared_secret = epoch::derive_shared_secret(
@@ -96,7 +99,12 @@ pub async fn run_continuous_rerand(
                 .await?;
 
                 set_staging_written(pool, active_epoch as i32, chunk_id as i32).await?;
+            }
 
+            // Always (re-)upload the S3 staged marker. This is idempotent
+            // and covers the crash window between set_staging_written and
+            // the previous upload attempt.
+            if !progress.as_ref().is_some_and(|p| p.all_confirmed) {
                 s3_coordination::upload_chunk_staged(
                     s3,
                     &config.s3_bucket,
@@ -106,7 +114,7 @@ pub async fn run_continuous_rerand(
                 )
                 .await?;
                 tracing::info!(
-                    "Epoch {} chunk {}: staging written, S3 marker uploaded",
+                    "Epoch {} chunk {}: S3 staged marker uploaded",
                     active_epoch,
                     chunk_id
                 );
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index daddb68e98..222ea13031 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -98,9 +98,11 @@ async fn delete_private_key_from_sm(
 
 /// Idempotent key generation for an epoch.
 ///
-/// 1. Check SM for existing private key
-/// 2. If found: load it, derive public key, re-upload to S3 (covers crash between SM write and S3 upload)
-/// 3. If not found: generate new keypair, write to SM first, then upload public key to S3
+/// 1. Best-effort cleanup of previous epoch's key (covers crash between
+///    `poll_epoch_complete_all` and `delete_private_key_from_sm`)
+/// 2. Check SM for existing private key
+/// 3. If found: load it, derive public key, re-upload to S3 (covers crash between SM write and S3 upload)
+/// 4. If not found: generate new keypair, write to SM first, then upload public key to S3
 pub async fn idempotent_keygen(
     sm: &SecretsManagerClient,
     s3: &S3Client,
@@ -109,6 +111,16 @@ pub async fn idempotent_keygen(
     epoch: u32,
     party_id: u8,
 ) -> Result<tripartite_dh::PrivateKey> {
+    if epoch > 0 {
+        if let Err(e) = delete_private_key_from_sm(sm, env, epoch - 1, party_id).await {
+            tracing::debug!(
+                "Cleanup of epoch {} key (best-effort): {}",
+                epoch - 1,
+                e
+            );
+        }
+    }
+
     if let Some(existing) = load_private_key_from_sm(sm, env, epoch, party_id).await? {
         tracing::info!(
             "Epoch {}: private key found in SM, re-uploading public key to S3",
@@ -203,10 +215,16 @@ pub async fn derive_shared_secret(
 }
 
 /// Determine the active epoch by scanning S3 for the highest epoch with a
-/// manifest but without all three `complete` markers. Falls back to 0 if
-/// no epochs exist.
-pub async fn determine_active_epoch(s3: &S3Client, bucket: &str) -> Result<u32> {
-    let mut epoch: u32 = 0;
+/// manifest but without all three `complete` markers.
+///
+/// `start_hint` allows callers to skip already-completed epochs (e.g. from
+/// `get_current_epoch`). Falls back to 0 if no hint is available.
+pub async fn determine_active_epoch(
+    s3: &S3Client,
+    bucket: &str,
+    start_hint: Option<u32>,
+) -> Result<u32> {
+    let mut epoch: u32 = start_hint.unwrap_or(0);
     loop {
         if !s3_coordination::manifest_exists(s3, bucket, epoch).await? {
             break;
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
index b391a654d7..d6313b278c 100644
--- a/iris-mpc-upgrade/src/s3_coordination.rs
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -2,9 +2,10 @@ use aws_sdk_s3::Client as S3Client;
 use eyre::{eyre, Result};
 use serde::{Deserialize, Serialize};
 use std::time::Duration;
-use tokio::time::sleep;
+use tokio::time::{sleep, Instant};
 
 const NUM_PARTIES: u8 = 3;
+const DEFAULT_POLL_TIMEOUT: Duration = Duration::from_secs(30 * 60);
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Manifest {
@@ -86,10 +87,18 @@ pub async fn poll_until_marker_exists(
     key: &str,
     poll_interval: Duration,
 ) -> Result<()> {
+    let deadline = Instant::now() + DEFAULT_POLL_TIMEOUT;
     loop {
         if marker_exists(s3, bucket, key).await? {
             return Ok(());
         }
+        if Instant::now() > deadline {
+            eyre::bail!(
+                "Timeout after {:?} waiting for S3 marker: {}",
+                DEFAULT_POLL_TIMEOUT,
+                key
+            );
+        }
         tracing::debug!("Waiting for S3 marker: {}", key);
         sleep(poll_interval).await;
     }
@@ -103,6 +112,7 @@ pub async fn poll_until_all_parties_marker(
     marker_suffix: &str,
     poll_interval: Duration,
 ) -> Result<()> {
+    let deadline = Instant::now() + DEFAULT_POLL_TIMEOUT;
     loop {
         let mut all_present = true;
         for party in 0..NUM_PARTIES {
@@ -115,6 +125,14 @@ pub async fn poll_until_all_parties_marker(
         if all_present {
             return Ok(());
         }
+        if Instant::now() > deadline {
+            eyre::bail!(
+                "Timeout after {:?} waiting for all parties' {} markers for epoch {}",
+                DEFAULT_POLL_TIMEOUT,
+                marker_suffix,
+                epoch
+            );
+        }
         tracing::debug!(
             "Waiting for all parties' {} markers for epoch {}",
             marker_suffix,
diff --git a/migrations/20260226000003_create_rerand_staging.up.sql b/migrations/20260226000003_create_rerand_staging.up.sql
index c036da5154..9f944816a6 100644
--- a/migrations/20260226000003_create_rerand_staging.up.sql
+++ b/migrations/20260226000003_create_rerand_staging.up.sql
@@ -16,4 +16,8 @@ BEGIN
         rerand_epoch        INTEGER,
         PRIMARY KEY (epoch, id)
     )', staging_schema);
+    EXECUTE format(
+        'CREATE INDEX IF NOT EXISTS idx_staging_irises_epoch_chunk ON %I.irises (epoch, chunk_id)',
+        staging_schema
+    );
 END $$;

From d7cfd399a4c5cbad4b83f2aff6fc34262fd2db61 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Fri, 27 Feb 2026 22:14:05 +0100
Subject: [PATCH 08/76] modification fence

---
 docs/specs/rerandomization.md                 | 119 ++++++++-------
 iris-mpc-bins/bin/iris-mpc/server.rs          |   4 +-
 iris-mpc-store/src/rerand.rs                  | 138 ++++++++++++++++--
 iris-mpc-upgrade/src/continuous_rerand.rs     |  92 ++++++++++--
 iris-mpc-upgrade/src/epoch.rs                 |   6 +-
 iris-mpc-upgrade/src/s3_coordination.rs       | 120 +++++++++++++++
 .../tests/continuous_rerand_e2e.rs            |  80 +++++++++-
 iris-mpc-upgrade/tests/test_utils.rs          |  85 ++++++++++-
 iris-mpc/src/server/mod.rs                    |   4 +-
 .../services/processors/modifications_sync.rs |   7 +
 10 files changed, 548 insertions(+), 107 deletions(-)

diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index 6ad5ae7bd2..398ac47009 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -9,7 +9,7 @@ Key design decision: in-memory shares are less likely to be exfiltrated, so only
 ## Architecture
 
 1. **Rerand Server** (modified `iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs`, separate process, one per party) — rerandomizes shares, writes to staging, coordinates with peers via S3 markers, copies confirmed chunks to live DB. Replaces the existing one-off `RerandomizeDb` subcommand with a new `RerandomizeContinuous` subcommand. Core rerandomization logic in `iris-mpc-upgrade/src/rerandomization.rs` is reused; the new subcommand adds the continuous loop, S3 coordination, and staging management.
-2. **Main Server** (existing, minimal changes) — at startup, syncs rerand progress with peers and catches up any missing chunks from staging before loading the DB into memory.
+2. **Main Server** (existing, minimal changes) — at startup, syncs rerand progress with peers and catches up any missing chunks from staging before loading the DB into memory. Acquires `RERAND_MODIFY_LOCK` during modification writes to serialize with rerand applies.
 
 The GPU actor, batch processing, and result processor are completely untouched.
 
@@ -29,9 +29,10 @@ Only the rerand server needs access to the key. The main server never touches it
 
 When starting an epoch, the rerand server:
 
-1. Checks if an epoch-scoped private key already exists in Secrets Manager at `{env}/iris-mpc-db-rerandomization/epoch-{E}/private-key-party-{P}`
-2. If yes: loads it, derives the public key, and uploads the public key to S3 if not already present (covers crash-after-SM-write-before-S3-upload)
-3. If no: generates a new keypair, saves the private key to Secrets Manager first, then uploads the public key to S3
+1. Best-effort cleanup: attempts to delete the previous epoch's key from Secrets Manager (covers crash during epoch transition where deletion was skipped)
+2. Checks if an epoch-scoped private key already exists in Secrets Manager at `{env}/iris-mpc-db-rerandomization/epoch-{E}/private-key-party-{P}`
+3. If yes: loads it, derives the public key, and uploads the public key to S3 if not already present (covers crash-after-SM-write-before-S3-upload)
+4. If no: generates a new keypair, saves the private key to Secrets Manager first, then uploads the public key to S3
 
 Secrets Manager is checked first because the private key is written to SM before the public key is uploaded to S3. If we crash between the two writes, on restart we find the key in SM and re-upload to S3.
 
@@ -54,14 +55,16 @@ On restart mid-epoch: private key is still in SM, public keys and markers are st
 All cross-party coordination uses S3 markers in a shared bucket. Each party writes to its own prefixed paths. Marker layout:
 
 ```
-s3://bucket/rerand/epoch-{E}/party-{P}/public-key            # public key for DH
-s3://bucket/rerand/epoch-{E}/party-{P}/max-id                # party P watermark for manifest (MAX(id))
-s3://bucket/rerand/epoch-{E}/party-{P}/manifest.json         # epoch chunking manifest (party 0 writes, others read)
-s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged      # chunk K staging committed
-s3://bucket/rerand/epoch-{E}/party-{P}/complete              # epoch E fully done
+s3://bucket/rerand/epoch-{E}/party-{P}/public-key              # public key for DH
+s3://bucket/rerand/epoch-{E}/party-{P}/max-id                  # party P watermark for manifest (MAX(id))
+s3://bucket/rerand/epoch-{E}/party-{P}/manifest.json           # epoch chunking manifest (party 0 writes, others read)
+s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged        # chunk K staging committed
+s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/version-hash  # 32-byte blake3 hash of version map (fast-path comparison)
+s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/version-map   # chunk K [(id, version_id)] pairs (downloaded only on hash mismatch)
+s3://bucket/rerand/epoch-{E}/party-{P}/complete                # epoch E fully done
 ```
 
-Coordination is polling-based: a rerand server checks for peer markers by listing the S3 prefix. A few seconds of polling latency is fine for background work.
+Coordination is polling-based: a rerand server checks for peer markers by listing the S3 prefix. A few seconds of polling latency is fine for background work. All polling loops have a 30-minute timeout to surface permanently stuck peers.
 
 Authentication: the shared bucket uses IAM prefix policies to scope write access per party. Each party can only write to `s3://bucket/rerand/epoch-*/party-{P}/*`. All parties can read/list the full `s3://bucket/rerand/epoch-{E}/` prefix to observe peer markers. The manifest is written by the designated writer (party 0) under its own prefix (`party-0/manifest.json`) and is read-only for others.
 
@@ -110,6 +113,7 @@ CREATE TABLE irises (
     rerand_epoch         INTEGER,
     PRIMARY KEY (epoch, id)
 );
+CREATE INDEX idx_staging_irises_epoch_chunk ON irises (epoch, chunk_id);
 ```
 
 ### Coordination table
@@ -137,17 +141,22 @@ Lifecycle: `staging_written` → `all_confirmed` → `live_applied`.
 
 Runs continuously:
 
-1. Determine the active epoch E and load its manifest (the highest epoch with a manifest at `s3://bucket/rerand/epoch-{E}/party-0/manifest.json` but without all three completion markers). If no manifest exists for the next epoch, create it (party 0 only): collect watermarks, compute `max_id_inclusive`, write `manifest.json`.
+1. Determine the active epoch E (uses local `rerand_progress` as start hint, then scans S3 for the highest epoch with a manifest but without all three `complete` markers).
 2. Derive `shared_secret` for epoch E (keygen or resume — see above)
 3. Pick next chunk range `[start, end)` for chunk K from the manifest
-4. Read entries from live schema, recording each entry's `version_id`
-5. Rerandomize shares using `BLAKE3(shared_secret || iris_id)` XOF
-6. Write rerandomized shares to staging schema with `epoch = E`, `original_version_id`, `chunk_id = K`, and `rerand_epoch = E + 1`
-7. Set `staging_written = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
-8. Upload S3 marker after staging commit: `s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged`
-9. Poll S3 until all 3 party markers exist for chunk K
-10. Set `all_confirmed = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
-11. Copy from staging to live DB, delete staging, and mark applied — all in one transaction that holds `pg_advisory_xact_lock(RERAND_APPLY_LOCK)` (automatically released on commit, rollback, or connection drop):
+4. **Stage**: delete any partial staging data for this chunk (crash recovery clean slate), read entries from live schema recording each entry's `version_id`, rerandomize shares using `BLAKE3(shared_secret || iris_id)` XOF, write to staging schema with `epoch = E`, `original_version_id`, `chunk_id = K`, and `rerand_epoch = E + 1`
+5. Set `staging_written = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
+6. Upload version map `[(id, original_version_id)]` for the chunk to S3: `s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/version-map`
+7. Upload S3 staged marker: `s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged`
+8. Poll S3 until all 3 party staged markers exist for chunk K
+9. Set `all_confirmed = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
+10. **Modification fence** (see "Conflict Resolution" section below):
+    a. Download all 3 parties' version maps for chunk K. Compute cross-party disagreement set: IDs where any party captured a different `original_version_id`.
+    b. Acquire `pg_advisory_lock(RERAND_MODIFY_LOCK)` — blocks the main server from writing modifications to the `irises` table.
+    c. Query local divergences: IDs where `staging.original_version_id ≠ irises.version_id` (modifications that landed after staging).
+    d. Compute skip set = cross-party disagreements ∪ local divergences.
+    e. Delete skip set from staging.
+11. **Apply**: copy from staging to live DB, delete staging, and mark applied — all in one transaction that holds `pg_advisory_xact_lock(RERAND_APPLY_LOCK)`:
     ```sql
     BEGIN;
     SELECT pg_advisory_xact_lock(RERAND_APPLY_LOCK);
@@ -166,15 +175,20 @@ Runs continuously:
     UPDATE rerand_progress SET live_applied = TRUE WHERE epoch = E AND chunk_id = K;
     COMMIT;  -- xact lock released here
     ```
-12. Proceed to next chunk (or start epoch transition if all chunks done)
+12. Release `pg_advisory_unlock(RERAND_MODIFY_LOCK)`.
+13. Proceed to next chunk (or start epoch transition if all chunks done).
+
+**Crash recovery for staging**: if the process crashes mid-staging, `staging_written` is still `FALSE`. On restart, the code re-enters the staging block and deletes any partial rows before re-reading. This ensures all staging rows come from one read pass (no mixed-snapshot version_ids). Inserts use `ON CONFLICT (epoch, id) DO NOTHING` as a safety net.
+
+**Crash recovery for S3 upload**: the S3 staged marker upload is outside the `if !staging_written` block. If the process crashes after `set_staging_written` but before the S3 upload, the marker is re-uploaded on restart (idempotent PUT).
 
 ### Step 2: Main Server Startup (minimal changes)
 
 At startup, before `load_iris_db`:
 
-1. **Existing**: modification sync (`sync_modifications`) — all parties catch up on modifications, producing identical `version_id` values
+1. **Existing**: modification sync (`sync_modifications`) — all parties catch up on modifications, producing identical `version_id` values. This transaction acquires `pg_advisory_xact_lock(RERAND_MODIFY_LOCK)` to serialize with rerand applies.
 2. **New**: rerand sync — parties exchange a compact rerand watermark during the existing startup sync (`SyncState` exchange):
-   - Each party computes `(epoch, max_confirmed_chunk)` from its local `rerand_progress` table: the active epoch E and the highest `chunk_id` where `all_confirmed = TRUE`.
+   - Each party computes `(epoch, max_confirmed_chunk)` from its local `rerand_progress` table: the active epoch E and the highest `chunk_id` where `all_confirmed = TRUE`. Returns `None` if the `rerand_progress` table doesn't exist yet (rolling deploy before migration); real DB errors propagate as `Err`.
    - Each party sends this single `(epoch, max_confirmed_chunk)` pair as part of `SyncState`.
    - Each party checks whether any peer is exactly 1 confirmed chunk ahead (within the same epoch, or has moved to the next epoch). If so, it applies that single chunk (`my_max_confirmed + 1`) from staging to the live DB.
    - **Why at most 1 chunk**: the rerand loop has a strict per-chunk synchronization barrier — a node cannot stage chunk K+1 until all three parties have confirmed chunk K via S3 markers. Therefore it is impossible for any peer to be more than 1 confirmed chunk ahead. The implementation enforces this with a fatal bail if the gap exceeds 1 (indicates DB corruption).
@@ -182,11 +196,13 @@ At startup, before `load_iris_db`:
    - Edge case: if all parties report the same `max_confirmed_chunk`, there is nothing to catch up and the step is skipped.
 3. **New (DB-only catch-up)**: two phases, in order:
    - **Phase A — apply locally pending chunks**: query `rerand_progress` for any chunks where `all_confirmed = TRUE AND live_applied = FALSE`. Apply each one via the same apply transaction as Step 1.11 (each acquires its own `pg_advisory_xact_lock`). This handles the crash window where the rerand server set `all_confirmed` but crashed before applying to live — without this, the node would advertise itself as caught up (watermark based on `all_confirmed`) while its live DB is stale.
-   - **Phase B — apply peer-ahead chunk**: if step 2 identified a chunk where a peer is strictly ahead, apply it (skipped if already handled in Phase A).
+   - **Phase B — apply peer-ahead chunk**: if step 2 identified a chunk where a peer is strictly ahead, apply it (skipped if already handled in Phase A). If no staging data exists for the chunk (ghost chunk at epoch boundary), the apply is a safe no-op.
    - **Phase C — hold session lock**: acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection. **Keep the lock held** through step 4. This prevents the rerand loop from applying new chunks while the DB snapshot is being read.
 4. **Existing**: `load_iris_db` — loads from live DB into GPU memory. The advisory lock is still held, so the rerand server cannot apply new chunks while the DB is being read into memory.
 5. Release the advisory lock: `SELECT pg_advisory_unlock(RERAND_APPLY_LOCK)` on the dedicated connection, then drop the connection.
 
+**Rollout note**: if the rerand tables haven't been migrated yet, `build_rerand_sync_state` returns `Ok(None)` for missing table. The catch-up logic is gated on `rerand_state.is_some()`, so startup proceeds without error.
+
 ### Epoch and chunk desync safety checks
 
 The startup sync validates two invariants derived from the protocol's synchronization barriers:
@@ -196,57 +212,38 @@ The startup sync validates two invariants derived from the protocol's synchroniz
 
 If either check fails, the main server refuses to start. This catches DB corruption, manual interference, or bugs in the rerand server early, before any data is loaded into memory.
 
-### Advisory lock: startup vs rerand server concurrency
-
-Two lock scopes are used, both keyed on `RERAND_APPLY_LOCK`:
-
-- **Transaction-level (`pg_advisory_xact_lock`)** — acquired inside `apply_staging_chunk`'s transaction. Automatically released on commit, rollback, or connection drop (including task abort). Used by both the rerand server loop (Step 1.11) and startup catch-up (Step 2.3 Phases A/B).
-- **Session-level (`pg_advisory_lock`)** — acquired on a dedicated connection during startup (Step 2.3 Phase C) and held through `load_iris_db` (Step 2.4). Prevents the rerand loop from applying new chunks while the DB snapshot is being read. Released explicitly after load.
+### Advisory locks
 
-These two scopes serialize correctly: `pg_advisory_xact_lock` and `pg_advisory_lock` on the same key conflict across sessions, so the rerand loop's apply transaction blocks while the main server holds the session lock, and vice versa.
+Three advisory lock keys are used:
 
-**Why `pg_advisory_xact_lock` for applies**: session-level locks are tied to a connection. If a process is killed while holding a session-level lock on a pooled connection, the connection may be returned to the pool with the lock still held, blocking future acquirers indefinitely. Transaction-level locks avoid this: when the connection is dropped, the transaction rolls back and the lock is released automatically.
+- **`RERAND_APPLY_LOCK`** — serializes chunk applies with `load_iris_db`. Used as `pg_advisory_xact_lock` inside `apply_staging_chunk`'s transaction (auto-released on commit/rollback/drop), and as session-level `pg_advisory_lock` during startup to hold through `load_iris_db`.
+- **`RERAND_MODIFY_LOCK`** — serializes modification writes with the rerand modification fence. The rerand server holds it (session-level) during the fence check + apply window (Steps 1.10–1.12). The main server acquires it (`pg_advisory_xact_lock`) inside its modification transaction to prevent writes during the fence window.
 
-**Implementation with connection pools (sqlx)**:
+**Why `pg_advisory_xact_lock` for applies and modifications**: session-level locks are tied to a connection. If a process is killed while holding a session-level lock on a pooled connection, the connection may be returned to the pool with the lock still held, blocking future acquirers indefinitely. Transaction-level locks avoid this: when the connection is dropped, the transaction rolls back and the lock is released automatically.
 
-```rust
-// Catch-up applies (each self-protected by xact lock inside apply_staging_chunk):
-for (epoch, chunk_id) in pending_chunks {
-    apply_staging_chunk(&pool, epoch, chunk_id).await?;
-}
+## Conflict Resolution: Rerandomization vs Modifications
 
-// Session lock for load_iris_db window:
-let mut lock_conn = pool.acquire().await?;
-sqlx::query("SELECT pg_advisory_lock($1)")
-    .bind(RERAND_APPLY_LOCK)
-    .execute(&mut *lock_conn).await?;
+### The problem
 
-load_iris_db(&pool).await?;
+Modifications (reauthentications, deletions) propagate asynchronously to each party via independent SQS queues. During continuous rerandomization, a modification can land on some parties but not others between the time different parties stage a chunk. Without protection, this causes cross-party share divergence: different parties apply the rerand to different underlying shares, breaking the MPC invariant that all 3 parties' shares reconstruct to the same plaintext.
 
-sqlx::query("SELECT pg_advisory_unlock($1)")
-    .bind(RERAND_APPLY_LOCK)
-    .execute(&mut *lock_conn).await?;
-drop(lock_conn);
-```
+### The modification fence
 
-### Why modification sync before rerand sync matters
+The modification fence ensures all parties agree on which rows to skip before applying a chunk. It has two components:
 
-Modification sync ensures all parties have the same `version_id` values before the rerand staging copy runs. This guarantees the optimistic lock (`WHERE version_id = original_version_id`) produces the same skip set on all parties — the same entries are updated, the same entries are skipped.
+1. **Cross-party version-map exchange** (Steps 1.6–1.8): after staging, each party uploads its `[(id, original_version_id)]` map for the chunk to S3, along with a 32-byte blake3 hash of the map. After the S3 barrier, each party first downloads only the 3 hashes (96 bytes total). If all hashes match, the maps are identical and the cross-party disagreement set is empty (fast path — no full map download needed). If any hash differs, the full maps are downloaded and diffed to compute the exact set of IDs where any party captured a different `original_version_id` (slow path). This catches modifications that arrived on some parties before staging but not others. In practice, disagreements are rare (only when a modification races with the staging window), so the fast path runs ~100% of the time.
 
-## Conflict Resolution: Rerandomization vs Modifications
+2. **Local divergence check under lock** (Steps 1.10b–1.10c): the rerand server acquires `RERAND_MODIFY_LOCK` (blocking the main server from writing modifications), then queries for IDs where `staging.original_version_id ≠ irises.version_id`. This catches modifications that arrived after staging but before apply. The lock ensures no new modifications can land during the check + apply window.
 
-### Why the optimistic lock is needed
+The union of both sets is deleted from staging before apply. All parties compute the same skip set (the cross-party exchange is deterministic, and the local check is under lock), so the apply produces consistent results across all parties.
 
-The rerand server reads entry X at time T with `version_id = V`. A modification (reauth/deletion) may happen later, bumping `version_id` to V+1. The staging still has `original_version_id = V`. The optimistic lock prevents overwriting the modification:
+### Why the optimistic lock is still needed
 
-```sql
-UPDATE irises SET ... WHERE version_id = original_version_id;
--- V ≠ V+1 → entry X skipped
-```
+The skip-set deletion removes divergent rows from staging before apply. The apply SQL still includes `WHERE irises.version_id = staging.original_version_id` as a final safety net — if a row somehow slipped through the fence (e.g., a modification landed in the narrow window between the local check and the apply within the same lock), the optimistic lock catches it. On its own the optimistic lock does NOT guarantee cross-party consistency (different parties can have different live `version_id` values), but combined with the fence it serves as defense-in-depth.
 
-### Why `rerand_epoch` and the trigger are needed
+### Why `rerand_epoch` and the trigger are kept
 
-Without the trigger change, the staging copy would bump `version_id` (because share data changed). The trigger change keeps `version_id` as a pure "user-facing modification counter," separate from rerandomization.
+Without the trigger change, the rerand apply would bump `version_id` (because share data changes). This is not a safety issue — the optimistic lock works correctly either way — but it inflates `version_id` by 1 per epoch per row. Since `version_id` is `SMALLINT` (max 32767), this limits the total number of rerandomizations + modifications before overflow. The trigger keeps `version_id` as a pure user-modification counter, preserving the full range for actual reauthentications.
 
 ## Chunking
 
@@ -257,8 +254,8 @@ Chunk boundaries must be identical across parties for chunk K to be meaningful.
 - **Watermark sync**: before the manifest is written, each party P uploads its local watermark `max_id_party_P = SELECT MAX(id) FROM irises` to `s3://bucket/rerand/epoch-{E}/party-{P}/max-id`.
 - The manifest writer waits until all three `max-id` markers exist, then sets `max_id_inclusive` as:
   - `M = min(max_id_party_0, max_id_party_1, max_id_party_2) - safety_buffer_ids`
-  - `safety_buffer_ids` is configurable (default 0 or one chunk) to avoid rerandomizing the “tip” where replication/ingest lag could differ across parties.
+  - `safety_buffer_ids` is configurable (default 0 or one chunk) to avoid rerandomizing the "tip" where replication/ingest lag could differ across parties.
 - New inserts with `id > M` are left for a future epoch.
 - Chunk K corresponds to `[start, end)` where `start = 1 + K * N` and `end = min(start + N, M + 1)`.
 
-A configurable delay (`--chunk-delay`, default e.g. 5s) is inserted between chunks to avoid sustained DB load. The rerand server should not stress the live DB with continuous writes — the delay spreads the I/O over time. The delay, chunk size, and number of parallel DB connections should all be configurable via CLI flags or environment variables.
\ No newline at end of file
+A configurable delay (`--chunk-delay`, default e.g. 5s) is inserted between chunks to avoid sustained DB load. The rerand server should not stress the live DB with continuous writes — the delay spreads the I/O over time. The delay, chunk size, and number of parallel DB connections should all be configurable via CLI flags or environment variables.
diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 6d68b5e0b4..8a59c541ce 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -982,9 +982,7 @@ async fn server_main(config: Config) -> Result<()> {
     let is_ready_flag = Arc::new(AtomicBool::new(false));
     let is_ready_flag_cloned = Arc::clone(&is_ready_flag);
 
-    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool)
-        .await
-        .ok();
+    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool).await?;
     let my_state = SyncState {
         db_len: store_len as u64,
         modifications: store.last_modifications(max_modification_lookback).await?,
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 0eb35cc4a8..390ff10898 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -3,6 +3,7 @@ use iris_mpc_common::helpers::sync::{RerandSyncState, SyncResult};
 use sqlx::PgPool;
 
 pub const RERAND_APPLY_LOCK: i64 = 0x5245_5241_4E44;
+pub const RERAND_MODIFY_LOCK: i64 = 0x5245_4D4F_4446;
 
 pub struct StagingIrisEntry {
     pub epoch: i32,
@@ -42,6 +43,97 @@ fn validate_identifier(name: &str) -> Result<()> {
     Ok(())
 }
 
+/// Delete any partial staging data for a chunk before (re-)staging.
+/// Ensures all rows come from one read pass, preventing mixed-snapshot
+/// version_ids after a crash-and-retry.
+pub async fn delete_staging_chunk(
+    pool: &PgPool,
+    staging_schema: &str,
+    epoch: i32,
+    chunk_id: i32,
+) -> Result<u64> {
+    validate_identifier(staging_schema)?;
+    let sql = format!(
+        r#"DELETE FROM "{}".irises WHERE epoch = $1 AND chunk_id = $2"#,
+        staging_schema,
+    );
+    let result = sqlx::query(&sql)
+        .bind(epoch)
+        .bind(chunk_id)
+        .execute(pool)
+        .await?;
+    Ok(result.rows_affected())
+}
+
+/// Return the (id, original_version_id) pairs from staging for a chunk.
+pub async fn get_staging_version_map(
+    pool: &PgPool,
+    staging_schema: &str,
+    epoch: i32,
+    chunk_id: i32,
+) -> Result<Vec<(i64, i16)>> {
+    validate_identifier(staging_schema)?;
+    let sql = format!(
+        r#"SELECT id, original_version_id FROM "{}".irises WHERE epoch = $1 AND chunk_id = $2 ORDER BY id"#,
+        staging_schema,
+    );
+    let rows: Vec<(i64, i16)> = sqlx::query_as(&sql)
+        .bind(epoch)
+        .bind(chunk_id)
+        .fetch_all(pool)
+        .await?;
+    Ok(rows)
+}
+
+/// Return IDs where the staging original_version_id no longer matches the
+/// live version_id (modifications landed after staging).
+pub async fn get_locally_divergent_ids(
+    pool: &PgPool,
+    staging_schema: &str,
+    epoch: i32,
+    chunk_id: i32,
+) -> Result<Vec<i64>> {
+    validate_identifier(staging_schema)?;
+    let sql = format!(
+        r#"
+        SELECT s.id FROM "{}".irises s
+        JOIN irises ON irises.id = s.id
+        WHERE s.epoch = $1 AND s.chunk_id = $2
+          AND irises.version_id != s.original_version_id
+        "#,
+        staging_schema,
+    );
+    let rows: Vec<(i64,)> = sqlx::query_as(&sql)
+        .bind(epoch)
+        .bind(chunk_id)
+        .fetch_all(pool)
+        .await?;
+    Ok(rows.into_iter().map(|(id,)| id).collect())
+}
+
+/// Delete specific IDs from a staging chunk.
+pub async fn delete_staging_ids(
+    pool: &PgPool,
+    staging_schema: &str,
+    epoch: i32,
+    ids: &[i64],
+) -> Result<u64> {
+    if ids.is_empty() {
+        return Ok(0);
+    }
+    validate_identifier(staging_schema)?;
+    let sql = format!(
+        r#"DELETE FROM "{}".irises WHERE epoch = $1 AND id = ANY($2)"#,
+        staging_schema,
+    );
+    let result = sqlx::query(&sql)
+        .bind(epoch)
+        .bind(ids)
+        .execute(pool)
+        .await?;
+    Ok(result.rows_affected())
+}
+
 pub async fn insert_staging_irises(
     pool: &PgPool,
     staging_schema: &str,
@@ -234,13 +326,35 @@ pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
 // ---------------------------------------------------------------------------
 
 /// Build the rerand sync state from the local `rerand_progress` table.
-pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<RerandSyncState> {
-    let epoch = get_current_epoch(pool).await?.unwrap_or(0);
+///
+/// Returns `Ok(None)` when the `rerand_progress` table does not exist yet
+/// (rolling deploy before migration). Returns `Err` for real DB failures
+/// so callers can distinguish "not migrated" from "broken".
+pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<Option<RerandSyncState>> {
+    let epoch = match get_current_epoch(pool).await {
+        Ok(e) => e.unwrap_or(0),
+        Err(e) => {
+            if is_undefined_table(&e) {
+                return Ok(None);
+            }
+            return Err(e);
+        }
+    };
     let max_confirmed = get_max_confirmed_chunk(pool, epoch).await?.unwrap_or(-1);
-    Ok(RerandSyncState {
+    Ok(Some(RerandSyncState {
         epoch,
         max_confirmed_chunk: max_confirmed,
-    })
+    }))
+}
+
+fn is_undefined_table(err: &eyre::Report) -> bool {
+    if let Some(db_err) = err.root_cause().downcast_ref::<sqlx::error::Error>() {
+        if let sqlx::error::Error::Database(ref pg) = db_err {
+            return pg.code().as_deref() == Some("42P01");
+        }
+    }
+    // Also check the direct error (not just root cause).
+    format!("{:?}", err).contains("42P01")
 }
 
 /// Compute the single chunk (if any) that needs to be applied during startup catch-up.
@@ -344,10 +458,11 @@ async fn rerand_catchup_inner(
     staging_schema: &str,
     sync_result: &SyncResult,
 ) -> Result<()> {
-    // If the rerand tables haven't been migrated yet, the sync state will
-    // be None and there is nothing to catch up.  Skip unconditionally so
-    // a rolling deploy doesn't crash before migrations run.
+    // If the rerand tables haven't been migrated yet, rerand_state is None
+    // (build_rerand_sync_state returns Ok(None) for missing table). Real DB
+    // errors propagate as Err before we get here. Safe to skip catch-up.
     if sync_result.my_state.rerand_state.is_none() {
+        tracing::info!("Rerand catch-up: skipped (rerand not yet migrated)");
         return Ok(());
     }
 
@@ -371,14 +486,9 @@ async fn rerand_catchup_inner(
     }
 
     // Step 2: if a peer is one chunk ahead, apply that chunk too.
-    // Skip if already handled in step 1, or if no progress row exists
-    // (ghost chunk at epoch boundary — the chunk doesn't actually exist).
+    // If there's no staging data for the chunk, apply is a safe no-op.
     if let Some((epoch, chunk_id)) = compute_rerand_catchup_chunk(sync_result)? {
-        let dominated_by_step1 = pending.contains(&(epoch, chunk_id));
-        let has_progress = get_rerand_progress(pool, epoch, chunk_id)
-            .await?
-            .is_some();
-        if !dominated_by_step1 && has_progress {
+        if !pending.contains(&(epoch, chunk_id)) {
             tracing::info!(
                 "Rerand catch-up: applying peer-ahead epoch {} chunk {}",
                 epoch,
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 4ddde00ad5..7423a4bb15 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -4,9 +4,10 @@ use bytemuck::cast_slice;
 use eyre::Result;
 use futures::StreamExt;
 use iris_mpc_store::rerand::{
-    apply_staging_chunk, get_current_epoch, get_rerand_progress, insert_staging_irises,
+    apply_staging_chunk, delete_staging_chunk, delete_staging_ids, get_current_epoch,
+    get_locally_divergent_ids, get_rerand_progress, get_staging_version_map, insert_staging_irises,
     set_all_confirmed, set_staging_written, staging_schema_name, upsert_rerand_progress,
-    StagingIrisEntry,
+    StagingIrisEntry, RERAND_MODIFY_LOCK,
 };
 use iris_mpc_store::Store;
 use sqlx::PgPool;
@@ -42,8 +43,7 @@ pub async fn run_continuous_rerand(
         }
 
         let epoch_hint = get_current_epoch(pool).await?.map(|e| e as u32);
-        let active_epoch =
-            epoch::determine_active_epoch(s3, &config.s3_bucket, epoch_hint).await?;
+        let active_epoch = epoch::determine_active_epoch(s3, &config.s3_bucket, epoch_hint).await?;
         tracing::info!("Active epoch: {}", active_epoch);
 
         let shared_secret = epoch::derive_shared_secret(
@@ -101,10 +101,24 @@ pub async fn run_continuous_rerand(
                 set_staging_written(pool, active_epoch as i32, chunk_id as i32).await?;
             }
 
-            // Always (re-)upload the S3 staged marker. This is idempotent
-            // and covers the crash window between set_staging_written and
-            // the previous upload attempt.
+            // Upload version map + staged marker (both idempotent).
             if !progress.as_ref().is_some_and(|p| p.all_confirmed) {
+                let version_map = get_staging_version_map(
+                    pool,
+                    &staging_schema,
+                    active_epoch as i32,
+                    chunk_id as i32,
+                )
+                .await?;
+                s3_coordination::upload_chunk_version_map(
+                    s3,
+                    &config.s3_bucket,
+                    active_epoch,
+                    config.party_id,
+                    chunk_id,
+                    &version_map,
+                )
+                .await?;
                 s3_coordination::upload_chunk_staged(
                     s3,
                     &config.s3_bucket,
@@ -114,7 +128,7 @@ pub async fn run_continuous_rerand(
                 )
                 .await?;
                 tracing::info!(
-                    "Epoch {} chunk {}: S3 staged marker uploaded",
+                    "Epoch {} chunk {}: version map + staged marker uploaded",
                     active_epoch,
                     chunk_id
                 );
@@ -146,14 +160,68 @@ pub async fn run_continuous_rerand(
                 return Ok(());
             }
 
+            // --- Modification fence ---
+            // 1. Compute cross-party version_id disagreements
+            let cross_party_divergent = s3_coordination::compute_cross_party_divergent_ids(
+                s3,
+                &config.s3_bucket,
+                active_epoch,
+                chunk_id,
+                poll_interval,
+            )
+            .await?;
+
+            // 2. Lock to prevent new modifications during apply
+            let mut modify_lock_conn = pool.acquire().await?;
+            sqlx::query("SELECT pg_advisory_lock($1)")
+                .bind(RERAND_MODIFY_LOCK)
+                .execute(&mut *modify_lock_conn)
+                .await?;
+
+            // 3. Check local staging vs live for post-staging modifications
+            let local_divergent = get_locally_divergent_ids(
+                pool,
+                &staging_schema,
+                active_epoch as i32,
+                chunk_id as i32,
+            )
+            .await?;
+
+            // 4. Union of both divergence sources
+            let mut skip_ids: Vec<i64> = cross_party_divergent;
+            skip_ids.extend(&local_divergent);
+            skip_ids.sort_unstable();
+            skip_ids.dedup();
+
+            if !skip_ids.is_empty() {
+                tracing::info!(
+                    "Epoch {} chunk {}: skipping {} IDs due to concurrent modifications: {:?}",
+                    active_epoch,
+                    chunk_id,
+                    skip_ids.len(),
+                    &skip_ids[..std::cmp::min(skip_ids.len(), 10)],
+                );
+                delete_staging_ids(pool, &staging_schema, active_epoch as i32, &skip_ids).await?;
+            }
+
+            // 5. Apply (now consistent across all parties)
             let rows =
                 apply_staging_chunk(pool, &staging_schema, active_epoch as i32, chunk_id as i32)
                     .await?;
+
+            // 6. Release modification lock
+            sqlx::query("SELECT pg_advisory_unlock($1)")
+                .bind(RERAND_MODIFY_LOCK)
+                .execute(&mut *modify_lock_conn)
+                .await?;
+            drop(modify_lock_conn);
+
             tracing::info!(
-                "Epoch {} chunk {}: applied to live DB ({} rows updated)",
+                "Epoch {} chunk {}: applied to live DB ({} rows updated, {} skipped)",
                 active_epoch,
                 chunk_id,
-                rows
+                rows,
+                skip_ids.len(),
             );
 
             chunk_id += 1;
@@ -244,6 +312,10 @@ async fn process_chunk_staging(
     chunk_id: u32,
     manifest: &Manifest,
 ) -> Result<()> {
+    // Delete any leftover rows from a previous partial run so all rows in
+    // staging come from one read pass (prevents mixed-snapshot version_ids).
+    delete_staging_chunk(pool, staging_schema, epoch as i32, chunk_id as i32).await?;
+
     let (start, end) = manifest.chunk_range(chunk_id);
 
     const BATCH_SIZE: usize = 500;
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index 222ea13031..92c2541da1 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -113,11 +113,7 @@ pub async fn idempotent_keygen(
 ) -> Result<tripartite_dh::PrivateKey> {
     if epoch > 0 {
         if let Err(e) = delete_private_key_from_sm(sm, env, epoch - 1, party_id).await {
-            tracing::debug!(
-                "Cleanup of epoch {} key (best-effort): {}",
-                epoch - 1,
-                e
-            );
+            tracing::debug!("Cleanup of epoch {} key (best-effort): {}", epoch - 1, e);
         }
     }
 
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
index d6313b278c..fe483002d3 100644
--- a/iris-mpc-upgrade/src/s3_coordination.rs
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -260,6 +260,126 @@ pub async fn poll_chunk_staged_all(
     poll_until_all_parties_marker(s3, bucket, epoch, &suffix, poll_interval).await
 }
 
+// ---- Chunk version map (modification fence) ----
+
+fn version_map_hash(version_map: &[(i64, i16)]) -> [u8; 32] {
+    let mut hasher = blake3::Hasher::new();
+    for (id, ver) in version_map {
+        hasher.update(&id.to_le_bytes());
+        hasher.update(&ver.to_le_bytes());
+    }
+    *hasher.finalize().as_bytes()
+}
+
+/// Upload the version map and its blake3 hash for a chunk.
+pub async fn upload_chunk_version_map(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+    chunk_id: u32,
+    version_map: &[(i64, i16)],
+) -> Result<()> {
+    let prefix = format!("{}/chunk-{}", epoch_party_prefix(epoch, party), chunk_id);
+
+    let hash = version_map_hash(version_map);
+    upload_marker(s3, bucket, &format!("{prefix}/version-hash"), hash.to_vec()).await?;
+
+    let body = serde_json::to_vec(version_map)?;
+    upload_marker(s3, bucket, &format!("{prefix}/version-map"), body).await
+}
+
+async fn download_chunk_version_hash(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+    chunk_id: u32,
+    poll_interval: Duration,
+) -> Result<[u8; 32]> {
+    let key = format!(
+        "{}/chunk-{}/version-hash",
+        epoch_party_prefix(epoch, party),
+        chunk_id
+    );
+    poll_until_marker_exists(s3, bucket, &key, poll_interval).await?;
+    let bytes = download_marker(s3, bucket, &key).await?;
+    let hash: [u8; 32] = bytes
+        .try_into()
+        .map_err(|b: Vec<u8>| eyre!("version-hash has wrong length: {}", b.len()))?;
+    Ok(hash)
+}
+
+async fn download_chunk_version_map(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+    chunk_id: u32,
+    poll_interval: Duration,
+) -> Result<Vec<(i64, i16)>> {
+    let key = format!(
+        "{}/chunk-{}/version-map",
+        epoch_party_prefix(epoch, party),
+        chunk_id
+    );
+    poll_until_marker_exists(s3, bucket, &key, poll_interval).await?;
+    let bytes = download_marker(s3, bucket, &key).await?;
+    let map: Vec<(i64, i16)> = serde_json::from_slice(&bytes)?;
+    Ok(map)
+}
+
+/// Compare version maps across all 3 parties and return IDs where any
+/// party disagrees on the `original_version_id`.
+///
+/// Fast path: download only the 32-byte blake3 hashes. If all match,
+/// return empty (no disagreements). Slow path (hash mismatch): download
+/// the full maps and compute the exact disagreement set.
+pub async fn compute_cross_party_divergent_ids(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    chunk_id: u32,
+    poll_interval: Duration,
+) -> Result<Vec<i64>> {
+    let mut hashes = Vec::new();
+    for party in 0..NUM_PARTIES {
+        hashes.push(
+            download_chunk_version_hash(s3, bucket, epoch, party, chunk_id, poll_interval).await?,
+        );
+    }
+    if hashes[0] == hashes[1] && hashes[1] == hashes[2] {
+        return Ok(Vec::new());
+    }
+
+    tracing::info!(
+        "Epoch {} chunk {}: version-map hashes differ, downloading full maps",
+        epoch,
+        chunk_id,
+    );
+
+    use std::collections::HashMap;
+    let mut all_maps: Vec<HashMap<i64, i16>> = Vec::new();
+    for party in 0..NUM_PARTIES {
+        let map =
+            download_chunk_version_map(s3, bucket, epoch, party, chunk_id, poll_interval).await?;
+        all_maps.push(map.into_iter().collect());
+    }
+
+    let mut divergent = Vec::new();
+    let all_ids: std::collections::BTreeSet<i64> =
+        all_maps.iter().flat_map(|m| m.keys().copied()).collect();
+
+    for id in all_ids {
+        let versions: Vec<Option<&i16>> = all_maps.iter().map(|m| m.get(&id)).collect();
+        let first = versions[0];
+        if !versions.iter().all(|v| *v == first) {
+            divergent.push(id);
+        }
+    }
+    Ok(divergent)
+}
+
 // ---- Epoch completion ----
 
 pub async fn upload_epoch_complete(
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index 5e0126a299..a6fbdbd508 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -274,14 +274,14 @@ fn phase7_epoch_boundary_desync() {
             let pool = &env.harness.parties[p].store.pool;
             // Everyone completes Epoch 0
             sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (0, 0, TRUE, TRUE, TRUE)")
-                .execute(pool).await.unwrap();
+            .execute(pool).await.unwrap();
         }
 
         // P0 and P2 move to Epoch 1
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (1, 0, TRUE, TRUE, FALSE)")
-            .execute(&env.harness.parties[0].store.pool).await.unwrap();
+    .execute(&env.harness.parties[0].store.pool).await.unwrap();
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (1, 0, TRUE, TRUE, FALSE)")
-            .execute(&env.harness.parties[2].store.pool).await.unwrap();
+.execute(&env.harness.parties[2].store.pool).await.unwrap();
 
         // Now simulate P1 main server startup (P1 is behind on Epoch 0)
         // Should catch up using safe_up_to = i32::MAX
@@ -317,13 +317,13 @@ fn phase8_reject_desync() {
         for p in 0..NUM_PARTIES {
             let pool = &env.harness.parties[p].store.pool;
             sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (0, 0, TRUE, TRUE, TRUE)")
-                .execute(pool).await.unwrap();
+            .execute(pool).await.unwrap();
         }
 
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (2, 0, TRUE, TRUE, FALSE)")
-            .execute(&env.harness.parties[0].store.pool).await.unwrap();
+    .execute(&env.harness.parties[0].store.pool).await.unwrap();
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (2, 0, TRUE, TRUE, FALSE)")
-            .execute(&env.harness.parties[2].store.pool).await.unwrap();
+.execute(&env.harness.parties[2].store.pool).await.unwrap();
 
         let r1 = simulate_server_startup(&env.harness, 1).await;
         assert!(
@@ -342,10 +342,10 @@ fn phase8_reject_desync() {
         }
 
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (3, 0, TRUE, TRUE, TRUE)")
-            .execute(&env.harness.parties[1].store.pool).await.unwrap();
+.execute(&env.harness.parties[1].store.pool).await.unwrap();
 
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (3, 2, TRUE, TRUE, FALSE)")
-            .execute(&env.harness.parties[0].store.pool).await.unwrap();
+.execute(&env.harness.parties[0].store.pool).await.unwrap();
 
         let r1_chunk_desync = simulate_server_startup(&env.harness, 1).await;
         assert!(
@@ -358,3 +358,67 @@ fn phase8_reject_desync() {
         env.teardown().await
     });
 }
+
+// ============================================================================
+// Phase 9: Asymmetric modification — a modification landing on only one
+//          party's DB must NOT cause cross-party share divergence.
+//          The modification fence (version-map exchange + skip-set union)
+//          detects the asymmetry and excludes the affected row.
+// ============================================================================
+
+#[test]
+fn phase9_asymmetric_modification_consistency() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        let target_id: i64 = 20;
+        println!("[phase 9] Asymmetric modification consistency...");
+
+        // Modify iris on P0 ONLY — simulates a reauth that propagated to
+        // P0 via SQS but hasn't reached P1/P2 yet.
+        sqlx::query(
+            r#"
+            UPDATE irises
+            SET left_code = set_byte(left_code, 0, get_byte(left_code, 0) # 1)
+            WHERE id = $1
+            "#,
+        )
+        .bind(target_id)
+        .execute(&env.harness.parties[0].store.pool)
+        .await?;
+        println!("[phase 9]   modified id={} on P0 only", target_id);
+
+        // Run a full epoch across all 3 parties.
+        let (h, t) = env.spawn_all();
+        wait_epoch_done(&env.harness, 0).await?;
+        stop_all(t, h).await;
+
+        // The modification fence should have detected the asymmetric
+        // version_id and excluded id=20 from the apply on all parties.
+        // Non-modified rows should still be rerandomized consistently.
+        let ep = assert_consistent_rerand_epoch(&env.harness, &[target_id]).await?;
+        assert!(ep >= 1);
+
+        // The modified ID should have been skipped (rerand_epoch stays 0)
+        // on ALL parties, OR applied consistently. Either way shares must
+        // reconstruct.
+        let epochs = get_rerand_epochs_for_id(&env.harness, target_id).await?;
+        let epochs_consistent = epochs[0] == epochs[1] && epochs[1] == epochs[2];
+        println!(
+            "[phase 9]   rerand_epochs for id={}: {:?} (consistent={})",
+            target_id, epochs, epochs_consistent
+        );
+        assert!(
+            epochs_consistent,
+            "rerand_epoch diverged for id={}: {:?}",
+            target_id, epochs
+        );
+
+        // Verify non-modified rows reconstruct correctly.
+        verify_fingerprints(&env.harness, &env.fingerprints, &[target_id]).await?;
+
+        println!("[phase 9] PASSED (epoch={})", ep);
+
+        env.teardown().await
+    });
+}
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index b344e0f965..b4fd934fd0 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -426,9 +426,7 @@ pub async fn simulate_server_startup(harness: &TestHarness, party: usize) -> Res
 async fn build_test_sync_result(harness: &TestHarness, party: usize) -> Result<SyncResult> {
     let mut all_states = Vec::new();
     for p in &harness.parties {
-        let rerand_state = rerand_store::build_rerand_sync_state(&p.store.pool)
-            .await
-            .ok();
+        let rerand_state = rerand_store::build_rerand_sync_state(&p.store.pool).await?;
         all_states.push(SyncState {
             db_len: p.store.count_irises().await? as u64,
             modifications: vec![],
@@ -480,6 +478,87 @@ pub async fn assert_consistent_rerand_epoch(
         .unwrap_or(0))
 }
 
+/// Check whether shares for a specific iris ID reconstruct consistently
+/// across all 3 party-pair combinations. Returns false if the shares are
+/// divergent (reconstruction from different pairs disagrees).
+pub async fn shares_are_consistent(harness: &TestHarness, id: i64) -> Result<bool> {
+    let mut shares = Vec::new();
+    for party in 0..NUM_PARTIES {
+        shares.push(harness.store(party).get_iris_data_by_id(id).await?);
+    }
+
+    let pairs: Vec<[&[u16]; 3]> = vec![
+        [
+            shares[0].left_code(),
+            shares[1].left_code(),
+            shares[2].left_code(),
+        ],
+        [
+            shares[0].left_mask(),
+            shares[1].left_mask(),
+            shares[2].left_mask(),
+        ],
+        [
+            shares[0].right_code(),
+            shares[1].right_code(),
+            shares[2].right_code(),
+        ],
+        [
+            shares[0].right_mask(),
+            shares[1].right_mask(),
+            shares[2].right_mask(),
+        ],
+    ];
+
+    use iris_mpc_common::galois::degree4::ShamirGaloisRingShare;
+    use iris_mpc_common::galois::degree4::{basis::Monomial, GaloisRingElement};
+    use iris_mpc_common::id::PartyID;
+    use itertools::Itertools;
+
+    let lag_01 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID1);
+    let lag_10 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID0);
+    let lag_12 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID2);
+    let lag_21 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID1);
+
+    for [s0, s1, s2] in &pairs {
+        let recon01: Vec<u16> = s0
+            .chunks_exact(4)
+            .zip_eq(s1.chunks_exact(4))
+            .flat_map(|(a, b)| {
+                let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
+                let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
+                (a * lag_01 + b * lag_10).coefs
+            })
+            .collect();
+        let recon12: Vec<u16> = s1
+            .chunks_exact(4)
+            .zip_eq(s2.chunks_exact(4))
+            .flat_map(|(a, b)| {
+                let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
+                let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
+                (a * lag_12 + b * lag_21).coefs
+            })
+            .collect();
+        if recon01 != recon12 {
+            return Ok(false);
+        }
+    }
+    Ok(true)
+}
+
+/// Get the rerand_epoch for a specific iris ID across all parties.
+pub async fn get_rerand_epochs_for_id(harness: &TestHarness, id: i64) -> Result<[i32; 3]> {
+    let mut epochs = [0i32; 3];
+    for (i, party) in harness.parties.iter().enumerate() {
+        let (ep,): (i32,) = sqlx::query_as("SELECT rerand_epoch FROM irises WHERE id = $1")
+            .bind(id)
+            .fetch_one(&party.store.pool)
+            .await?;
+        epochs[i] = ep;
+    }
+    Ok(epochs)
+}
+
 async fn cleanup(harness: &TestHarness) -> Result<()> {
     for party in &harness.parties {
         let staging = rerand_store::staging_schema_name(&party.schema_name);
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index 334b1747b5..35d9ad91fc 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -397,9 +397,7 @@ async fn build_sync_state(
 
     tracing::info!("Database store length is: {}", db_len);
 
-    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool)
-        .await
-        .ok();
+    let rerand_state = rerand_store::build_rerand_sync_state(&store.pool).await?;
 
     Ok(SyncState {
         db_len,
diff --git a/iris-mpc/src/services/processors/modifications_sync.rs b/iris-mpc/src/services/processors/modifications_sync.rs
index a37015495b..cb3e742029 100644
--- a/iris-mpc/src/services/processors/modifications_sync.rs
+++ b/iris-mpc/src/services/processors/modifications_sync.rs
@@ -52,6 +52,13 @@ pub async fn sync_modifications(
 
     let mut iris_tx = store.tx().await?;
 
+    // Acquire the modification lock to serialize with rerand apply.
+    // Uses xact lock so it auto-releases on commit/rollback.
+    sqlx::query("SELECT pg_advisory_xact_lock($1)")
+        .bind(iris_mpc_store::rerand::RERAND_MODIFY_LOCK)
+        .execute(&mut *iris_tx)
+        .await?;
+
     // Persist changes into modifications table
     store
         .update_modifications(&mut iris_tx, &to_update_refs)

From 42cbfd0419b0f60a195e426aea3dbf2dc3915c53 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sat, 28 Feb 2026 10:48:28 +0100
Subject: [PATCH 09/76] up

---
 docs/specs/rerandomization.md                 |  86 +--
 iris-mpc-bins/bin/iris-mpc/server.rs          | 176 +++---
 iris-mpc-store/src/rerand.rs                  | 503 +++++++++++++-----
 iris-mpc-upgrade/src/continuous_rerand.rs     |  53 +-
 .../tests/continuous_rerand_e2e.rs            |  42 +-
 iris-mpc-upgrade/tests/test_utils.rs          |   8 +-
 iris-mpc/src/server/mod.rs                    |  56 +-
 7 files changed, 563 insertions(+), 361 deletions(-)

diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index 398ac47009..30bd5d59b5 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -150,33 +150,38 @@ Runs continuously:
 7. Upload S3 staged marker: `s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged`
 8. Poll S3 until all 3 party staged markers exist for chunk K
 9. Set `all_confirmed = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
-10. **Modification fence** (see "Conflict Resolution" section below):
+10. **Modification fence and Apply**:
     a. Download all 3 parties' version maps for chunk K. Compute cross-party disagreement set: IDs where any party captured a different `original_version_id`.
-    b. Acquire `pg_advisory_lock(RERAND_MODIFY_LOCK)` — blocks the main server from writing modifications to the `irises` table.
-    c. Query local divergences: IDs where `staging.original_version_id ≠ irises.version_id` (modifications that landed after staging).
-    d. Compute skip set = cross-party disagreements ∪ local divergences.
-    e. Delete skip set from staging.
-11. **Apply**: copy from staging to live DB, delete staging, and mark applied — all in one transaction that holds `pg_advisory_xact_lock(RERAND_APPLY_LOCK)`:
-    ```sql
-    BEGIN;
-    SELECT pg_advisory_xact_lock(RERAND_APPLY_LOCK);
-    UPDATE irises SET
-      left_code = staging.left_code,
-      left_mask = staging.left_mask,
-      right_code = staging.right_code,
-      right_mask = staging.right_mask,
-      rerand_epoch = staging.rerand_epoch
-    FROM staging_schema.irises AS staging
-    WHERE irises.id = staging.id
-      AND staging.epoch = E
-      AND staging.chunk_id = K
-      AND irises.version_id = staging.original_version_id;
-    DELETE FROM staging_schema.irises WHERE epoch = E AND chunk_id = K;
-    UPDATE rerand_progress SET live_applied = TRUE WHERE epoch = E AND chunk_id = K;
-    COMMIT;  -- xact lock released here
-    ```
-12. Release `pg_advisory_unlock(RERAND_MODIFY_LOCK)`.
-13. Proceed to next chunk (or start epoch transition if all chunks done).
+    b. **Fenced Apply Transaction**: open a single transaction to check local divergence and apply safely:
+       ```sql
+       BEGIN;
+       -- Block the main server from writing modifications
+       SELECT pg_advisory_xact_lock(RERAND_MODIFY_LOCK);
+
+       -- Query local divergences (modifications that landed after staging)
+       -- ... compute skip set = cross-party disagreements ∪ local divergences ...
+       -- Delete skip set from staging schema
+
+       -- Proceed to apply
+       SELECT pg_advisory_xact_lock(RERAND_APPLY_LOCK);
+       
+       UPDATE irises SET
+         left_code = staging.left_code,
+         left_mask = staging.left_mask,
+         right_code = staging.right_code,
+         right_mask = staging.right_mask,
+         rerand_epoch = staging.rerand_epoch
+       FROM staging_schema.irises AS staging
+       WHERE irises.id = staging.id
+         AND staging.epoch = E
+         AND staging.chunk_id = K
+         AND irises.version_id = staging.original_version_id;
+         
+       DELETE FROM staging_schema.irises WHERE epoch = E AND chunk_id = K;
+       UPDATE rerand_progress SET live_applied = TRUE WHERE epoch = E AND chunk_id = K;
+       COMMIT;  -- Both RERAND_MODIFY_LOCK and RERAND_APPLY_LOCK released here
+       ```
+11. Proceed to next chunk (or start epoch transition if all chunks done).
 
 **Crash recovery for staging**: if the process crashes mid-staging, `staging_written` is still `FALSE`. On restart, the code re-enters the staging block and deletes any partial rows before re-reading. This ensures all staging rows come from one read pass (no mixed-snapshot version_ids). Inserts use `ON CONFLICT (epoch, id) DO NOTHING` as a safety net.
 
@@ -187,21 +192,24 @@ Runs continuously:
 At startup, before `load_iris_db`:
 
 1. **Existing**: modification sync (`sync_modifications`) — all parties catch up on modifications, producing identical `version_id` values. This transaction acquires `pg_advisory_xact_lock(RERAND_MODIFY_LOCK)` to serialize with rerand applies.
-2. **New**: rerand sync — parties exchange a compact rerand watermark during the existing startup sync (`SyncState` exchange):
+2. **New**: rerand sync validation — parties exchange a compact rerand watermark during the existing startup sync (`SyncState` exchange):
    - Each party computes `(epoch, max_confirmed_chunk)` from its local `rerand_progress` table: the active epoch E and the highest `chunk_id` where `all_confirmed = TRUE`. Returns `None` if the `rerand_progress` table doesn't exist yet (rolling deploy before migration); real DB errors propagate as `Err`.
    - Each party sends this single `(epoch, max_confirmed_chunk)` pair as part of `SyncState`.
-   - Each party checks whether any peer is exactly 1 confirmed chunk ahead (within the same epoch, or has moved to the next epoch). If so, it applies that single chunk (`my_max_confirmed + 1`) from staging to the live DB.
-   - **Why at most 1 chunk**: the rerand loop has a strict per-chunk synchronization barrier — a node cannot stage chunk K+1 until all three parties have confirmed chunk K via S3 markers. Therefore it is impossible for any peer to be more than 1 confirmed chunk ahead. The implementation enforces this with a fatal bail if the gap exceeds 1 (indicates DB corruption).
-   - **Why `max` across peers**: `all_confirmed = TRUE` at any party means that party observed all three S3 `staged` markers, which means all three parties successfully committed the chunk to their staging schemas. A slower party may not have polled S3 yet, but its staging data is already there.
-   - Edge case: if all parties report the same `max_confirmed_chunk`, there is nothing to catch up and the step is skipped.
-3. **New (DB-only catch-up)**: two phases, in order:
-   - **Phase A — apply locally pending chunks**: query `rerand_progress` for any chunks where `all_confirmed = TRUE AND live_applied = FALSE`. Apply each one via the same apply transaction as Step 1.11 (each acquires its own `pg_advisory_xact_lock`). This handles the crash window where the rerand server set `all_confirmed` but crashed before applying to live — without this, the node would advertise itself as caught up (watermark based on `all_confirmed`) while its live DB is stale.
-   - **Phase B — apply peer-ahead chunk**: if step 2 identified a chunk where a peer is strictly ahead, apply it (skipped if already handled in Phase A). If no staging data exists for the chunk (ghost chunk at epoch boundary), the apply is a safe no-op.
-   - **Phase C — hold session lock**: acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection. **Keep the lock held** through step 4. This prevents the rerand loop from applying new chunks while the DB snapshot is being read.
+   - The startup validator checks two invariants: epoch gap ≤ 1 and chunk gap ≤ 1 (within the same epoch). If either is violated, startup fails (indicates DB corruption).
+   - If a peer is at most 1 chunk or 1 epoch ahead, this is within protocol tolerance. **Startup does NOT apply any chunks.** The rerand worker is responsible for catch-up through the full modification-fence path. If the node is behind, the rerand worker must run and complete the pending chunk before the next startup succeeds.
+3. **New**: lock-first startup readiness loop:
+   - Acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection.
+   - While the lock is held (applies frozen), verify local DB readiness: no `all_confirmed=TRUE AND live_applied=FALSE` rows remain, and local applied watermark has reached the startup target from step 2.
+   - Special case: if startup target is `(epoch = E, max_confirmed_chunk = -1)`, no chunk is confirmed in epoch E yet, so startup does not wait for an apply in epoch E.
+   - If behind, release the lock and retry after a short sleep (lets rerand worker apply the pending chunk).
+   - If local watermark has already advanced past the startup target, fail startup (snapshot stale; restart and resync).
+   - Once ready, keep the lock held through step 4. This prevents the rerand loop from applying new chunks while the DB snapshot is being read.
 4. **Existing**: `load_iris_db` — loads from live DB into GPU memory. The advisory lock is still held, so the rerand server cannot apply new chunks while the DB is being read into memory.
 5. Release the advisory lock: `SELECT pg_advisory_unlock(RERAND_APPLY_LOCK)` on the dedicated connection, then drop the connection.
 
-**Rollout note**: if the rerand tables haven't been migrated yet, `build_rerand_sync_state` returns `Ok(None)` for missing table. The catch-up logic is gated on `rerand_state.is_some()`, so startup proceeds without error.
+**Why startup does not apply chunks**: every chunk apply must go through the modification fence (cross-party version-map exchange + local divergence check under `RERAND_MODIFY_LOCK`). The startup path has no access to the S3 coordination bus and cannot perform the fence. Applying unfenced chunks at startup would create the same cross-party share divergence the fence was designed to prevent.
+
+**Rollout note**: if the rerand tables haven't been migrated yet, `build_rerand_sync_state` returns `Ok(None)` for missing table. The validation is skipped and startup proceeds without error.
 
 ### Epoch and chunk desync safety checks
 
@@ -217,9 +225,9 @@ If either check fails, the main server refuses to start. This catches DB corrupt
 Three advisory lock keys are used:
 
 - **`RERAND_APPLY_LOCK`** — serializes chunk applies with `load_iris_db`. Used as `pg_advisory_xact_lock` inside `apply_staging_chunk`'s transaction (auto-released on commit/rollback/drop), and as session-level `pg_advisory_lock` during startup to hold through `load_iris_db`.
-- **`RERAND_MODIFY_LOCK`** — serializes modification writes with the rerand modification fence. The rerand server holds it (session-level) during the fence check + apply window (Steps 1.10–1.12). The main server acquires it (`pg_advisory_xact_lock`) inside its modification transaction to prevent writes during the fence window.
+- **`RERAND_MODIFY_LOCK`** — serializes modification writes with the rerand modification fence. The rerand server acquires it (`pg_advisory_xact_lock`) at the start of its unified fence+apply transaction (Step 1.10) to hold through the fence check and apply window. The main server acquires it (`pg_advisory_xact_lock`) inside its modification transaction to prevent writes during the fence window.
 
-**Why `pg_advisory_xact_lock` for applies and modifications**: session-level locks are tied to a connection. If a process is killed while holding a session-level lock on a pooled connection, the connection may be returned to the pool with the lock still held, blocking future acquirers indefinitely. Transaction-level locks avoid this: when the connection is dropped, the transaction rolls back and the lock is released automatically.
+**Why `pg_advisory_xact_lock` for applies and modifications**: session-level locks are tied to a connection. If a process is killed while holding a session-level lock on a pooled connection, the connection may be returned to the pool with the lock still held, blocking future acquirers indefinitely. Transaction-level locks avoid this: when the connection is dropped, the transaction rolls back and the lock is released automatically. This is why the entire modification fence and apply step was combined into a single transaction.
 
 ## Conflict Resolution: Rerandomization vs Modifications
 
@@ -233,7 +241,7 @@ The modification fence ensures all parties agree on which rows to skip before ap
 
 1. **Cross-party version-map exchange** (Steps 1.6–1.8): after staging, each party uploads its `[(id, original_version_id)]` map for the chunk to S3, along with a 32-byte blake3 hash of the map. After the S3 barrier, each party first downloads only the 3 hashes (96 bytes total). If all hashes match, the maps are identical and the cross-party disagreement set is empty (fast path — no full map download needed). If any hash differs, the full maps are downloaded and diffed to compute the exact set of IDs where any party captured a different `original_version_id` (slow path). This catches modifications that arrived on some parties before staging but not others. In practice, disagreements are rare (only when a modification races with the staging window), so the fast path runs ~100% of the time.
 
-2. **Local divergence check under lock** (Steps 1.10b–1.10c): the rerand server acquires `RERAND_MODIFY_LOCK` (blocking the main server from writing modifications), then queries for IDs where `staging.original_version_id ≠ irises.version_id`. This catches modifications that arrived after staging but before apply. The lock ensures no new modifications can land during the check + apply window.
+2. **Local divergence check under lock**: as part of the single apply transaction, the rerand server acquires `pg_advisory_xact_lock(RERAND_MODIFY_LOCK)` (blocking the main server from writing modifications), then queries for IDs where `staging.original_version_id ≠ irises.version_id`. This catches modifications that arrived after staging but before apply. The lock ensures no new modifications can land during the check + apply window.
 
 The union of both sets is deleted from staging before apply. All parties compute the same skip set (the cross-party exchange is deterministic, and the local check is under lock), so the apply produces consistent results across all parties.
 
diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 8a59c541ce..768871a9b9 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -1319,103 +1319,109 @@ async fn server_main(config: Config) -> Result<()> {
     }
 
     let rerand_lock_conn =
-        rerand_store::rerand_catchup_and_lock(&store.pool, &store.schema_name, &sync_result)
-            .await?;
+        rerand_store::rerand_validate_and_lock(&store.pool, &sync_result).await?;
 
     if download_shutdown_handler.is_shutting_down() {
         tracing::warn!("Shutting down has been triggered");
+        rerand_store::release_rerand_lock(rerand_lock_conn).await?;
         return Ok(());
     }
 
-    // refetch store_len in case we rolled back
-    let store_len = store.count_irises().await?;
-    tracing::info!("Database store length after sync: {}", store_len);
-
-    let runtime_handle = tokio::runtime::Handle::current();
-    let anon_stats_writer = if let Some(url) = config.get_anon_stats_db_url() {
-        let schema = config.get_anon_stats_db_schema();
-        let anon_client =
-            AnonStatsPgClient::new(&url, &schema, AnonStatsAccessMode::ReadWrite).await?;
-        let anon_store = AnonStatsStore::new(&anon_client).await?;
-        Some((anon_store, runtime_handle.clone()))
-    } else {
-        tracing::warn!("No database URL configured for anon stats; skipping DB persistence");
-        None
-    };
-    let anon_stats_writer_for_actor = anon_stats_writer.clone();
-
-    let (tx, rx) = oneshot::channel();
-    let config_clone = config.clone();
-    background_tasks.spawn_blocking(move || {
-        let config = config_clone;
-        // --------------------------------------------------------------------------
-        // ANCHOR: Load the database
-        // --------------------------------------------------------------------------
-        tracing::info!("⚓️ ANCHOR: Starting server actor");
-        match ServerActor::new(
-            config.party_id,
-            chacha_seeds,
-            8,
-            config.max_db_size,
-            config.max_batch_size,
-            config.match_distances_buffer_size,
-            config.match_distances_buffer_size_extra_percent,
-            config.return_partial_results,
-            config.disable_persistence,
-            config.enable_debug_timing,
-            config.full_scan_side,
-            config.full_scan_side_switching_enabled,
-            anon_stats_writer_for_actor,
-        ) {
-            Ok((mut actor, handle)) => {
-                tracing::info!("⚓️ ANCHOR: Load the database");
-                let res = if config.fake_db_size > 0 {
-                    // TODO: does this even still work, since we do not page-lock the memory here?
-                    actor.fake_db(config.fake_db_size);
-                    Ok(())
-                } else {
-                    tracing::info!(
-                        "Initialize iris db: Loading from DB (parallelism: {})",
-                        parallelism
-                    );
-                    let download_shutdown_handler = Arc::clone(&download_shutdown_handler);
-
-                    tokio::runtime::Handle::current().block_on(async {
-                        load_iris_db(
-                            &mut actor,
-                            &store,
-                            store_len,
-                            parallelism,
-                            &config,
-                            download_shutdown_handler,
-                        )
-                        .await
-                    })
-                };
+    let startup_result = async {
+        // refetch store_len in case we rolled back
+        let store_len = store.count_irises().await?;
+        tracing::info!("Database store length after sync: {}", store_len);
+
+        let runtime_handle = tokio::runtime::Handle::current();
+        let anon_stats_writer = if let Some(url) = config.get_anon_stats_db_url() {
+            let schema = config.get_anon_stats_db_schema();
+            let anon_client =
+                AnonStatsPgClient::new(&url, &schema, AnonStatsAccessMode::ReadWrite).await?;
+            let anon_store = AnonStatsStore::new(&anon_client).await?;
+            Some((anon_store, runtime_handle.clone()))
+        } else {
+            tracing::warn!("No database URL configured for anon stats; skipping DB persistence");
+            None
+        };
+        let anon_stats_writer_for_actor = anon_stats_writer.clone();
+
+        let (tx, rx) = oneshot::channel();
+        let config_clone = config.clone();
+        background_tasks.spawn_blocking(move || {
+            let config = config_clone;
+            // --------------------------------------------------------------------------
+            // ANCHOR: Load the database
+            // --------------------------------------------------------------------------
+            tracing::info!("⚓️ ANCHOR: Starting server actor");
+            match ServerActor::new(
+                config.party_id,
+                chacha_seeds,
+                8,
+                config.max_db_size,
+                config.max_batch_size,
+                config.match_distances_buffer_size,
+                config.match_distances_buffer_size_extra_percent,
+                config.return_partial_results,
+                config.disable_persistence,
+                config.enable_debug_timing,
+                config.full_scan_side,
+                config.full_scan_side_switching_enabled,
+                anon_stats_writer_for_actor,
+            ) {
+                Ok((mut actor, handle)) => {
+                    tracing::info!("⚓️ ANCHOR: Load the database");
+                    let res = if config.fake_db_size > 0 {
+                        // TODO: does this even still work, since we do not page-lock the memory here?
+                        actor.fake_db(config.fake_db_size);
+                        Ok(())
+                    } else {
+                        tracing::info!(
+                            "Initialize iris db: Loading from DB (parallelism: {})",
+                            parallelism
+                        );
+                        let download_shutdown_handler = Arc::clone(&download_shutdown_handler);
+
+                        tokio::runtime::Handle::current().block_on(async {
+                            load_iris_db(
+                                &mut actor,
+                                &store,
+                                store_len,
+                                parallelism,
+                                &config,
+                                download_shutdown_handler,
+                            )
+                            .await
+                        })
+                    };
 
-                match res {
-                    Ok(_) => {
-                        tx.send(Ok((handle, store))).unwrap();
-                    }
-                    Err(e) => {
-                        tx.send(Err(e)).unwrap();
-                        return Ok(());
+                    match res {
+                        Ok(_) => {
+                            tx.send(Ok((handle, store))).unwrap();
+                        }
+                        Err(e) => {
+                            tx.send(Err(e)).unwrap();
+                            return Ok(());
+                        }
                     }
-                }
 
-                actor.run(); // forever
-            }
-            Err(e) => {
-                tx.send(Err(e)).unwrap();
-                return Ok(());
-            }
-        };
-        Ok(())
-    });
+                    actor.run(); // forever
+                }
+                Err(e) => {
+                    tx.send(Err(e)).unwrap();
+                    return Ok(());
+                }
+            };
+            Ok(())
+        });
 
-    let (mut handle, store) = rx.await??;
+        let startup_result = rx.await;
+        let (handle, store) = startup_result??;
+        Ok::<_, eyre::Report>((handle, store))
+    }
+    .await;
 
     rerand_store::release_rerand_lock(rerand_lock_conn).await?;
+    let (mut handle, store) = startup_result?;
 
     background_tasks.check_tasks();
 
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 390ff10898..8b7ab575fc 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -1,3 +1,6 @@
+use std::cmp::Ordering;
+use std::time::Duration;
+
 use eyre::Result;
 use iris_mpc_common::helpers::sync::{RerandSyncState, SyncResult};
 use sqlx::PgPool;
@@ -111,6 +114,30 @@ pub async fn get_locally_divergent_ids(
     Ok(rows.into_iter().map(|(id,)| id).collect())
 }
 
+async fn get_locally_divergent_ids_tx(
+    tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
+    staging_schema: &str,
+    epoch: i32,
+    chunk_id: i32,
+) -> Result<Vec<i64>> {
+    validate_identifier(staging_schema)?;
+    let sql = format!(
+        r#"
+        SELECT s.id FROM "{}".irises s
+        JOIN irises ON irises.id = s.id
+        WHERE s.epoch = $1 AND s.chunk_id = $2
+          AND irises.version_id != s.original_version_id
+        "#,
+        staging_schema,
+    );
+    let rows: Vec<(i64,)> = sqlx::query_as(&sql)
+        .bind(epoch)
+        .bind(chunk_id)
+        .fetch_all(&mut **tx)
+        .await?;
+    Ok(rows.into_iter().map(|(id,)| id).collect())
+}
+
 /// Delete specific IDs from a staging chunk.
 pub async fn delete_staging_ids(
     pool: &PgPool,
@@ -134,6 +161,28 @@ pub async fn delete_staging_ids(
     Ok(result.rows_affected())
 }
 
+async fn delete_staging_ids_tx(
+    tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
+    staging_schema: &str,
+    epoch: i32,
+    ids: &[i64],
+) -> Result<u64> {
+    if ids.is_empty() {
+        return Ok(0);
+    }
+    validate_identifier(staging_schema)?;
+    let sql = format!(
+        r#"DELETE FROM "{}".irises WHERE epoch = $1 AND id = ANY($2)"#,
+        staging_schema,
+    );
+    let result = sqlx::query(&sql)
+        .bind(epoch)
+        .bind(ids)
+        .execute(&mut **tx)
+        .await?;
+    Ok(result.rows_affected())
+}
+
 pub async fn insert_staging_irises(
     pool: &PgPool,
     staging_schema: &str,
@@ -184,10 +233,21 @@ pub async fn apply_staging_chunk(
 ) -> Result<u64> {
     validate_identifier(staging_schema)?;
     let mut tx = pool.begin().await?;
+    let rows_updated = apply_staging_chunk_in_tx(&mut tx, staging_schema, epoch, chunk_id).await?;
+    tx.commit().await?;
+    Ok(rows_updated)
+}
 
+async fn apply_staging_chunk_in_tx(
+    tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
+    staging_schema: &str,
+    epoch: i32,
+    chunk_id: i32,
+) -> Result<u64> {
+    validate_identifier(staging_schema)?;
     sqlx::query("SELECT pg_advisory_xact_lock($1)")
         .bind(RERAND_APPLY_LOCK)
-        .execute(&mut *tx)
+        .execute(&mut **tx)
         .await?;
 
     let update_sql = format!(
@@ -209,7 +269,7 @@ pub async fn apply_staging_chunk(
     let result = sqlx::query(&update_sql)
         .bind(epoch)
         .bind(chunk_id)
-        .execute(&mut *tx)
+        .execute(&mut **tx)
         .await?;
     let rows_updated = result.rows_affected();
 
@@ -220,7 +280,7 @@ pub async fn apply_staging_chunk(
     sqlx::query(&delete_sql)
         .bind(epoch)
         .bind(chunk_id)
-        .execute(&mut *tx)
+        .execute(&mut **tx)
         .await?;
 
     sqlx::query(
@@ -228,13 +288,52 @@ pub async fn apply_staging_chunk(
     )
     .bind(epoch)
     .bind(chunk_id)
-    .execute(&mut *tx)
+    .execute(&mut **tx)
     .await?;
 
-    tx.commit().await?;
     Ok(rows_updated)
 }
 
+/// Apply a chunk under the modification fence in one transaction.
+///
+/// Transaction scope:
+///   1. Acquire `pg_advisory_xact_lock(RERAND_MODIFY_LOCK)`
+///   2. Compute local diverged IDs
+///   3. Prune union(cross_party_diverged, local_diverged) from staging
+///   4. Apply staging chunk to live (`RERAND_APPLY_LOCK` is acquired inside)
+///   5. Commit (releasing both transaction locks)
+pub async fn fenced_apply_chunk(
+    pool: &PgPool,
+    staging_schema: &str,
+    epoch: i32,
+    chunk_id: i32,
+    cross_party_divergent: Vec<i64>,
+) -> Result<(u64, usize)> {
+    validate_identifier(staging_schema)?;
+    let mut tx = pool.begin().await?;
+    sqlx::query("SELECT pg_advisory_xact_lock($1)")
+        .bind(RERAND_MODIFY_LOCK)
+        .execute(&mut *tx)
+        .await?;
+
+    let local_divergent =
+        get_locally_divergent_ids_tx(&mut tx, staging_schema, epoch, chunk_id).await?;
+
+    let mut skip_ids = cross_party_divergent;
+    skip_ids.extend(&local_divergent);
+    skip_ids.sort_unstable();
+    skip_ids.dedup();
+    let skip_count = skip_ids.len();
+
+    if !skip_ids.is_empty() {
+        delete_staging_ids_tx(&mut tx, staging_schema, epoch, &skip_ids).await?;
+    }
+
+    let rows = apply_staging_chunk_in_tx(&mut tx, staging_schema, epoch, chunk_id).await?;
+    tx.commit().await?;
+    Ok((rows, skip_count))
+}
+
 pub async fn upsert_rerand_progress(pool: &PgPool, epoch: i32, chunk_id: i32) -> Result<()> {
     sqlx::query(
         r#"
@@ -299,20 +398,6 @@ pub async fn get_max_confirmed_chunk(pool: &PgPool, epoch: i32) -> Result<Option
     Ok(row.0)
 }
 
-/// Returns chunks that are confirmed but not yet applied to the live DB.
-/// In normal operation there is at most 1 such chunk (the crash window
-/// between `set_all_confirmed` and `apply_staging_chunk`).
-pub async fn get_confirmed_unapplied_chunks(pool: &PgPool) -> Result<Vec<(i32, i32)>> {
-    let rows: Vec<(i32, i32)> = sqlx::query_as(
-        "SELECT epoch, chunk_id FROM rerand_progress \
-         WHERE all_confirmed = TRUE AND live_applied = FALSE \
-         ORDER BY epoch, chunk_id",
-    )
-    .fetch_all(pool)
-    .await?;
-    Ok(rows)
-}
-
 /// Returns the highest epoch that has any rerand_progress rows.
 pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
     let row: (Option<i32>,) = sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
@@ -348,33 +433,81 @@ pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<Option<RerandSyncS
 }
 
 fn is_undefined_table(err: &eyre::Report) -> bool {
-    if let Some(db_err) = err.root_cause().downcast_ref::<sqlx::error::Error>() {
-        if let sqlx::error::Error::Database(ref pg) = db_err {
-            return pg.code().as_deref() == Some("42P01");
-        }
+    if let Some(db_err) = err.root_cause().downcast_ref::<sqlx::Error>() {
+        return is_undefined_table_sqlx(db_err);
     }
     // Also check the direct error (not just root cause).
     format!("{:?}", err).contains("42P01")
 }
 
-/// Compute the single chunk (if any) that needs to be applied during startup catch-up.
-///
-/// Because the rerand loop has a strict per-chunk synchronization barrier (all 3 parties
-/// must confirm chunk K before any party can stage chunk K+1), peers can be at most
-/// 1 confirmed chunk ahead. Therefore, catch-up is always 0 or 1 chunks.
+fn is_undefined_table_sqlx(err: &sqlx::Error) -> bool {
+    if let sqlx::Error::Database(pg) = err {
+        return pg.code().as_deref() == Some("42P01");
+    }
+    false
+}
+
+/// Check whether all locally confirmed chunks have been applied to live.
 ///
-/// Returns `Some((epoch, chunk_id))` if there is exactly one chunk to catch up,
-/// `None` otherwise.
-pub fn compute_rerand_catchup_chunk(sync_result: &SyncResult) -> Result<Option<(i32, i32)>> {
+/// Returns `Ok(true)` when no confirmed-but-unapplied chunks remain,
+/// `Ok(true)` when the `rerand_progress` table doesn't exist yet
+/// (rolling deploy), and `Err` on real DB failures.
+async fn check_pending_chunks_applied(conn: &mut sqlx::PgConnection) -> Result<bool> {
+    let pending: (i64,) = match sqlx::query_as(
+        "SELECT COUNT(*) FROM rerand_progress \
+         WHERE all_confirmed = TRUE AND live_applied = FALSE",
+    )
+    .fetch_one(&mut *conn)
+    .await
+    {
+        Ok(row) => row,
+        Err(e) if is_undefined_table_sqlx(&e) => return Ok(true),
+        Err(e) => return Err(e.into()),
+    };
+    Ok(pending.0 == 0)
+}
+
+/// Highest `(epoch, chunk_id)` where `live_applied = TRUE`.
+/// Returns `None` when no chunks have been applied yet.
+async fn get_applied_watermark(conn: &mut sqlx::PgConnection) -> Result<Option<(i32, i32)>> {
+    let row: Option<(i32, i32)> = match sqlx::query_as(
+        "SELECT epoch, chunk_id FROM rerand_progress \
+         WHERE live_applied = TRUE \
+         ORDER BY epoch DESC, chunk_id DESC \
+         LIMIT 1",
+    )
+    .fetch_optional(&mut *conn)
+    .await
+    {
+        Ok(row) => row,
+        Err(e) if is_undefined_table_sqlx(&e) => return Ok(None),
+        Err(e) => return Err(e.into()),
+    };
+    Ok(row)
+}
+
+/// Highest (epoch, max_confirmed_chunk) reported by any peer in the
+/// startup snapshot. Returns `None` when no peer has rerand state
+/// (pre-migration rolling deploy).
+fn peer_rerand_target(sync_result: &SyncResult) -> Option<(i32, i32)> {
+    sync_result
+        .all_states
+        .iter()
+        .filter_map(|s| s.rerand_state.as_ref())
+        .map(|s| (s.epoch, s.max_confirmed_chunk))
+        .max() // lexicographic: epoch first, then chunk
+}
+
+/// Returns `Ok(())` if the peer snapshot is within protocol tolerance
+/// and `Err` if fatally desynchronized (gap > 1).
+fn validate_rerand_sync_inner(sync_result: &SyncResult) -> Result<()> {
     let my_state = match sync_result.my_state.rerand_state.as_ref() {
         Some(s) => s,
-        None => return Ok(None),
+        None => return Ok(()),
     };
     let my_epoch = my_state.epoch;
     let my_chunk = my_state.max_confirmed_chunk;
 
-    let mut any_peer_ahead = false;
-
     for s in sync_result
         .all_states
         .iter()
@@ -392,13 +525,8 @@ pub fn compute_rerand_catchup_chunk(sync_result: &SyncResult) -> Result<Option<(
                         my_chunk
                     );
                 }
-                if chunk_diff == 1 {
-                    any_peer_ahead = true;
-                }
-            }
-            1 => {
-                any_peer_ahead = true;
             }
+            1 => {}
             -1 => {}
             _ => {
                 eyre::bail!(
@@ -410,114 +538,176 @@ pub fn compute_rerand_catchup_chunk(sync_result: &SyncResult) -> Result<Option<(
         }
     }
 
-    if !any_peer_ahead {
-        return Ok(None);
-    }
+    Ok(())
+}
 
-    let catchup_chunk = my_chunk + 1;
-    Ok(Some((my_epoch, catchup_chunk)))
+const RERAND_READY_TIMEOUT: Duration = Duration::from_secs(60);
+const RERAND_READY_POLL: Duration = Duration::from_secs(2);
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+enum StartupReadiness {
+    Ready,
+    Behind,
+    Ahead {
+        local_applied: (i32, i32),
+        target: (i32, i32),
+    },
 }
 
-/// Perform rerand catch-up and acquire the advisory lock.
-///
-/// 1. Applies any locally confirmed-but-unapplied chunks (covers the crash
-///    window between `set_all_confirmed` and `apply_staging_chunk`).
-///    Each apply is self-protected by `pg_advisory_xact_lock` inside its
-///    transaction.
-/// 2. If a peer advertises a strictly higher watermark, applies that one
-///    additional chunk (same xact-lock protection).
-/// 3. Acquires a session-level `pg_advisory_lock(RERAND_APPLY_LOCK)` that
-///    the caller holds through `load_iris_db`, preventing the rerand loop
-///    from applying new chunks while the DB snapshot is being read.
-/// 4. Returns the lock-holding connection (caller calls
-///    [`release_rerand_lock`] when done).
-pub async fn rerand_catchup_and_lock(
-    pool: &PgPool,
-    schema_name: &str,
-    sync_result: &SyncResult,
-) -> Result<Option<sqlx::pool::PoolConnection<sqlx::Postgres>>> {
-    let staging_schema = staging_schema_name(schema_name);
+fn classify_startup_readiness_for_target(
+    local_applied: (i32, i32),
+    target: (i32, i32),
+) -> StartupReadiness {
+    if local_applied.cmp(&target) == Ordering::Greater {
+        return StartupReadiness::Ahead {
+            local_applied,
+            target,
+        };
+    }
 
-    // Steps 1+2: apply pending chunks. Each `apply_staging_chunk` acquires
-    // pg_advisory_xact_lock inside its own transaction; we must not hold
-    // the session-level lock yet (it would deadlock across connections).
-    rerand_catchup_inner(pool, &staging_schema, sync_result).await?;
+    if target.1 < 0 {
+        // No confirmed chunks exist in target_epoch yet.
+        return StartupReadiness::Ready;
+    }
 
-    // Step 3: hold the session-level lock through load_iris_db.
-    let mut conn = pool.acquire().await?;
-    sqlx::query("SELECT pg_advisory_lock($1)")
-        .bind(RERAND_APPLY_LOCK)
-        .execute(&mut *conn)
-        .await?;
+    if local_applied == target {
+        StartupReadiness::Ready
+    } else {
+        StartupReadiness::Behind
+    }
+}
+
+async fn get_startup_readiness(
+    conn: &mut sqlx::PgConnection,
+    target: Option<(i32, i32)>,
+) -> Result<StartupReadiness> {
+    if !check_pending_chunks_applied(conn).await? {
+        return Ok(StartupReadiness::Behind);
+    }
+
+    let Some(target) = target else {
+        return Ok(StartupReadiness::Ready);
+    };
 
-    Ok(Some(conn))
+    let local_applied = get_applied_watermark(conn).await?.unwrap_or((-1, -1));
+    Ok(classify_startup_readiness_for_target(local_applied, target))
 }
 
-async fn rerand_catchup_inner(
+/// Wait for local rerand progress to reach the startup snapshot target,
+/// then hold `RERAND_APPLY_LOCK` through DB load.
+///
+/// The loop is lock-first:
+/// 1. acquire `pg_advisory_lock(RERAND_APPLY_LOCK)`,
+/// 2. check readiness while applies are frozen,
+/// 3. if behind, unlock and retry after a short sleep.
+///
+/// This avoids startup/apply races without a separate startup-cap table.
+pub async fn rerand_validate_and_lock(
     pool: &PgPool,
-    staging_schema: &str,
     sync_result: &SyncResult,
-) -> Result<()> {
-    // If the rerand tables haven't been migrated yet, rerand_state is None
-    // (build_rerand_sync_state returns Ok(None) for missing table). Real DB
-    // errors propagate as Err before we get here. Safe to skip catch-up.
+) -> Result<Option<sqlx::pool::PoolConnection<sqlx::Postgres>>> {
     if sync_result.my_state.rerand_state.is_none() {
-        tracing::info!("Rerand catch-up: skipped (rerand not yet migrated)");
-        return Ok(());
+        tracing::info!("Rerand startup lock: skipped (rerand tables not yet migrated)");
+        return Ok(None);
     }
 
-    // Step 1: apply any locally confirmed-but-unapplied chunks.
-    // This closes the crash window where all_confirmed was persisted but
-    // apply_staging_chunk had not yet run.
-    let pending = get_confirmed_unapplied_chunks(pool).await?;
-    for (epoch, chunk_id) in &pending {
-        tracing::info!(
-            "Rerand catch-up: applying locally pending epoch {} chunk {}",
-            epoch,
-            chunk_id,
-        );
-        let rows = apply_staging_chunk(pool, staging_schema, *epoch, *chunk_id).await?;
-        tracing::info!(
-            "Rerand catch-up: applied locally pending epoch {} chunk {} ({} rows)",
-            epoch,
-            chunk_id,
-            rows,
-        );
-    }
+    // One-shot fatal desync check (gap > 1 -> bail).
+    validate_rerand_sync_inner(sync_result)?;
 
-    // Step 2: if a peer is one chunk ahead, apply that chunk too.
-    // If there's no staging data for the chunk, apply is a safe no-op.
-    if let Some((epoch, chunk_id)) = compute_rerand_catchup_chunk(sync_result)? {
-        if !pending.contains(&(epoch, chunk_id)) {
-            tracing::info!(
-                "Rerand catch-up: applying peer-ahead epoch {} chunk {}",
-                epoch,
-                chunk_id,
-            );
-            let rows = apply_staging_chunk(pool, staging_schema, epoch, chunk_id).await?;
-            tracing::info!(
-                "Rerand catch-up: applied peer-ahead epoch {} chunk {} ({} rows)",
-                epoch,
-                chunk_id,
-                rows,
+    let target = peer_rerand_target(sync_result);
+    let deadline = tokio::time::Instant::now() + RERAND_READY_TIMEOUT;
+
+    loop {
+        let mut conn = pool.acquire().await?;
+        let got_lock: (bool,) = sqlx::query_as("SELECT pg_try_advisory_lock($1)")
+            .bind(RERAND_APPLY_LOCK)
+            .fetch_one(&mut *conn)
+            .await?;
+        if !got_lock.0 {
+            drop(conn);
+            if tokio::time::Instant::now() >= deadline {
+                eyre::bail!(
+                    "Rerand lock not available after {:?} (target={:?}); \
+                     ensure the rerand worker is healthy.",
+                    RERAND_READY_TIMEOUT,
+                    target
+                );
+            }
+            tokio::time::sleep(RERAND_READY_POLL).await;
+            continue;
+        }
+
+        let readiness = match get_startup_readiness(&mut conn, target).await {
+            Ok(readiness) => readiness,
+            Err(e) => {
+                let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
+                    .bind(RERAND_APPLY_LOCK)
+                    .execute(&mut *conn)
+                    .await;
+                drop(conn);
+                return Err(e);
+            }
+        };
+
+        match readiness {
+            StartupReadiness::Ready => return Ok(Some(conn)),
+            StartupReadiness::Ahead {
+                local_applied,
+                target,
+            } => {
+                let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
+                    .bind(RERAND_APPLY_LOCK)
+                    .execute(&mut *conn)
+                    .await;
+                drop(conn);
+                eyre::bail!(
+                    "Rerand advanced past startup snapshot target: local_applied={:?}, target={:?}. \
+                     Restart and retry startup.",
+                    local_applied,
+                    target,
+                );
+            }
+            StartupReadiness::Behind => {
+                let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
+                    .bind(RERAND_APPLY_LOCK)
+                    .execute(&mut *conn)
+                    .await;
+                drop(conn);
+            }
+        }
+
+        if tokio::time::Instant::now() >= deadline {
+            eyre::bail!(
+                "Rerand not caught up after {:?} (target={:?}); \
+                 ensure the rerand worker is running.",
+                RERAND_READY_TIMEOUT,
+                target
             );
         }
-    } else if pending.is_empty() {
-        tracing::info!("Rerand catch-up: no chunks to apply");
-    }
 
-    Ok(())
+        tracing::info!(
+            "Waiting for rerand worker catch-up (target={:?}, {:.0}s left)...",
+            target,
+            deadline
+                .saturating_duration_since(tokio::time::Instant::now())
+                .as_secs_f64(),
+        );
+        tokio::time::sleep(RERAND_READY_POLL).await;
+    }
 }
 
-/// Release the advisory lock acquired by [`rerand_catchup_and_lock`].
+/// Release the advisory lock and close the connection.
+///
+/// Explicit release keeps the lock lifecycle clear in logs and avoids
+/// returning a locked connection to the pool.
 pub async fn release_rerand_lock(
     lock_conn: Option<sqlx::pool::PoolConnection<sqlx::Postgres>>,
 ) -> Result<()> {
     if let Some(mut conn) = lock_conn {
-        sqlx::query("SELECT pg_advisory_unlock($1)")
+        let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
             .bind(RERAND_APPLY_LOCK)
             .execute(&mut *conn)
-            .await?;
+            .await;
         drop(conn);
         tracing::info!("Rerand advisory lock released after DB load");
     }
@@ -544,7 +734,7 @@ mod tests {
     }
 
     #[test]
-    fn test_catchup_peer_one_chunk_ahead() {
+    fn test_validate_peer_one_chunk_ahead_ok() {
         let p0 = dummy_sync_state(1, 4);
         let p1 = dummy_sync_state(1, 4);
         let p2 = dummy_sync_state(1, 5);
@@ -552,14 +742,11 @@ mod tests {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert_eq!(
-            compute_rerand_catchup_chunk(&sync_result).unwrap(),
-            Some((1, 5))
-        );
+        assert!(validate_rerand_sync_inner(&sync_result).is_ok());
     }
 
     #[test]
-    fn test_catchup_all_same() {
+    fn test_validate_all_same_ok() {
         let p0 = dummy_sync_state(1, 5);
         let p1 = dummy_sync_state(1, 5);
         let p2 = dummy_sync_state(1, 5);
@@ -567,11 +754,11 @@ mod tests {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert_eq!(compute_rerand_catchup_chunk(&sync_result).unwrap(), None);
+        assert!(validate_rerand_sync_inner(&sync_result).is_ok());
     }
 
     #[test]
-    fn test_catchup_peer_epoch_ahead() {
+    fn test_validate_peer_epoch_ahead_ok() {
         let p0 = dummy_sync_state(0, 5);
         let p1 = dummy_sync_state(1, 0);
         let p2 = dummy_sync_state(0, 5);
@@ -579,14 +766,11 @@ mod tests {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert_eq!(
-            compute_rerand_catchup_chunk(&sync_result).unwrap(),
-            Some((0, 6))
-        );
+        assert!(validate_rerand_sync_inner(&sync_result).is_ok());
     }
 
     #[test]
-    fn test_catchup_peer_epoch_behind() {
+    fn test_validate_peer_epoch_behind_ok() {
         let p0 = dummy_sync_state(1, 2);
         let p1 = dummy_sync_state(0, 10);
         let p2 = dummy_sync_state(1, 2);
@@ -594,11 +778,11 @@ mod tests {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert_eq!(compute_rerand_catchup_chunk(&sync_result).unwrap(), None);
+        assert!(validate_rerand_sync_inner(&sync_result).is_ok());
     }
 
     #[test]
-    fn test_catchup_fatal_chunk_desync() {
+    fn test_validate_fatal_chunk_desync() {
         let p0 = dummy_sync_state(1, 2);
         let p1 = dummy_sync_state(1, 4);
         let p2 = dummy_sync_state(1, 2);
@@ -606,11 +790,11 @@ mod tests {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert!(compute_rerand_catchup_chunk(&sync_result).is_err());
+        assert!(validate_rerand_sync_inner(&sync_result).is_err());
     }
 
     #[test]
-    fn test_catchup_fatal_epoch_desync() {
+    fn test_validate_fatal_epoch_desync() {
         let p0 = dummy_sync_state(1, 2);
         let p1 = dummy_sync_state(3, 10);
         let p2 = dummy_sync_state(1, 2);
@@ -618,6 +802,43 @@ mod tests {
             my_state: p0.clone(),
             all_states: vec![p0, p1, p2],
         };
-        assert!(compute_rerand_catchup_chunk(&sync_result).is_err());
+        assert!(validate_rerand_sync_inner(&sync_result).is_err());
+    }
+
+    #[test]
+    fn test_classify_target_chunk_minus_one_previous_epoch_applied_is_ready() {
+        let readiness = classify_startup_readiness_for_target((0, 42), (1, -1));
+        assert_eq!(readiness, StartupReadiness::Ready);
+    }
+
+    #[test]
+    fn test_classify_target_chunk_minus_one_same_epoch_applied_is_ahead() {
+        let readiness = classify_startup_readiness_for_target((1, 0), (1, -1));
+        assert_eq!(
+            readiness,
+            StartupReadiness::Ahead {
+                local_applied: (1, 0),
+                target: (1, -1)
+            }
+        );
+    }
+
+    #[test]
+    fn test_classify_target_positive_behind_ready_ahead() {
+        assert_eq!(
+            classify_startup_readiness_for_target((1, 2), (1, 3)),
+            StartupReadiness::Behind
+        );
+        assert_eq!(
+            classify_startup_readiness_for_target((1, 3), (1, 3)),
+            StartupReadiness::Ready
+        );
+        assert_eq!(
+            classify_startup_readiness_for_target((1, 4), (1, 3)),
+            StartupReadiness::Ahead {
+                local_applied: (1, 4),
+                target: (1, 3)
+            }
+        );
     }
 }
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 7423a4bb15..84e8430cab 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -4,10 +4,9 @@ use bytemuck::cast_slice;
 use eyre::Result;
 use futures::StreamExt;
 use iris_mpc_store::rerand::{
-    apply_staging_chunk, delete_staging_chunk, delete_staging_ids, get_current_epoch,
-    get_locally_divergent_ids, get_rerand_progress, get_staging_version_map, insert_staging_irises,
-    set_all_confirmed, set_staging_written, staging_schema_name, upsert_rerand_progress,
-    StagingIrisEntry, RERAND_MODIFY_LOCK,
+    delete_staging_chunk, fenced_apply_chunk, get_current_epoch, get_rerand_progress,
+    get_staging_version_map, insert_staging_irises, set_all_confirmed, set_staging_written,
+    staging_schema_name, upsert_rerand_progress, StagingIrisEntry,
 };
 use iris_mpc_store::Store;
 use sqlx::PgPool;
@@ -161,7 +160,7 @@ pub async fn run_continuous_rerand(
             }
 
             // --- Modification fence ---
-            // 1. Compute cross-party version_id disagreements
+            // 1. Compute cross-party version_id disagreements (before lock)
             let cross_party_divergent = s3_coordination::compute_cross_party_divergent_ids(
                 s3,
                 &config.s3_bucket,
@@ -171,57 +170,23 @@ pub async fn run_continuous_rerand(
             )
             .await?;
 
-            // 2. Lock to prevent new modifications during apply
-            let mut modify_lock_conn = pool.acquire().await?;
-            sqlx::query("SELECT pg_advisory_lock($1)")
-                .bind(RERAND_MODIFY_LOCK)
-                .execute(&mut *modify_lock_conn)
-                .await?;
-
-            // 3. Check local staging vs live for post-staging modifications
-            let local_divergent = get_locally_divergent_ids(
+            // 2-6. Lock, check, prune, apply, unlock — helper guarantees
+            //      unlock on all error paths.
+            let (rows, skip_count) = fenced_apply_chunk(
                 pool,
                 &staging_schema,
                 active_epoch as i32,
                 chunk_id as i32,
+                cross_party_divergent,
             )
             .await?;
 
-            // 4. Union of both divergence sources
-            let mut skip_ids: Vec<i64> = cross_party_divergent;
-            skip_ids.extend(&local_divergent);
-            skip_ids.sort_unstable();
-            skip_ids.dedup();
-
-            if !skip_ids.is_empty() {
-                tracing::info!(
-                    "Epoch {} chunk {}: skipping {} IDs due to concurrent modifications: {:?}",
-                    active_epoch,
-                    chunk_id,
-                    skip_ids.len(),
-                    &skip_ids[..std::cmp::min(skip_ids.len(), 10)],
-                );
-                delete_staging_ids(pool, &staging_schema, active_epoch as i32, &skip_ids).await?;
-            }
-
-            // 5. Apply (now consistent across all parties)
-            let rows =
-                apply_staging_chunk(pool, &staging_schema, active_epoch as i32, chunk_id as i32)
-                    .await?;
-
-            // 6. Release modification lock
-            sqlx::query("SELECT pg_advisory_unlock($1)")
-                .bind(RERAND_MODIFY_LOCK)
-                .execute(&mut *modify_lock_conn)
-                .await?;
-            drop(modify_lock_conn);
-
             tracing::info!(
                 "Epoch {} chunk {}: applied to live DB ({} rows updated, {} skipped)",
                 active_epoch,
                 chunk_id,
                 rows,
-                skip_ids.len(),
+                skip_count,
             );
 
             chunk_id += 1;
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index a6fbdbd508..1bc49ff8a3 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -257,41 +257,41 @@ fn phase6_multiple_epochs() {
 }
 
 // ============================================================================
-// Phase 7: Epoch boundary desync -- simulate epoch mismatch
+// Phase 7: Startup validation rejects fatal desync and accepts in-sync state.
 // ============================================================================
 
 #[test]
-fn phase7_epoch_boundary_desync() {
+fn phase7_startup_validation() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
         let env = TestEnv::setup().await?;
-        println!("[phase 7] Epoch boundary desync...");
+        println!("[phase 7] Startup validation...");
 
-        // Setup the exact boundary desync state in DB manually to test catch-up logic
-        // P1 is on Epoch 0 (has max epoch 0)
-        // P0 and P2 are on Epoch 1 (have max epoch 1)
+        // Fatal desync (gap > 1) → immediate bail
         for p in 0..NUM_PARTIES {
             let pool = &env.harness.parties[p].store.pool;
-            // Everyone completes Epoch 0
             sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (0, 0, TRUE, TRUE, TRUE)")
-            .execute(pool).await.unwrap();
+                .execute(pool).await.unwrap();
         }
+        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (2, 0, TRUE, TRUE, TRUE)")
+            .execute(&env.harness.parties[0].store.pool).await.unwrap();
 
-        // P0 and P2 move to Epoch 1
-        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (1, 0, TRUE, TRUE, FALSE)")
-    .execute(&env.harness.parties[0].store.pool).await.unwrap();
-        sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (1, 0, TRUE, TRUE, FALSE)")
-.execute(&env.harness.parties[2].store.pool).await.unwrap();
+        let r_fatal = simulate_server_startup(&env.harness, 1).await;
+        assert!(r_fatal.is_err(), "Fatal epoch gap should bail immediately");
 
-        // Now simulate P1 main server startup (P1 is behind on Epoch 0)
-        // Should catch up using safe_up_to = i32::MAX
-        let r1 = simulate_server_startup(&env.harness, 1).await;
-        assert!(r1.is_ok(), "P1 startup failed during epoch mismatch");
+        // In-sync → startup succeeds immediately
+        for p in 0..NUM_PARTIES {
+            let pool = &env.harness.parties[p].store.pool;
+            sqlx::query("DELETE FROM rerand_progress")
+                .execute(pool)
+                .await
+                .unwrap();
+            sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (0, 0, TRUE, TRUE, TRUE)")
+                .execute(pool).await.unwrap();
+        }
 
-        // Now simulate P0 main server startup (P0 is ahead on Epoch 1)
-        // Should catch up using safe_up_to = -1 (nobody confirmed Epoch 1 yet since P1 hasn't started it)
-        let r0 = simulate_server_startup(&env.harness, 0).await;
-        assert!(r0.is_ok(), "P0 startup failed during epoch mismatch");
+        let r_ok = simulate_server_startup(&env.harness, 0).await;
+        assert!(r_ok.is_ok(), "In-sync startup should succeed");
 
         println!("[phase 7] PASSED");
 
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index b4fd934fd0..7387c4a310 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -414,12 +414,12 @@ pub async fn wait_chunks_staged(harness: &TestHarness, epoch: i32, n: i32) -> Re
 pub async fn simulate_server_startup(harness: &TestHarness, party: usize) -> Result<()> {
     let sync_result = build_test_sync_result(harness, party).await?;
     let pool = &harness.parties[party].store.pool;
-    let schema = &harness.parties[party].schema_name;
-    let lock_conn = rerand_store::rerand_catchup_and_lock(pool, schema, &sync_result).await?;
-    let _count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM irises")
+    let lock_conn = rerand_store::rerand_validate_and_lock(pool, &sync_result).await?;
+    let query_result: Result<(i64,), sqlx::Error> = sqlx::query_as("SELECT COUNT(*) FROM irises")
         .fetch_one(pool)
-        .await?;
+        .await;
     rerand_store::release_rerand_lock(lock_conn).await?;
+    let _count = query_result?;
     Ok(())
 }
 
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index 35d9ad91fc..53976743fb 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -139,42 +139,44 @@ pub async fn server_main(config: Config) -> Result<()> {
 
     sync_sqs_queues(&config, &sync_result, &aws_clients).await?;
 
-    let rerand_lock_conn = rerand_store::rerand_catchup_and_lock(
-        &iris_store.pool,
-        &iris_store.schema_name,
-        &sync_result,
-    )
-    .await?;
+    let rerand_lock_conn =
+        rerand_store::rerand_validate_and_lock(&iris_store.pool, &sync_result).await?;
 
     if shutdown_handler.is_shutting_down() {
         tracing::warn!("Shutting down has been triggered");
+        rerand_store::release_rerand_lock(rerand_lock_conn).await?;
         return Ok(());
     }
 
-    let mut hawk_actor = init_hawk_actor(&config, &shutdown_handler).await?;
-
-    if let Some(url) = config.get_anon_stats_db_url() {
-        let schema = config.get_anon_stats_db_schema();
-        let anon_client =
-            AnonStatsPgClient::new(&url, &schema, AnonStatsAccessMode::ReadWrite).await?;
-        let anon_store = AnonStatsStore::new(&anon_client).await?;
-        hawk_actor.set_anon_stats_store(Some(anon_store));
-    } else {
-        tracing::warn!(
-                "Anon stats persistence enabled but no anon stats database configured; skipping DB writes"
-            );
-    }
+    let startup_result = async {
+        let mut hawk_actor = init_hawk_actor(&config, &shutdown_handler).await?;
+
+        if let Some(url) = config.get_anon_stats_db_url() {
+            let schema = config.get_anon_stats_db_schema();
+            let anon_client =
+                AnonStatsPgClient::new(&url, &schema, AnonStatsAccessMode::ReadWrite).await?;
+            let anon_store = AnonStatsStore::new(&anon_client).await?;
+            hawk_actor.set_anon_stats_store(Some(anon_store));
+        } else {
+            tracing::warn!(
+                    "Anon stats persistence enabled but no anon stats database configured; skipping DB writes"
+                );
+        }
 
-    load_database(
-        &config,
-        &iris_store,
-        &graph_store,
-        &shutdown_handler,
-        &mut hawk_actor,
-    )
-    .await?;
+        load_database(
+            &config,
+            &iris_store,
+            &graph_store,
+            &shutdown_handler,
+            &mut hawk_actor,
+        )
+        .await?;
+        Ok::<_, eyre::Report>(hawk_actor)
+    }
+    .await;
 
     rerand_store::release_rerand_lock(rerand_lock_conn).await?;
+    let hawk_actor = startup_result?;
 
     background_tasks.check_tasks();
 

From 82a306b41987292cf2d0f544d057ce4081a88963 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sat, 28 Feb 2026 23:11:31 +0100
Subject: [PATCH 10/76] update design

---
 Cargo.lock                                    |  28 +-
 Cargo.toml                                    |   8 +-
 docs/specs/rerandomization.md                 | 415 +++++++--
 iris-mpc-bins/bin/iris-mpc/server.rs          | 346 ++++---
 iris-mpc-common/src/helpers/sync.rs           |  56 +-
 iris-mpc-store/Cargo.toml                     |   4 +
 iris-mpc-store/src/lib.rs                     |  33 +
 iris-mpc-store/src/rerand.rs                  | 863 ++++++++----------
 iris-mpc-upgrade/src/continuous_rerand.rs     | 116 ++-
 iris-mpc-upgrade/src/s3_coordination.rs       |  51 +-
 iris-mpc-upgrade/tests/test_utils.rs          |  26 +-
 iris-mpc/Cargo.toml                           |   2 +
 iris-mpc/src/server/mod.rs                    | 132 ++-
 iris-mpc/src/services/processors/batch.rs     |  17 +-
 iris-mpc/src/services/processors/job.rs       |   7 +
 .../services/processors/modifications_sync.rs | 245 +++--
 ...60226000004_create_rerand_control.down.sql |   1 +
 ...0260226000004_create_rerand_control.up.sql |   8 +
 18 files changed, 1438 insertions(+), 920 deletions(-)
 create mode 100644 migrations/20260226000004_create_rerand_control.down.sql
 create mode 100644 migrations/20260226000004_create_rerand_control.up.sql

diff --git a/Cargo.lock b/Cargo.lock
index b11660e34b..03166ad567 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -78,7 +78,7 @@ checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
 [[package]]
 name = "ampc-actor-utils"
 version = "0.1.0"
-source = "git+https://github.com/worldcoin/ampc-common.git?rev=a44f978cc0c39942ef360a447f91e76769e800bb#a44f978cc0c39942ef360a447f91e76769e800bb"
+source = "git+https://github.com/worldcoin/ampc-common.git?rev=9e19ea57b1f7db11843d39ada2599557ce028a7b#9e19ea57b1f7db11843d39ada2599557ce028a7b"
 dependencies = [
  "aes-prng",
  "ampc-secret-sharing",
@@ -109,7 +109,7 @@ dependencies = [
 [[package]]
 name = "ampc-anon-stats"
 version = "0.1.0"
-source = "git+https://github.com/worldcoin/ampc-common.git?rev=a44f978cc0c39942ef360a447f91e76769e800bb#a44f978cc0c39942ef360a447f91e76769e800bb"
+source = "git+https://github.com/worldcoin/ampc-common.git?rev=9e19ea57b1f7db11843d39ada2599557ce028a7b#9e19ea57b1f7db11843d39ada2599557ce028a7b"
 dependencies = [
  "ampc-actor-utils",
  "ampc-secret-sharing",
@@ -135,7 +135,7 @@ dependencies = [
 [[package]]
 name = "ampc-secret-sharing"
 version = "0.1.0"
-source = "git+https://github.com/worldcoin/ampc-common.git?rev=a44f978cc0c39942ef360a447f91e76769e800bb#a44f978cc0c39942ef360a447f91e76769e800bb"
+source = "git+https://github.com/worldcoin/ampc-common.git?rev=9e19ea57b1f7db11843d39ada2599557ce028a7b#9e19ea57b1f7db11843d39ada2599557ce028a7b"
 dependencies = [
  "aes-prng",
  "bytemuck",
@@ -150,7 +150,7 @@ dependencies = [
 [[package]]
 name = "ampc-server-utils"
 version = "0.1.0"
-source = "git+https://github.com/worldcoin/ampc-common.git?rev=a44f978cc0c39942ef360a447f91e76769e800bb#a44f978cc0c39942ef360a447f91e76769e800bb"
+source = "git+https://github.com/worldcoin/ampc-common.git?rev=9e19ea57b1f7db11843d39ada2599557ce028a7b#9e19ea57b1f7db11843d39ada2599557ce028a7b"
 dependencies = [
  "aws-sdk-secretsmanager",
  "aws-sdk-sqs",
@@ -1237,7 +1237,7 @@ dependencies = [
  "bitflags 2.6.0",
  "cexpr",
  "clang-sys",
- "itertools 0.13.0",
+ "itertools 0.10.5",
  "log",
  "prettyplease",
  "proc-macro2",
@@ -2919,12 +2919,14 @@ dependencies = [
  "aws-sdk-secretsmanager",
  "aws-sdk-sns",
  "aws-sdk-sqs",
+ "axum 0.7.7",
  "base64",
  "bincode",
  "bytemuck",
  "chrono",
  "clap",
  "eyre",
+ "futures",
  "iris-mpc-common",
  "iris-mpc-cpu",
  "iris-mpc-store",
@@ -3170,9 +3172,13 @@ dependencies = [
  "iris-mpc-common",
  "itertools 0.13.0",
  "rand 0.8.5",
+ "reqwest",
+ "serde_json",
  "sqlx",
  "tokio",
+ "tokio-util",
  "tracing",
+ "uuid",
 ]
 
 [[package]]
@@ -3382,7 +3388,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.6",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
@@ -4348,7 +4354,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
  "bytes",
  "heck",
- "itertools 0.13.0",
+ "itertools 0.10.5",
  "log",
  "multimap",
  "once_cell",
@@ -4381,7 +4387,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
 dependencies = [
  "anyhow",
- "itertools 0.13.0",
+ "itertools 0.10.5",
  "proc-macro2",
  "quote",
  "syn",
@@ -4505,7 +4511,7 @@ dependencies = [
  "quinn-udp",
  "rustc-hash",
  "rustls 0.23.35",
- "socket2 0.6.0",
+ "socket2 0.5.7",
  "thiserror 2.0.16",
  "tokio",
  "tracing",
@@ -4542,7 +4548,7 @@ dependencies = [
  "cfg_aliases",
  "libc",
  "once_cell",
- "socket2 0.6.0",
+ "socket2 0.5.7",
  "tracing",
  "windows-sys 0.59.0",
 ]
@@ -6573,7 +6579,7 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.48.0",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 29fba22e33..fe8e6ea1ae 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -79,10 +79,10 @@ tokio-util = "0.7.15"
 toml = { version = "0.8.23", features = ["preserve_order"] }
 uuid = { version = "1", features = ["v4"] }
 iris-mpc-cpu = { path = "./iris-mpc-cpu" }
-ampc-anon-stats =  { git = "https://github.com/worldcoin/ampc-common.git", rev = "a44f978cc0c39942ef360a447f91e76769e800bb" }
-ampc-actor-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "a44f978cc0c39942ef360a447f91e76769e800bb" }
-ampc-secret-sharing = { git = "https://github.com/worldcoin/ampc-common.git", rev = "a44f978cc0c39942ef360a447f91e76769e800bb" }
-ampc-server-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "a44f978cc0c39942ef360a447f91e76769e800bb" }
+ampc-anon-stats =  { git = "https://github.com/worldcoin/ampc-common.git", rev = "9e19ea57b1f7db11843d39ada2599557ce028a7b" }
+ampc-actor-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "9e19ea57b1f7db11843d39ada2599557ce028a7b" }
+ampc-secret-sharing = { git = "https://github.com/worldcoin/ampc-common.git", rev = "9e19ea57b1f7db11843d39ada2599557ce028a7b" }
+ampc-server-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "9e19ea57b1f7db11843d39ada2599557ce028a7b" }
 
 # Abort on panics rather than unwinding.
 # This improves performance and makes panic propagation more reliable.
diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index 30bd5d59b5..0ff6a712d4 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -6,10 +6,22 @@ Replaces the existing, one-off rerandomization protocol by a continuous, online
 
 Key design decision: in-memory shares are less likely to be exfiltrated, so only the DB (at-rest persistence) is rerandomized. The actor is completely unmodified. The rerand server handles everything, writing to a staging schema and then copying to live once all parties confirm.
 
+## Critical assumption: reliable modification delivery
+
+The correctness of this protocol depends on **every modification (reauth, deletion, reset) eventually arriving at every party via SQS**. This is a pre-existing system invariant — without it, the MPC shares diverge regardless of rerandomization. Rerandomization does not weaken this guarantee, but it does create a new transient inconsistency window (see [Post-staging modifications](#post-staging-modifications-transient-inconsistency)) that relies on modification delivery to self-correct.
+
+Three mechanisms enforce this invariant:
+
+1. **SQS delete after persist** — the SQS message is only deleted *after* the modification row is durably written to the DB. If the process crashes between receiving and persisting, SQS redelivers the message. This eliminates the window where a message could be lost between delete and persist.
+
+2. **Startup reconciliation recovers missing modifications** — `sync_modifications` compares modification state across all three parties. If a modification is completed on peers but missing locally (e.g., from a historical race before the delete-after-persist fix), it is now recovered from the peer's copy and applied locally. Large modification-ID drift across nodes is logged as an error but does not crash the process, allowing best-effort reconciliation.
+
+3. **Remaining gap**: `sync_modifications` is a startup procedure, not a continuous background loop. A running node that permanently lost a modification (and never restarts) will stay inconsistent for the affected row until the next epoch re-randomizes it. Periodic rolling restarts or a future continuous reconciliation loop would close this gap entirely.
+
 ## Architecture
 
-1. **Rerand Server** (modified `iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs`, separate process, one per party) — rerandomizes shares, writes to staging, coordinates with peers via S3 markers, copies confirmed chunks to live DB. Replaces the existing one-off `RerandomizeDb` subcommand with a new `RerandomizeContinuous` subcommand. Core rerandomization logic in `iris-mpc-upgrade/src/rerandomization.rs` is reused; the new subcommand adds the continuous loop, S3 coordination, and staging management.
-2. **Main Server** (existing, minimal changes) — at startup, syncs rerand progress with peers and catches up any missing chunks from staging before loading the DB into memory. Acquires `RERAND_MODIFY_LOCK` during modification writes to serialize with rerand applies.
+1. **Rerand Server** (separate process, one per party) — rerandomizes shares, writes to staging, coordinates with peers via S3 markers, copies confirmed chunks to live DB.
+2. **Main Server** (existing, minimal changes) — acquires `RERAND_APPLY_LOCK` at startup to freeze applies during `load_iris_db`. Acquires `RERAND_MODIFY_LOCK` during modification writes to serialize with rerand applies.
 
 The GPU actor, batch processing, and result processor are completely untouched.
 
@@ -135,36 +147,51 @@ Chunk ranges are derived from the manifest (`chunk_size`, `max_id_inclusive`) an
 
 Lifecycle: `staging_written` → `all_confirmed` → `live_applied`.
 
+### Control table (freeze protocol)
+
+A `rerand_control` table with a single row, used for the coordinated freeze between the main server and the rerand worker:
+
+```sql
+CREATE TABLE rerand_control (
+    id                  INTEGER PRIMARY KEY DEFAULT 1 CHECK (id = 1),
+    freeze_requested    BOOLEAN NOT NULL DEFAULT FALSE,
+    freeze_generation   TEXT,
+    frozen_generation   TEXT
+);
+INSERT INTO rerand_control (id) VALUES (1) ON CONFLICT DO NOTHING;
+```
+
+- `freeze_requested`: set to `TRUE` by the main server during startup; the rerand worker checks this between chunks.
+- `freeze_generation`: a unique UUID per freeze request (fencing token); prevents stale acknowledgements from prior startups.
+- `frozen_generation`: written by the rerand worker to acknowledge the freeze; main server polls until this matches `freeze_generation`.
+
 ## Flow
 
 ### Step 1: Rerand Server (per party, separate process)
 
-Runs continuously:
+Runs continuously. **Between every chunk boundary** (and between epochs), the worker checks the `rerand_control` table for a freeze request from the main server. If `freeze_requested = TRUE`, it writes `frozen_generation = <generation>` to acknowledge the freeze, then blocks until `freeze_requested = FALSE`. This guarantees the worker is quiesced and not holding any locks during DB load.
 
 1. Determine the active epoch E (uses local `rerand_progress` as start hint, then scans S3 for the highest epoch with a manifest but without all three `complete` markers).
 2. Derive `shared_secret` for epoch E (keygen or resume — see above)
 3. Pick next chunk range `[start, end)` for chunk K from the manifest
 4. **Stage**: delete any partial staging data for this chunk (crash recovery clean slate), read entries from live schema recording each entry's `version_id`, rerandomize shares using `BLAKE3(shared_secret || iris_id)` XOF, write to staging schema with `epoch = E`, `original_version_id`, `chunk_id = K`, and `rerand_epoch = E + 1`
 5. Set `staging_written = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
-6. Upload version map `[(id, original_version_id)]` for the chunk to S3: `s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/version-map`
+6. Upload version map `[(id, original_version_id)]` and its blake3 hash for the chunk to S3
 7. Upload S3 staged marker: `s3://bucket/rerand/epoch-{E}/party-{P}/chunk-{K}/staged`
 8. Poll S3 until all 3 party staged markers exist for chunk K
 9. Set `all_confirmed = TRUE` in local `rerand_progress` for `(epoch = E, chunk_id = K)`
-10. **Modification fence and Apply**:
-    a. Download all 3 parties' version maps for chunk K. Compute cross-party disagreement set: IDs where any party captured a different `original_version_id`.
-    b. **Fenced Apply Transaction**: open a single transaction to check local divergence and apply safely:
+10. **Apply**:
+    a. Download all 3 parties' version-map hashes. If all match (fast path), the staging-divergent set is empty. If any differ, download full maps and compute cross-party disagreements: IDs where any party captured a different `original_version_id`. This is purely S3 reads — no DB lock is held.
+    b. **Apply transaction**: open a single transaction, acquire locks, delete staging-divergent rows, apply with `version_id` CAS, clean up:
        ```sql
        BEGIN;
-       -- Block the main server from writing modifications
        SELECT pg_advisory_xact_lock(RERAND_MODIFY_LOCK);
+       SELECT pg_advisory_xact_lock(RERAND_APPLY_LOCK);
 
-       -- Query local divergences (modifications that landed after staging)
-       -- ... compute skip set = cross-party disagreements ∪ local divergences ...
-       -- Delete skip set from staging schema
+       -- Delete staging-divergent IDs (cross-party disagreements)
+       DELETE FROM staging.irises WHERE epoch = E AND chunk_id = K AND id = ANY(staging_divergent);
 
-       -- Proceed to apply
-       SELECT pg_advisory_xact_lock(RERAND_APPLY_LOCK);
-       
+       -- Apply with version_id CAS — silently skips post-staging modifications
        UPDATE irises SET
          left_code = staging.left_code,
          left_mask = staging.left_mask,
@@ -176,82 +203,94 @@ Runs continuously:
          AND staging.epoch = E
          AND staging.chunk_id = K
          AND irises.version_id = staging.original_version_id;
-         
+
        DELETE FROM staging_schema.irises WHERE epoch = E AND chunk_id = K;
        UPDATE rerand_progress SET live_applied = TRUE WHERE epoch = E AND chunk_id = K;
-       COMMIT;  -- Both RERAND_MODIFY_LOCK and RERAND_APPLY_LOCK released here
+       COMMIT;  -- Both locks released here
        ```
 11. Proceed to next chunk (or start epoch transition if all chunks done).
 
+**Key property: no S3 I/O while holding DB locks.** The version-map comparison (step 10a) completes before the transaction opens. Lock hold time is bounded by DB I/O only.
+
 **Crash recovery for staging**: if the process crashes mid-staging, `staging_written` is still `FALSE`. On restart, the code re-enters the staging block and deletes any partial rows before re-reading. This ensures all staging rows come from one read pass (no mixed-snapshot version_ids). Inserts use `ON CONFLICT (epoch, id) DO NOTHING` as a safety net.
 
 **Crash recovery for S3 upload**: the S3 staged marker upload is outside the `if !staging_written` block. If the process crashes after `set_staging_written` but before the S3 upload, the marker is re-uploaded on restart (idempotent PUT).
 
-### Step 2: Main Server Startup (minimal changes)
+**Crash recovery for apply**: if the process crashes during the apply transaction, the transaction rolls back (releasing both locks). On restart, `live_applied` is still `FALSE`, so the apply is retried. The `version_id` CAS re-evaluates against current live values — safe and idempotent.
+
+### Step 2: Main Server Startup
 
 At startup, before `load_iris_db`:
 
 1. **Existing**: modification sync (`sync_modifications`) — all parties catch up on modifications, producing identical `version_id` values. This transaction acquires `pg_advisory_xact_lock(RERAND_MODIFY_LOCK)` to serialize with rerand applies.
-2. **New**: rerand sync validation — parties exchange a compact rerand watermark during the existing startup sync (`SyncState` exchange):
-   - Each party computes `(epoch, max_confirmed_chunk)` from its local `rerand_progress` table: the active epoch E and the highest `chunk_id` where `all_confirmed = TRUE`. Returns `None` if the `rerand_progress` table doesn't exist yet (rolling deploy before migration); real DB errors propagate as `Err`.
-   - Each party sends this single `(epoch, max_confirmed_chunk)` pair as part of `SyncState`.
-   - The startup validator checks two invariants: epoch gap ≤ 1 and chunk gap ≤ 1 (within the same epoch). If either is violated, startup fails (indicates DB corruption).
-   - If a peer is at most 1 chunk or 1 epoch ahead, this is within protocol tolerance. **Startup does NOT apply any chunks.** The rerand worker is responsible for catch-up through the full modification-fence path. If the node is behind, the rerand worker must run and complete the pending chunk before the next startup succeeds.
-3. **New**: lock-first startup readiness loop:
-   - Acquire `pg_advisory_lock(RERAND_APPLY_LOCK)` on a dedicated connection.
-   - While the lock is held (applies frozen), verify local DB readiness: no `all_confirmed=TRUE AND live_applied=FALSE` rows remain, and local applied watermark has reached the startup target from step 2.
-   - Special case: if startup target is `(epoch = E, max_confirmed_chunk = -1)`, no chunk is confirmed in epoch E yet, so startup does not wait for an apply in epoch E.
-   - If behind, release the lock and retry after a short sleep (lets rerand worker apply the pending chunk).
-   - If local watermark has already advanced past the startup target, fail startup (snapshot stale; restart and resync).
-   - Once ready, keep the lock held through step 4. This prevents the rerand loop from applying new chunks while the DB snapshot is being read.
-4. **Existing**: `load_iris_db` — loads from live DB into GPU memory. The advisory lock is still held, so the rerand server cannot apply new chunks while the DB is being read into memory.
-5. Release the advisory lock: `SELECT pg_advisory_unlock(RERAND_APPLY_LOCK)` on the dedicated connection, then drop the connection.
+2. **New — Coordinated freeze with watermark convergence**:
+   a. Write `freeze_requested = TRUE, freeze_generation = <uuid>` to `rerand_control`. This signals the rerand worker to pause at the next chunk boundary.
+   b. Poll `rerand_control` until `frozen_generation = <uuid>` (the worker has acknowledged the freeze and is not holding any locks or applying any chunks).
+   c. Fetch live applied watermarks from all peers via their `/rerand-watermark` endpoint (always queries the DB — not a stale snapshot).
+   d. Compare watermarks. Three cases:
+      - **All equal** → proceed to DB load.
+      - **Local is behind max(peers)** → release the local freeze so the worker can catch up (apply the pending chunk), sleep briefly, then re-freeze and re-check from step (a).
+      - **Local is at or ahead of max(peers)** → stay frozen and re-poll peers after a short sleep. The behind parties' startups will release their own freezes, letting their workers catch up.
+   e. This loop converges because: only behind parties release their freeze, leading parties stay frozen (can't advance), and the S3 barrier limits the gap to at most 1 chunk. Timeout after 2 minutes if convergence doesn't happen (indicates a stuck worker).
+3. **New**: acquire `RERAND_APPLY_LOCK` on a dedicated connection (belt-and-suspenders with the freeze).
+4. **Existing**: `load_iris_db` — loads from live DB into GPU/HNSW memory. Both the freeze and the advisory lock are held, so the rerand server cannot apply new chunks.
+5. Release `RERAND_APPLY_LOCK`.
+6. Clear `freeze_requested = FALSE` in `rerand_control`. The rerand worker resumes.
+
+**Rollout note**: if the `rerand_control` table doesn't exist yet (pre-migration), the freeze is skipped and startup proceeds without the freeze handshake.
+
+**Fail-closed invariant**: modification drift that exceeds the configured lookback window causes a hard panic (not a best-effort continue). This prevents startup with incomplete reconciliation.
 
-**Why startup does not apply chunks**: every chunk apply must go through the modification fence (cross-party version-map exchange + local divergence check under `RERAND_MODIFY_LOCK`). The startup path has no access to the S3 coordination bus and cannot perform the fence. Applying unfenced chunks at startup would create the same cross-party share divergence the fence was designed to prevent.
+### Advisory locks
 
-**Rollout note**: if the rerand tables haven't been migrated yet, `build_rerand_sync_state` returns `Ok(None)` for missing table. The validation is skipped and startup proceeds without error.
+Two advisory lock keys are used:
 
-### Epoch and chunk desync safety checks
+- **`RERAND_APPLY_LOCK`** — serializes chunk applies with `load_iris_db`. Used as `pg_advisory_xact_lock` inside the apply transaction (auto-released on commit/rollback), and as session-level `pg_advisory_lock` during startup to hold through `load_iris_db`.
+- **`RERAND_MODIFY_LOCK`** — serializes modification writes with the rerand apply. The rerand server acquires it (`pg_advisory_xact_lock`) at the start of the apply transaction. The main server acquires it (`pg_advisory_xact_lock`) inside its modification transaction to prevent writes during the apply window.
 
-The startup sync validates two invariants derived from the protocol's synchronization barriers:
+**Why `pg_advisory_xact_lock` for applies and modifications**: session-level locks are tied to a connection. If a process is killed while holding a session-level lock on a pooled connection, the connection may be returned to the pool with the lock still held, blocking future acquirers indefinitely. Transaction-level locks avoid this: when the connection is dropped, the transaction rolls back and the lock is released automatically.
 
-- **Epoch gap ≤ 1**: epochs transition via a 3-party S3 barrier (`complete` markers), so no peer can be more than 1 epoch ahead. A gap > 1 is fatal.
-- **Chunk gap ≤ 1** (within the same epoch): the per-chunk S3 barrier (`staged` markers) prevents any peer from confirming more than 1 chunk ahead. A gap > 1 is fatal.
+## Conflict Resolution: Rerandomization vs Modifications
 
-If either check fails, the main server refuses to start. This catches DB corruption, manual interference, or bugs in the rerand server early, before any data is loaded into memory.
+### The problem
 
-### Advisory locks
+Modifications (reauthentications, deletions) propagate asynchronously to each party via independent SQS queues. During continuous rerandomization, a modification can land on some parties but not others between the time different parties stage a chunk. Without protection, this causes cross-party share divergence: different parties apply the rerand to different underlying shares, breaking the MPC invariant that all 3 parties' shares reconstruct to the same plaintext.
 
-Three advisory lock keys are used:
+### Two-layer protection
 
-- **`RERAND_APPLY_LOCK`** — serializes chunk applies with `load_iris_db`. Used as `pg_advisory_xact_lock` inside `apply_staging_chunk`'s transaction (auto-released on commit/rollback/drop), and as session-level `pg_advisory_lock` during startup to hold through `load_iris_db`.
-- **`RERAND_MODIFY_LOCK`** — serializes modification writes with the rerand modification fence. The rerand server acquires it (`pg_advisory_xact_lock`) at the start of its unified fence+apply transaction (Step 1.10) to hold through the fence check and apply window. The main server acquires it (`pg_advisory_xact_lock`) inside its modification transaction to prevent writes during the fence window.
+The protocol uses two layers to handle modification races:
 
-**Why `pg_advisory_xact_lock` for applies and modifications**: session-level locks are tied to a connection. If a process is killed while holding a session-level lock on a pooled connection, the connection may be returned to the pool with the lock still held, blocking future acquirers indefinitely. Transaction-level locks avoid this: when the connection is dropped, the transaction rolls back and the lock is released automatically. This is why the entire modification fence and apply step was combined into a single transaction.
+#### Layer 1: Cross-party version-map exchange (staging-time disagreements)
 
-## Conflict Resolution: Rerandomization vs Modifications
+After staging, each party uploads its `[(id, original_version_id)]` map and a blake3 hash to S3. After the staged barrier, each party downloads the 3 hashes (96 bytes). If all match, the maps are identical — no disagreements (fast path, ~100% of the time). If any hash differs, full maps are downloaded and diffed to produce the exact set of IDs where parties captured different `original_version_id` values.
 
-### The problem
+These IDs are deleted from staging before apply. **All parties compute the same staging-divergent set** (the version maps are deterministic and downloaded after the barrier), so all parties skip the same rows. This prevents the dangerous case where parties apply rerandomization on top of different base data.
 
-Modifications (reauthentications, deletions) propagate asynchronously to each party via independent SQS queues. During continuous rerandomization, a modification can land on some parties but not others between the time different parties stage a chunk. Without protection, this causes cross-party share divergence: different parties apply the rerand to different underlying shares, breaking the MPC invariant that all 3 parties' shares reconstruct to the same plaintext.
+#### Layer 2: `version_id` CAS (post-staging modifications)
+
+Modifications that land between staging and apply are caught by the `WHERE irises.version_id = staging.original_version_id` clause in the UPDATE. Rows where `version_id` changed are silently skipped.
 
-### The modification fence
+**This layer does NOT guarantee cross-party consistency on its own.** Different parties may have different live `version_id` values for the same row (because the modification hasn't propagated to all parties yet), so different parties may apply rerand to different subsets of rows.
 
-The modification fence ensures all parties agree on which rows to skip before applying a chunk. It has two components:
+### Post-staging modifications: transient inconsistency
 
-1. **Cross-party version-map exchange** (Steps 1.6–1.8): after staging, each party uploads its `[(id, original_version_id)]` map for the chunk to S3, along with a 32-byte blake3 hash of the map. After the S3 barrier, each party first downloads only the 3 hashes (96 bytes total). If all hashes match, the maps are identical and the cross-party disagreement set is empty (fast path — no full map download needed). If any hash differs, the full maps are downloaded and diffed to compute the exact set of IDs where any party captured a different `original_version_id` (slow path). This catches modifications that arrived on some parties before staging but not others. In practice, disagreements are rare (only when a modification races with the staging window), so the fast path runs ~100% of the time.
+When a modification lands on party B between staging and apply, but hasn't yet reached parties A and C:
 
-2. **Local divergence check under lock**: as part of the single apply transaction, the rerand server acquires `pg_advisory_xact_lock(RERAND_MODIFY_LOCK)` (blocking the main server from writing modifications), then queries for IDs where `staging.original_version_id ≠ irises.version_id`. This catches modifications that arrived after staging but before apply. The lock ensures no new modifications can land during the check + apply window.
+- A and C's CAS succeeds → row is rerandomized
+- B's CAS fails → row keeps the modification's shares
+- The 3 parties' shares for that row are temporarily inconsistent
 
-The union of both sets is deleted from staging before apply. All parties compute the same skip set (the cross-party exchange is deterministic, and the local check is under lock), so the apply produces consistent results across all parties.
+**This self-corrects when the modification propagates to A and C** (via SQS). The modification is a full-row overwrite that replaces the rerandomized shares with the modification's shares, restoring consistency. The row loses its rerandomization for this epoch; the next epoch picks it up.
 
-### Why the optimistic lock is still needed
+**Window**: bounded by SQS delivery time (typically seconds). During this window, the DB shares are inconsistent. The in-memory shares (used for query processing) are unaffected — the actor loaded from DB at startup before the rerand applied.
 
-The skip-set deletion removes divergent rows from staging before apply. The apply SQL still includes `WHERE irises.version_id = staging.original_version_id` as a final safety net — if a row somehow slipped through the fence (e.g., a modification landed in the narrow window between the local check and the apply within the same lock), the optimistic lock catches it. On its own the optimistic lock does NOT guarantee cross-party consistency (different parties can have different live `version_id` values), but combined with the fence it serves as defense-in-depth.
+**Restart risk**: if a party restarts during this window, `sync_modifications` at startup replays pending modifications from peers, closing the gap before `load_iris_db` runs.
+
+**Permanent failure**: if SQS permanently drops a modification, the row stays inconsistent until the next epoch. This is a pre-existing system risk — without reliable SQS delivery, the MPC protocol is already broken regardless of rerandomization.
 
 ### Why `rerand_epoch` and the trigger are kept
 
-Without the trigger change, the rerand apply would bump `version_id` (because share data changes). This is not a safety issue — the optimistic lock works correctly either way — but it inflates `version_id` by 1 per epoch per row. Since `version_id` is `SMALLINT` (max 32767), this limits the total number of rerandomizations + modifications before overflow. The trigger keeps `version_id` as a pure user-modification counter, preserving the full range for actual reauthentications.
+Without the trigger change, the rerand apply would bump `version_id` (because share data changes). This is not a safety issue — the CAS works correctly either way — but it inflates `version_id` by 1 per epoch per row. Since `version_id` is `SMALLINT` (max 32767), this limits the total number of rerandomizations + modifications before overflow. The trigger keeps `version_id` as a pure user-modification counter, preserving the full range for actual reauthentications.
 
 ## Chunking
 
@@ -267,3 +306,271 @@ Chunk boundaries must be identical across parties for chunk K to be meaningful.
 - Chunk K corresponds to `[start, end)` where `start = 1 + K * N` and `end = min(start + N, M + 1)`.
 
 A configurable delay (`--chunk-delay`, default e.g. 5s) is inserted between chunks to avoid sustained DB load. The rerand server should not stress the live DB with continuous writes — the delay spreads the I/O over time. The delay, chunk size, and number of parallel DB connections should all be configurable via CLI flags or environment variables.
+
+## Sequence Diagrams
+
+### Chunk lifecycle (happy path)
+
+All three rerand workers process each chunk in lockstep via S3 barriers. No DB locks are held during S3 coordination.
+
+```mermaid
+sequenceDiagram
+    participant P0 as Rerand Worker 0
+    participant P1 as Rerand Worker 1
+    participant P2 as Rerand Worker 2
+    participant S3 as S3 Bucket
+    participant DB0 as DB (Party 0)
+
+    Note over P0,P2: Chunk K begins
+
+    par Stage (each party reads live DB, writes staging)
+        P0->>DB0: Read irises [start..end], record version_ids
+        P0->>DB0: Write to staging schema
+    end
+
+    P0->>S3: Upload version-map hash + version-map
+    P1->>S3: Upload version-map hash + version-map
+    P2->>S3: Upload version-map hash + version-map
+
+    P0->>S3: Upload staged marker
+    P1->>S3: Upload staged marker
+    P2->>S3: Upload staged marker
+
+    Note over P0,P2: S3 barrier — all parties poll until 3 staged markers exist
+
+    P0->>S3: Download 3 version-map hashes
+    alt All hashes match (fast path)
+        Note over P0: staging_divergent = empty
+    else Hash mismatch (slow path)
+        P0->>S3: Download 3 full version maps
+        Note over P0: staging_divergent = differing IDs
+    end
+
+    Note over P0: Apply transaction (no S3 I/O from here)
+
+    P0->>DB0: BEGIN
+    P0->>DB0: pg_advisory_xact_lock(MODIFY_LOCK)
+    P0->>DB0: pg_advisory_xact_lock(APPLY_LOCK)
+    P0->>DB0: DELETE staging_divergent from staging
+    P0->>DB0: UPDATE irises FROM staging WHERE version_id CAS
+    P0->>DB0: DELETE staging rows, mark live_applied
+    P0->>DB0: COMMIT (locks released)
+```
+
+### Startup with coordinated freeze
+
+The main server freezes the rerand worker, verifies watermark equality across all three parties, then loads the DB snapshot.
+
+```mermaid
+sequenceDiagram
+    participant MS as Main Server
+    participant RW as Rerand Worker
+    participant DB as Postgres
+    participant Peer1 as Peer Server 1
+    participant Peer2 as Peer Server 2
+
+    Note over MS: Startup begins (after sync_modifications)
+
+    MS->>DB: SET freeze_requested=TRUE, freeze_generation=G1
+
+    RW->>DB: (between chunks) Read rerand_control
+    Note over RW: Sees freeze_requested=TRUE, generation=G1
+    RW->>DB: SET frozen_generation=G1
+    Note over RW: Blocks in poll loop
+
+    MS->>DB: Poll until frozen_generation=G1
+    Note over MS: Worker is quiesced
+
+    MS->>DB: Read local applied watermark
+    MS->>Peer1: GET /rerand-watermark
+    Peer1-->>MS: {epoch: 3, max_applied_chunk: 7}
+    MS->>Peer2: GET /rerand-watermark
+    Peer2-->>MS: {epoch: 3, max_applied_chunk: 7}
+
+    alt All watermarks equal
+        MS->>DB: pg_advisory_lock(APPLY_LOCK)
+        MS->>DB: load_iris_db (full DB snapshot into memory)
+        MS->>DB: pg_advisory_unlock(APPLY_LOCK)
+        MS->>DB: SET freeze_requested=FALSE
+        Note over RW: Poll sees freeze_requested=FALSE
+        RW->>RW: Resume chunk processing
+    else Watermark mismatch
+        MS->>DB: SET freeze_requested=FALSE
+        Note over RW: Resume chunk processing
+        Note over MS: ABORT startup (fail closed)
+    end
+```
+
+### Freeze generation handoff (crash recovery)
+
+If the main server crashes while the worker is frozen, the new server instance writes a new generation. The worker detects the change and re-acknowledges.
+
+```mermaid
+sequenceDiagram
+    participant MS1 as Main Server (attempt 1)
+    participant MS2 as Main Server (attempt 2)
+    participant RW as Rerand Worker
+    participant DB as Postgres
+
+    MS1->>DB: SET freeze_requested=TRUE, freeze_generation=G1
+    RW->>DB: SET frozen_generation=G1
+    Note over RW: Blocked in freeze loop
+
+    MS1-xMS1: CRASH (freeze_requested still TRUE)
+
+    Note over MS2: Restart
+
+    MS2->>DB: SET freeze_requested=TRUE, freeze_generation=G2
+
+    RW->>DB: Poll: reads freeze_generation=G2 (≠ G1)
+    RW->>DB: SET frozen_generation=G2
+    Note over RW: Still blocked, now acked for G2
+
+    MS2->>DB: Poll until frozen_generation=G2
+    Note over MS2: Proceeds with watermark check + load
+
+    MS2->>DB: SET freeze_requested=FALSE
+    Note over RW: Resumes
+```
+
+### Modification conflict resolution
+
+Shows how the version-map exchange (Layer 1) and version_id CAS (Layer 2) handle a modification that arrives asymmetrically.
+
+```mermaid
+sequenceDiagram
+    participant PA as Party A
+    participant PB as Party B
+    participant PC as Party C
+    participant SQS as SQS
+
+    Note over PA,PC: Chunk K staging begins
+
+    SQS->>PB: Modification M (row 42)
+    Note over PB: version_id for row 42 bumps to V+1
+
+    PA->>PA: Stage row 42 with version_id=V
+    PB->>PB: Stage row 42 with version_id=V+1
+    PC->>PC: Stage row 42 with version_id=V
+
+    Note over PA,PC: Version-map exchange (Layer 1)
+    PA->>PA: version_map_hash differs from PB
+    Note over PA,PC: row 42 added to staging_divergent
+
+    Note over PA,PC: Apply — row 42 deleted from staging on all parties
+    Note over PA,PC: Row 42 is NOT rerandomized (safe)
+
+    SQS->>PA: Modification M arrives (later)
+    SQS->>PC: Modification M arrives (later)
+    Note over PA,PC: All parties now have M applied — consistent
+    Note over PA,PC: Row 42 will be rerandomized in next epoch
+```
+
+### Startup watermark convergence (freeze race)
+
+During a rolling deploy, all three main servers freeze their local rerand workers. Because the freeze is per-party (not a global barrier), workers may pause at different chunk boundaries. The convergence protocol handles this: only the behind party releases its freeze, leading parties stay frozen. This guarantees convergence without the leading parties advancing further.
+
+```mermaid
+sequenceDiagram
+    participant MSA as Main Server A
+    participant MSB as Main Server B
+    participant MSC as Main Server C
+    participant WA as Worker A
+    participant WB as Worker B
+    participant WC as Worker C
+
+    Note over WA,WC: Workers processing in lockstep via S3 barrier
+
+    Note over MSA,MSC: Deploy — all 3 main servers restart together
+
+    MSA->>WA: freeze_requested=TRUE
+    MSB->>WB: freeze_requested=TRUE
+    MSC->>WC: freeze_requested=TRUE
+
+    Note over WA: Finishes chunk 8 apply, THEN sees freeze
+    WA->>WA: Paused at watermark (E, 8)
+
+    Note over WB: Sees freeze BEFORE chunk 8 apply
+    WB->>WB: Paused at watermark (E, 7)
+
+    Note over WC: Finishes chunk 8 apply, THEN sees freeze
+    WC->>WC: Paused at watermark (E, 8)
+
+    MSA->>MSA: Local=(E,8), max=(E,8), Peer B=(E,7)
+    Note over MSA: Local at max → stay frozen, re-poll
+
+    MSB->>MSB: Local=(E,7), max=(E,8)
+    Note over MSB: Local behind → release freeze
+
+    MSC->>MSC: Local=(E,8), max=(E,8), Peer B=(E,7)
+    Note over MSC: Local at max → stay frozen, re-poll
+
+    Note over WB: Worker B resumes (only B unfreezes)
+    Note over WA,WC: Workers A & C stay frozen — cannot advance
+
+    WB->>WB: Apply chunk 8 (already confirmed via S3)
+
+    MSB->>WB: Re-freeze (new generation)
+    WB->>WB: Paused at watermark (E, 8)
+
+    MSA->>MSA: Re-poll peers: B=(E,8), C=(E,8)
+    Note over MSA: All watermarks = (E, 8) ✓
+
+    MSB->>MSB: Re-check: local=(E,8), A=(E,8), C=(E,8)
+    Note over MSB: All watermarks = (E, 8) ✓
+
+    MSC->>MSC: Re-poll peers: A=(E,8), B=(E,8)
+    Note over MSC: All watermarks = (E, 8) ✓
+
+    Note over MSA,MSC: All parties proceed with DB load
+```
+
+### Post-staging modification: transient DB inconsistency
+
+When a modification arrives at one party between staging and apply, but hasn't yet propagated to the others, the version_id CAS causes asymmetric application. The DB shares are temporarily inconsistent but self-correct when the modification propagates. In-memory shares (used for live queries) are unaffected.
+
+```mermaid
+sequenceDiagram
+    participant PA as Party A (DB)
+    participant PB as Party B (DB)
+    participant PC as Party C (DB)
+    participant SQS as SQS
+
+    Note over PA,PC: All parties staged row 42 with version_id=V
+
+    Note over PA,PC: S3 barrier passed, version maps match (row 42 NOT in staging_divergent)
+
+    SQS->>PB: Modification M arrives at Party B only
+    Note over PB: Row 42 version_id bumps V→V+1
+
+    Note over PA,PC: Apply transaction (under RERAND_MODIFY_LOCK)
+
+    PA->>PA: UPDATE WHERE version_id=V → CAS succeeds ✓
+    Note over PA: Row 42 rerandomized
+
+    PB->>PB: UPDATE WHERE version_id=V → CAS fails (V≠V+1) ✗
+    Note over PB: Row 42 keeps modification shares
+
+    PC->>PC: UPDATE WHERE version_id=V → CAS succeeds ✓
+    Note over PC: Row 42 rerandomized
+
+    rect rgb(255, 230, 230)
+        Note over PA,PC: ⚠ TRANSIENT INCONSISTENCY WINDOW
+        Note over PA: rerandomized shares
+        Note over PB: modification shares
+        Note over PC: rerandomized shares
+        Note over PA,PC: Shamir reconstruction would be WRONG for row 42
+        Note over PA,PC: But in-memory shares (serving queries) are unaffected
+    end
+
+    SQS->>PA: Modification M propagates to A
+    Note over PA: Row 42 overwritten with modification shares
+
+    SQS->>PC: Modification M propagates to C
+    Note over PC: Row 42 overwritten with modification shares
+
+    rect rgb(230, 255, 230)
+        Note over PA,PC: ✓ CONSISTENT — all parties have modification shares
+        Note over PA,PC: Row 42 will be rerandomized in next epoch
+    end
+```
diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 768871a9b9..2de58dcd1e 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -224,8 +224,6 @@ async fn receive_batch(
 
                 match request_type {
                     IDENTITY_DELETION_MESSAGE_TYPE => {
-                        // If it's a deletion request, we just store the serial_id and continue.
-                        // Deletion will take place when batch process starts.
                         let identity_deletion_request: IdentityDeletionRequest =
                             serde_json::from_str(&message.message).map_err(|e| {
                                 ReceiveRequestError::json_parse_error(
@@ -233,13 +231,6 @@ async fn receive_batch(
                                     e,
                                 )
                             })?;
-                        client
-                            .delete_message()
-                            .queue_url(queue_url)
-                            .receipt_handle(sqs_message.receipt_handle.unwrap())
-                            .send()
-                            .await
-                            .map_err(ReceiveRequestError::from)?;
                         metrics::counter!("request.received", "type" => "identity_deletion")
                             .increment(1);
                         if batch_query
@@ -251,6 +242,7 @@ async fn receive_batch(
                                 identity_deletion_request.serial_id,
                                 identity_deletion_request,
                             );
+                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                             continue;
                         }
                         let modification = store
@@ -260,6 +252,7 @@ async fn receive_batch(
                                 None,
                             )
                             .await?;
+                        client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                         batch_query.modifications.insert(
                             RequestSerialId(identity_deletion_request.serial_id),
                             modification,
@@ -284,14 +277,6 @@ async fn receive_batch(
                         metrics::counter!("request.received", "type" => "uniqueness_verification")
                             .increment(1);
 
-                        client
-                            .delete_message()
-                            .queue_url(queue_url)
-                            .receipt_handle(sqs_message.receipt_handle.unwrap())
-                            .send()
-                            .await
-                            .map_err(ReceiveRequestError::from)?;
-
                         if let Some(batch_size) = uniqueness_request.batch_size {
                             // Updating the batch size instantly makes it a bit unpredictable, since
                             // if we're already above the new limit, we'll still process the current
@@ -311,6 +296,7 @@ async fn receive_batch(
                                 Some(uniqueness_request.s3_key.as_str()),
                             )
                             .await?;
+                        client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                         batch_query.modifications.insert(
                             RequestId(uniqueness_request.signup_id.clone()),
                             modification,
@@ -393,13 +379,6 @@ async fn receive_batch(
                             .map_err(|e| {
                                 ReceiveRequestError::json_parse_error("Reauth request", e)
                             })?;
-                        client
-                            .delete_message()
-                            .queue_url(queue_url)
-                            .receipt_handle(sqs_message.receipt_handle.unwrap())
-                            .send()
-                            .await
-                            .map_err(ReceiveRequestError::from)?;
 
                         metrics::counter!("request.received", "type" => "reauth").increment(1);
 
@@ -413,6 +392,7 @@ async fn receive_batch(
                                 "Received a reauth request with use_or_rule set to true, but LUC \
                                  is not enabled. Skipping request."
                             );
+                                client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                                 continue;
                             }
 
@@ -425,6 +405,7 @@ async fn receive_batch(
                                 reauth_request.serial_id,
                                 reauth_request,
                             );
+                                client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                                 continue;
                             }
 
@@ -437,6 +418,7 @@ async fn receive_batch(
                                     Some(reauth_request.s3_key.as_str()),
                                 )
                                 .await?;
+                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                             batch_query
                                 .modifications
                                 .insert(RequestSerialId(reauth_request.serial_id), modification);
@@ -494,6 +476,7 @@ async fn receive_batch(
                             handles.push(handle);
                         } else {
                             tracing::warn!("Reauth is disabled, skipping reauth request");
+                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                         }
                     }
 
@@ -508,17 +491,10 @@ async fn receive_batch(
                                 )
                             })?;
 
-                        client
-                            .delete_message()
-                            .queue_url(queue_url)
-                            .receipt_handle(sqs_message.receipt_handle.unwrap())
-                            .send()
-                            .await
-                            .map_err(ReceiveRequestError::from)?;
-
                         if !is_enabled(&request_type, &config) {
                             metrics::counter!("request.skipped", "type" => request_type.to_string()).increment(1);
                             tracing::warn!("{} is disabled, skipping request", request_type);
+                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                             continue;
                         }
                         metrics::counter!("request.received", "type" => request_type.to_string())
@@ -526,9 +502,6 @@ async fn receive_batch(
 
                         msg_counter += 1;
 
-                        // Persist in progress identity match check message.
-                        // Note that identity match check is only a query and does not persist anything into the database.
-                        // We store modification so that the SNS result can be replayed.
                         let modification = store
                             .insert_modification(
                                 None,
@@ -536,6 +509,7 @@ async fn receive_batch(
                                 Some(identity_match_check_request.s3_key.as_str()),
                             )
                             .await?;
+                        client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                         batch_query.modifications.insert(
                             RequestId(identity_match_check_request.request_id.clone()),
                             modification,
@@ -582,14 +556,6 @@ async fn receive_batch(
                         metrics::counter!("request.received", "type" => "reset_update")
                             .increment(1);
 
-                        client
-                            .delete_message()
-                            .queue_url(queue_url)
-                            .receipt_handle(sqs_message.receipt_handle.unwrap())
-                            .send()
-                            .await
-                            .map_err(ReceiveRequestError::from)?;
-
                         if config.enable_reset {
                             // Fetch new iris shares from S3
                             let semaphore = Arc::clone(&semaphore);
@@ -634,6 +600,7 @@ async fn receive_batch(
                                 reset_update_request.serial_id,
                                 reset_update_request,
                             );
+                                client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                                 continue;
                             }
 
@@ -644,6 +611,7 @@ async fn receive_batch(
                                     Some(reset_update_request.s3_key.as_str()),
                                 )
                                 .await?;
+                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                             batch_query.modifications.insert(
                                 RequestSerialId(reset_update_request.serial_id),
                                 modification,
@@ -660,6 +628,9 @@ async fn receive_batch(
                                     mask_right: right_shares.mask,
                                 },
                             );
+                        } else {
+                            tracing::warn!("Reset is disabled, skipping reset update request");
+                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
                         }
                     }
 
@@ -1006,7 +977,6 @@ async fn server_main(config: Config) -> Result<()> {
         let verified_peers = Arc::clone(&verified_peers);
         let image_name = server_coord_config.image_name.to_string();
 
-        // Pre-calculate parts of the response that don't change
         let base_response = ReadyProbeResponse {
             image_name: image_name.clone(),
             shutting_down: false,
@@ -1015,11 +985,14 @@ async fn server_main(config: Config) -> Result<()> {
             is_ready: false,
         };
 
-        let my_state = my_state.clone();
+        // Capture a base state for fields that don't change, but re-fetch
+        // rerand_state from the DB on each /startup-sync request so peers
+        // always see the live watermark (not a stale boot-time snapshot).
+        let base_sync_state = my_state.clone();
+        let sync_pool = store.pool.clone();
         async move {
             let is_ready_flag_health = Arc::clone(&is_ready_flag);
             let is_ready_flag_ready = Arc::clone(&is_ready_flag);
-            // Generate a random UUID for each run.
             let app = Router::new()
                 .route(
                     "/health",
@@ -1052,7 +1025,59 @@ async fn server_main(config: Config) -> Result<()> {
                 )
                 .route(
                     "/startup-sync",
-                    get(move || async move { serde_json::to_string(&my_state).unwrap() }),
+                    get({
+                        let base = base_sync_state;
+                        let pool = sync_pool.clone();
+                        move || {
+                            let mut state = base.clone();
+                            let pool = pool.clone();
+                            async move {
+                                match rerand_store::build_rerand_sync_state(&pool).await {
+                                    Ok(live_rerand) => {
+                                        state.rerand_state = live_rerand;
+                                    }
+                                    Err(e) => {
+                                        tracing::warn!(
+                                            "Failed to fetch live rerand_state for /startup-sync, \
+                                             serving stale snapshot: {:?}",
+                                            e
+                                        );
+                                    }
+                                }
+                                serde_json::to_string(&state).unwrap()
+                            }
+                        }
+                    }),
+                )
+                .route(
+                    "/rerand-watermark",
+                    get({
+                        let pool = sync_pool.clone();
+                        move || {
+                            let pool = pool.clone();
+                            async move {
+                                let wm = rerand_store::get_applied_watermark_from_pool(&pool).await;
+                                match wm {
+                                    Ok(Some((epoch, chunk))) => (
+                                        StatusCode::OK,
+                                        serde_json::to_string(&serde_json::json!({
+                                            "epoch": epoch,
+                                            "max_applied_chunk": chunk,
+                                        }))
+                                        .unwrap(),
+                                    ),
+                                    Ok(None) => (StatusCode::OK, "null".to_string()),
+                                    Err(e) => {
+                                        tracing::warn!("rerand-watermark query failed: {:?}", e);
+                                        (
+                                            StatusCode::INTERNAL_SERVER_ERROR,
+                                            format!("DB error: {}", e),
+                                        )
+                                    }
+                                }
+                            }
+                        }
+                    }),
                 );
             let listener = tokio::net::TcpListener::bind("0.0.0.0:3000")
                 .await
@@ -1318,110 +1343,142 @@ async fn server_main(config: Config) -> Result<()> {
         }
     }
 
-    let rerand_lock_conn =
-        rerand_store::rerand_validate_and_lock(&store.pool, &sync_result).await?;
-
-    if download_shutdown_handler.is_shutting_down() {
-        tracing::warn!("Shutting down has been triggered");
-        rerand_store::release_rerand_lock(rerand_lock_conn).await?;
-        return Ok(());
+    // --- Coordinated rerand freeze with watermark convergence ---
+    {
+        eyre::ensure!(
+            server_coord_config.node_hostnames.len() == server_coord_config.healthcheck_ports.len(),
+            "node_hostnames ({}) and healthcheck_ports ({}) must have the same length",
+            server_coord_config.node_hostnames.len(),
+            server_coord_config.healthcheck_ports.len(),
+        );
+        let peer_addrs: Vec<(&str, usize)> = server_coord_config
+            .node_hostnames
+            .iter()
+            .zip(server_coord_config.healthcheck_ports.iter())
+            .enumerate()
+            .filter(|(i, _)| *i != config.party_id)
+            .map(|(_, (h, p))| -> eyre::Result<_> {
+                Ok((h.as_str(), p.parse::<usize>()?))
+            })
+            .collect::<eyre::Result<Vec<_>>>()?;
+        rerand_store::freeze_and_verify_watermarks(&store.pool, &peer_addrs).await?;
     }
+    // Worker is now frozen with verified equal watermarks.
+    // Everything from here until freeze release must be wrapped so that
+    // errors always release the freeze.
+    let freeze_pool = store.pool.clone();
 
-    let startup_result = async {
-        // refetch store_len in case we rolled back
-        let store_len = store.count_irises().await?;
-        tracing::info!("Database store length after sync: {}", store_len);
-
-        let runtime_handle = tokio::runtime::Handle::current();
-        let anon_stats_writer = if let Some(url) = config.get_anon_stats_db_url() {
-            let schema = config.get_anon_stats_db_schema();
-            let anon_client =
-                AnonStatsPgClient::new(&url, &schema, AnonStatsAccessMode::ReadWrite).await?;
-            let anon_store = AnonStatsStore::new(&anon_client).await?;
-            Some((anon_store, runtime_handle.clone()))
-        } else {
-            tracing::warn!("No database URL configured for anon stats; skipping DB persistence");
-            None
-        };
-        let anon_stats_writer_for_actor = anon_stats_writer.clone();
-
-        let (tx, rx) = oneshot::channel();
-        let config_clone = config.clone();
-        background_tasks.spawn_blocking(move || {
-            let config = config_clone;
-            // --------------------------------------------------------------------------
-            // ANCHOR: Load the database
-            // --------------------------------------------------------------------------
-            tracing::info!("⚓️ ANCHOR: Starting server actor");
-            match ServerActor::new(
-                config.party_id,
-                chacha_seeds,
-                8,
-                config.max_db_size,
-                config.max_batch_size,
-                config.match_distances_buffer_size,
-                config.match_distances_buffer_size_extra_percent,
-                config.return_partial_results,
-                config.disable_persistence,
-                config.enable_debug_timing,
-                config.full_scan_side,
-                config.full_scan_side_switching_enabled,
-                anon_stats_writer_for_actor,
-            ) {
-                Ok((mut actor, handle)) => {
-                    tracing::info!("⚓️ ANCHOR: Load the database");
-                    let res = if config.fake_db_size > 0 {
-                        // TODO: does this even still work, since we do not page-lock the memory here?
-                        actor.fake_db(config.fake_db_size);
-                        Ok(())
-                    } else {
-                        tracing::info!(
-                            "Initialize iris db: Loading from DB (parallelism: {})",
-                            parallelism
-                        );
-                        let download_shutdown_handler = Arc::clone(&download_shutdown_handler);
-
-                        tokio::runtime::Handle::current().block_on(async {
-                            load_iris_db(
-                                &mut actor,
-                                &store,
-                                store_len,
-                                parallelism,
-                                &config,
-                                download_shutdown_handler,
-                            )
-                            .await
-                        })
-                    };
+    let frozen_result = async {
+        let rerand_lock_conn = rerand_store::acquire_apply_lock(&store.pool).await?;
 
-                    match res {
-                        Ok(_) => {
-                            tx.send(Ok((handle, store))).unwrap();
-                        }
-                        Err(e) => {
-                            tx.send(Err(e)).unwrap();
-                            return Ok(());
+        if download_shutdown_handler.is_shutting_down() {
+            rerand_store::release_apply_lock(rerand_lock_conn).await?;
+            return Ok::<_, eyre::Report>(None);
+        }
+
+        let startup_result = async {
+            let store_len = store.count_irises().await?;
+            tracing::info!("Database store length after sync: {}", store_len);
+
+            let runtime_handle = tokio::runtime::Handle::current();
+            let anon_stats_writer = if let Some(url) = config.get_anon_stats_db_url() {
+                let schema = config.get_anon_stats_db_schema();
+                let anon_client =
+                    AnonStatsPgClient::new(&url, &schema, AnonStatsAccessMode::ReadWrite).await?;
+                let anon_store = AnonStatsStore::new(&anon_client).await?;
+                Some((anon_store, runtime_handle.clone()))
+            } else {
+                tracing::warn!(
+                    "No database URL configured for anon stats; skipping DB persistence"
+                );
+                None
+            };
+            let anon_stats_writer_for_actor = anon_stats_writer.clone();
+
+            let (tx, rx) = oneshot::channel();
+            let config_clone = config.clone();
+            background_tasks.spawn_blocking(move || {
+                let config = config_clone;
+                tracing::info!("⚓️ ANCHOR: Starting server actor");
+                match ServerActor::new(
+                    config.party_id,
+                    chacha_seeds,
+                    8,
+                    config.max_db_size,
+                    config.max_batch_size,
+                    config.match_distances_buffer_size,
+                    config.match_distances_buffer_size_extra_percent,
+                    config.return_partial_results,
+                    config.disable_persistence,
+                    config.enable_debug_timing,
+                    config.full_scan_side,
+                    config.full_scan_side_switching_enabled,
+                    anon_stats_writer_for_actor,
+                ) {
+                    Ok((mut actor, handle)) => {
+                        tracing::info!("⚓️ ANCHOR: Load the database");
+                        let res = if config.fake_db_size > 0 {
+                            actor.fake_db(config.fake_db_size);
+                            Ok(())
+                        } else {
+                            tracing::info!(
+                                "Initialize iris db: Loading from DB (parallelism: {})",
+                                parallelism
+                            );
+                            let download_shutdown_handler =
+                                Arc::clone(&download_shutdown_handler);
+
+                            tokio::runtime::Handle::current().block_on(async {
+                                load_iris_db(
+                                    &mut actor,
+                                    &store,
+                                    store_len,
+                                    parallelism,
+                                    &config,
+                                    download_shutdown_handler,
+                                )
+                                .await
+                            })
+                        };
+
+                        match res {
+                            Ok(_) => {
+                                tx.send(Ok((handle, store))).unwrap();
+                            }
+                            Err(e) => {
+                                tx.send(Err(e)).unwrap();
+                                return Ok(());
+                            }
                         }
+
+                        actor.run(); // forever
                     }
+                    Err(e) => {
+                        tx.send(Err(e)).unwrap();
+                        return Ok(());
+                    }
+                };
+                Ok(())
+            });
 
-                    actor.run(); // forever
-                }
-                Err(e) => {
-                    tx.send(Err(e)).unwrap();
-                    return Ok(());
-                }
-            };
-            Ok(())
-        });
+            let startup_result = rx.await;
+            let (handle, store) = startup_result??;
+            Ok::<_, eyre::Report>((handle, store))
+        }
+        .await;
 
-        let startup_result = rx.await;
-        let (handle, store) = startup_result??;
-        Ok::<_, eyre::Report>((handle, store))
+        rerand_store::release_apply_lock(rerand_lock_conn).await?;
+        Ok(Some(startup_result))
     }
     .await;
 
-    rerand_store::release_rerand_lock(rerand_lock_conn).await?;
-    let (mut handle, store) = startup_result?;
+    // Always release freeze, even on error.
+    rerand_store::release_rerand_freeze(&freeze_pool).await?;
+
+    let (mut handle, store) = match frozen_result? {
+        None => return Ok(()),
+        Some(r) => r?,
+    };
 
     background_tasks.check_tasks();
 
@@ -1716,6 +1773,13 @@ async fn server_main(config: Config) -> Result<()> {
 
             let mut tx = store_bg.tx().await?;
 
+            if !config_bg.disable_persistence {
+                sqlx::query("SELECT pg_advisory_xact_lock($1)")
+                    .bind(iris_mpc_store::rerand::RERAND_MODIFY_LOCK)
+                    .execute(&mut *tx)
+                    .await?;
+            }
+
             store_bg
                 .update_modifications(&mut tx, &modifications.values().collect::<Vec<_>>())
                 .await?;
diff --git a/iris-mpc-common/src/helpers/sync.rs b/iris-mpc-common/src/helpers/sync.rs
index 9719af4f6f..78fe36ea49 100644
--- a/iris-mpc-common/src/helpers/sync.rs
+++ b/iris-mpc-common/src/helpers/sync.rs
@@ -17,8 +17,8 @@ pub struct SyncState {
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct RerandSyncState {
     pub epoch: i32,
-    /// Highest chunk_id where all_confirmed = TRUE. -1 if none confirmed.
-    pub max_confirmed_chunk: i32,
+    /// Highest chunk_id where live_applied = TRUE. -1 if none applied.
+    pub max_applied_chunk: i32,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -248,11 +248,17 @@ impl SyncResult {
         let max_id = completed_max_mod_ids.iter().flatten().copied().max();
         if let (Some(min_id), Some(max_id)) = (min_id, max_id) {
             let mod_id_diff = max_id.saturating_sub(min_id) as usize;
-            if mod_id_diff > self.my_state.common_config.get_max_modifications_lookback() {
+            let lookback = self.my_state.common_config.get_max_modifications_lookback();
+            if mod_id_diff > lookback {
                 panic!(
-                    "Modification ID difference across nodes is too large: {:?}. Min: {:?}, Max: {:?}. \
-             Can not safely handle this case, consider bumping lookback. Crashing!",
-                    completed_max_mod_ids, min_id, max_id
+                    "Modification ID difference across nodes ({}) exceeds lookback ({}): {:?}. \
+                     Min: {:?}, Max: {:?}. Cannot safely reconcile. \
+                     Bump max_modifications_lookback or investigate drift.",
+                    mod_id_diff,
+                    lookback,
+                    completed_max_mod_ids,
+                    min_id,
+                    max_id
                 );
             }
         }
@@ -290,16 +296,17 @@ impl SyncResult {
                     .expect("At least one completed modification");
                 match local_copy {
                     None => {
-                        // If an item is completed for a party, it should at least exist in the
-                        // local state because it should have been added during receive_batch.
-                        // This can only happen when other party misses an in_progress mod.
-                        // Local party will fetch until modification id X while the other party will
-                        // fetch until mod id X-1. In this case, local party won't find X-1.
-                        // We log and skip updating to avoid rolling back to an older share in local.
-                        tracing::info!(
-                            "Skip missing completed modification: {:?}",
-                            first_completed
+                        // The local node never received this modification (e.g., SQS
+                        // message was lost). Roll it forward from a peer's completed
+                        // copy so the local DB converges with the other parties.
+                        let mut roll_forward = first_completed.clone();
+                        roll_forward.status = ModificationStatus::Completed.to_string();
+                        roll_forward.persisted = any_persisted;
+                        tracing::warn!(
+                            "Recovering missing completed modification from peer: {:?}",
+                            roll_forward
                         );
+                        to_update.push(roll_forward);
                     }
                     Some(local_m) => {
                         if local_m.status != ModificationStatus::Completed.to_string()
@@ -773,10 +780,13 @@ mod tests {
         // Compare modifications across nodes.
         let (to_update, to_delete) = sync_result.compare_modifications();
 
-        assert_eq!(to_update.len(), 0, "Expected no modification to update");
+        assert_eq!(
+            to_update.len(),
+            1,
+            "Expected mod1 to be recovered from peer"
+        );
+        assert_eq!(to_update[0].id, mod1_other.id);
         assert_eq!(to_delete.len(), 1, "Expected one modification to delete");
-
-        // Expectation: Local party should delete mod3.
         assert_eq!(to_delete[0], mod3_local);
     }
 
@@ -1063,13 +1073,8 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "Modification ID difference across nodes is too large")]
+    #[should_panic(expected = "Modification ID difference across nodes")]
     fn test_compare_modifications_large_id_difference_panic() {
-        // Create a scenario where nodes have completed modifications with IDs
-        // that differ by more than the max_modifications_lookback limit.
-        // Test lookback is (100 + 64) * 2 = 328, so we'll create a difference of 350.
-
-        // Node 1: has completed modification with ID 1
         let mod1_node1 = create_modification(
             1,
             Some(100),
@@ -1081,7 +1086,6 @@ mod tests {
         );
         let my_state = create_sync_state_with_lookback(vec![mod1_node1], 10);
 
-        // Node 2: has completed modification with ID 15 (difference = 14 > 10)
         let mod15_node2 = create_modification(
             15,
             Some(1500),
@@ -1093,7 +1097,6 @@ mod tests {
         );
         let other_state1 = create_sync_state_with_lookback(vec![mod15_node2], 10);
 
-        // Node 3: has completed modification with ID 20 (even larger)
         let mod20_node3 = create_modification(
             20,
             Some(2000),
@@ -1112,7 +1115,6 @@ mod tests {
             all_states,
         };
 
-        // This should panic because max_id (20) - min_id (1) = 19 > 10 (test lookback)
         sync_result.compare_modifications();
     }
 }
diff --git a/iris-mpc-store/Cargo.toml b/iris-mpc-store/Cargo.toml
index 97bb12933f..a4d3f1dab9 100644
--- a/iris-mpc-store/Cargo.toml
+++ b/iris-mpc-store/Cargo.toml
@@ -20,6 +20,10 @@ eyre.workspace = true
 itertools.workspace = true
 tracing.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
+uuid.workspace = true
+reqwest.workspace = true
+serde_json.workspace = true
 rand.workspace = true
 ampc-server-utils.workspace = true
 
diff --git a/iris-mpc-store/src/lib.rs b/iris-mpc-store/src/lib.rs
index e351f2a9b9..52f971ded4 100644
--- a/iris-mpc-store/src/lib.rs
+++ b/iris-mpc-store/src/lib.rs
@@ -661,6 +661,39 @@ WHERE id = $1;
         Ok(())
     }
 
+    /// Insert a modification recovered from a peer. Uses the peer's `id` to
+    /// keep modification IDs consistent across parties. If the `id` already
+    /// exists, updates the row to match the peer's state.
+    pub async fn upsert_recovered_modification(
+        &self,
+        tx: &mut Transaction<'_, Postgres>,
+        m: &Modification,
+    ) -> Result<()> {
+        sqlx::query(
+            r#"
+            INSERT INTO modifications (id, serial_id, request_type, s3_url, status, persisted, result_message_body, graph_mutation)
+            VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
+            ON CONFLICT (id) DO UPDATE SET
+                status = EXCLUDED.status,
+                persisted = EXCLUDED.persisted,
+                result_message_body = EXCLUDED.result_message_body,
+                serial_id = EXCLUDED.serial_id,
+                graph_mutation = EXCLUDED.graph_mutation
+            "#,
+        )
+        .bind(m.id)
+        .bind(m.serial_id)
+        .bind(&m.request_type)
+        .bind(&m.s3_url)
+        .bind(&m.status)
+        .bind(m.persisted)
+        .bind(&m.result_message_body)
+        .bind(&m.graph_mutation)
+        .execute(tx.deref_mut())
+        .await?;
+        Ok(())
+    }
+
     /// Delete modifications based on their id.
     pub async fn delete_modifications(
         &self,
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 8b7ab575fc..137564d26b 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -1,8 +1,7 @@
-use std::cmp::Ordering;
 use std::time::Duration;
 
 use eyre::Result;
-use iris_mpc_common::helpers::sync::{RerandSyncState, SyncResult};
+use iris_mpc_common::helpers::sync::RerandSyncState;
 use sqlx::PgPool;
 
 pub const RERAND_APPLY_LOCK: i64 = 0x5245_5241_4E44;
@@ -88,83 +87,11 @@ pub async fn get_staging_version_map(
     Ok(rows)
 }
 
-/// Return IDs where the staging original_version_id no longer matches the
-/// live version_id (modifications landed after staging).
-pub async fn get_locally_divergent_ids(
-    pool: &PgPool,
-    staging_schema: &str,
-    epoch: i32,
-    chunk_id: i32,
-) -> Result<Vec<i64>> {
-    validate_identifier(staging_schema)?;
-    let sql = format!(
-        r#"
-        SELECT s.id FROM "{}".irises s
-        JOIN irises ON irises.id = s.id
-        WHERE s.epoch = $1 AND s.chunk_id = $2
-          AND irises.version_id != s.original_version_id
-        "#,
-        staging_schema,
-    );
-    let rows: Vec<(i64,)> = sqlx::query_as(&sql)
-        .bind(epoch)
-        .bind(chunk_id)
-        .fetch_all(pool)
-        .await?;
-    Ok(rows.into_iter().map(|(id,)| id).collect())
-}
-
-async fn get_locally_divergent_ids_tx(
-    tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
-    staging_schema: &str,
-    epoch: i32,
-    chunk_id: i32,
-) -> Result<Vec<i64>> {
-    validate_identifier(staging_schema)?;
-    let sql = format!(
-        r#"
-        SELECT s.id FROM "{}".irises s
-        JOIN irises ON irises.id = s.id
-        WHERE s.epoch = $1 AND s.chunk_id = $2
-          AND irises.version_id != s.original_version_id
-        "#,
-        staging_schema,
-    );
-    let rows: Vec<(i64,)> = sqlx::query_as(&sql)
-        .bind(epoch)
-        .bind(chunk_id)
-        .fetch_all(&mut **tx)
-        .await?;
-    Ok(rows.into_iter().map(|(id,)| id).collect())
-}
-
-/// Delete specific IDs from a staging chunk.
-pub async fn delete_staging_ids(
-    pool: &PgPool,
-    staging_schema: &str,
-    epoch: i32,
-    ids: &[i64],
-) -> Result<u64> {
-    if ids.is_empty() {
-        return Ok(0);
-    }
-    validate_identifier(staging_schema)?;
-    let sql = format!(
-        r#"DELETE FROM "{}".irises WHERE epoch = $1 AND id = ANY($2)"#,
-        staging_schema,
-    );
-    let result = sqlx::query(&sql)
-        .bind(epoch)
-        .bind(ids)
-        .execute(pool)
-        .await?;
-    Ok(result.rows_affected())
-}
-
 async fn delete_staging_ids_tx(
     tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
     staging_schema: &str,
     epoch: i32,
+    chunk_id: i32,
     ids: &[i64],
 ) -> Result<u64> {
     if ids.is_empty() {
@@ -172,11 +99,12 @@ async fn delete_staging_ids_tx(
     }
     validate_identifier(staging_schema)?;
     let sql = format!(
-        r#"DELETE FROM "{}".irises WHERE epoch = $1 AND id = ANY($2)"#,
+        r#"DELETE FROM "{}".irises WHERE epoch = $1 AND chunk_id = $2 AND id = ANY($3)"#,
         staging_schema,
     );
     let result = sqlx::query(&sql)
         .bind(epoch)
+        .bind(chunk_id)
         .bind(ids)
         .execute(&mut **tx)
         .await?;
@@ -217,39 +145,50 @@ pub async fn insert_staging_irises(
     Ok(())
 }
 
-/// Apply a confirmed staging chunk to the live DB.
+/// Apply a confirmed staging chunk to the live `irises` table.
 ///
-/// Within a single transaction:
-///   1. Acquire `pg_advisory_xact_lock(RERAND_APPLY_LOCK)` (released
-///      automatically on commit/rollback/connection-drop).
-///   2. UPDATE live irises from staging (optimistic lock on version_id)
-///   3. DELETE staging rows for this chunk
-///   4. Mark live_applied in rerand_progress
-pub async fn apply_staging_chunk(
+/// Opens a single transaction that:
+/// 1. Acquires `RERAND_MODIFY_LOCK` (blocks modification writes)
+/// 2. Acquires `RERAND_APPLY_LOCK` (blocks startup DB load)
+/// 3. Deletes `staging_divergent` IDs from staging (cross-party disagreements)
+/// 4. Applies remaining staging rows via `version_id` CAS
+/// 5. Cleans up staging and marks progress
+///
+/// The `version_id` CAS (`WHERE irises.version_id = staging.original_version_id`)
+/// silently skips any rows that were modified between staging and apply. This is
+/// safe: the modification will propagate to all parties and overwrite whatever
+/// was there, restoring consistency. See the spec's "Conflict Resolution" section.
+pub async fn apply_confirmed_chunk(
     pool: &PgPool,
     staging_schema: &str,
     epoch: i32,
     chunk_id: i32,
+    staging_divergent: &[i64],
 ) -> Result<u64> {
     validate_identifier(staging_schema)?;
     let mut tx = pool.begin().await?;
-    let rows_updated = apply_staging_chunk_in_tx(&mut tx, staging_schema, epoch, chunk_id).await?;
-    tx.commit().await?;
-    Ok(rows_updated)
-}
 
-async fn apply_staging_chunk_in_tx(
-    tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
-    staging_schema: &str,
-    epoch: i32,
-    chunk_id: i32,
-) -> Result<u64> {
-    validate_identifier(staging_schema)?;
+    sqlx::query("SELECT pg_advisory_xact_lock($1)")
+        .bind(RERAND_MODIFY_LOCK)
+        .execute(&mut *tx)
+        .await?;
     sqlx::query("SELECT pg_advisory_xact_lock($1)")
         .bind(RERAND_APPLY_LOCK)
-        .execute(&mut **tx)
+        .execute(&mut *tx)
         .await?;
 
+    if !staging_divergent.is_empty() {
+        let deleted =
+            delete_staging_ids_tx(&mut tx, staging_schema, epoch, chunk_id, staging_divergent)
+                .await?;
+        tracing::info!(
+            "Rerand apply: removed {} staging-divergent rows (epoch={}, chunk={})",
+            deleted,
+            epoch,
+            chunk_id,
+        );
+    }
+
     let update_sql = format!(
         r#"
         UPDATE irises SET
@@ -269,7 +208,7 @@ async fn apply_staging_chunk_in_tx(
     let result = sqlx::query(&update_sql)
         .bind(epoch)
         .bind(chunk_id)
-        .execute(&mut **tx)
+        .execute(&mut *tx)
         .await?;
     let rows_updated = result.rows_affected();
 
@@ -280,7 +219,7 @@ async fn apply_staging_chunk_in_tx(
     sqlx::query(&delete_sql)
         .bind(epoch)
         .bind(chunk_id)
-        .execute(&mut **tx)
+        .execute(&mut *tx)
         .await?;
 
     sqlx::query(
@@ -288,50 +227,11 @@ async fn apply_staging_chunk_in_tx(
     )
     .bind(epoch)
     .bind(chunk_id)
-    .execute(&mut **tx)
+    .execute(&mut *tx)
     .await?;
 
-    Ok(rows_updated)
-}
-
-/// Apply a chunk under the modification fence in one transaction.
-///
-/// Transaction scope:
-///   1. Acquire `pg_advisory_xact_lock(RERAND_MODIFY_LOCK)`
-///   2. Compute local diverged IDs
-///   3. Prune union(cross_party_diverged, local_diverged) from staging
-///   4. Apply staging chunk to live (`RERAND_APPLY_LOCK` is acquired inside)
-///   5. Commit (releasing both transaction locks)
-pub async fn fenced_apply_chunk(
-    pool: &PgPool,
-    staging_schema: &str,
-    epoch: i32,
-    chunk_id: i32,
-    cross_party_divergent: Vec<i64>,
-) -> Result<(u64, usize)> {
-    validate_identifier(staging_schema)?;
-    let mut tx = pool.begin().await?;
-    sqlx::query("SELECT pg_advisory_xact_lock($1)")
-        .bind(RERAND_MODIFY_LOCK)
-        .execute(&mut *tx)
-        .await?;
-
-    let local_divergent =
-        get_locally_divergent_ids_tx(&mut tx, staging_schema, epoch, chunk_id).await?;
-
-    let mut skip_ids = cross_party_divergent;
-    skip_ids.extend(&local_divergent);
-    skip_ids.sort_unstable();
-    skip_ids.dedup();
-    let skip_count = skip_ids.len();
-
-    if !skip_ids.is_empty() {
-        delete_staging_ids_tx(&mut tx, staging_schema, epoch, &skip_ids).await?;
-    }
-
-    let rows = apply_staging_chunk_in_tx(&mut tx, staging_schema, epoch, chunk_id).await?;
     tx.commit().await?;
-    Ok((rows, skip_count))
+    Ok(rows_updated)
 }
 
 pub async fn upsert_rerand_progress(pool: &PgPool, epoch: i32, chunk_id: i32) -> Result<()> {
@@ -398,6 +298,48 @@ pub async fn get_max_confirmed_chunk(pool: &PgPool, epoch: i32) -> Result<Option
     Ok(row.0)
 }
 
+/// Returns the highest `chunk_id` where `live_applied = TRUE` for a given
+/// epoch, or `None` if no chunks have been applied in that epoch yet.
+pub async fn get_max_applied_chunk_for_epoch(
+    pool: &PgPool,
+    epoch: i32,
+) -> Result<Option<i32>> {
+    let row: (Option<i32>,) = sqlx::query_as(
+        "SELECT MAX(chunk_id) FROM rerand_progress WHERE epoch = $1 AND live_applied = TRUE",
+    )
+    .bind(epoch)
+    .fetch_one(pool)
+    .await?;
+    Ok(row.0)
+}
+
+/// Delete all staging rows for epochs older than `current_epoch`.
+pub async fn delete_staging_for_old_epochs(
+    pool: &PgPool,
+    staging_schema: &str,
+    current_epoch: i32,
+) -> Result<u64> {
+    validate_identifier(staging_schema)?;
+    let sql = format!(
+        r#"DELETE FROM "{}".irises WHERE epoch < $1"#,
+        staging_schema
+    );
+    let result = sqlx::query(&sql)
+        .bind(current_epoch)
+        .execute(pool)
+        .await?;
+    Ok(result.rows_affected())
+}
+
+/// Delete rerand progress rows for epochs older than `current_epoch`.
+pub async fn delete_rerand_progress_for_old_epochs(pool: &PgPool, current_epoch: i32) -> Result<u64> {
+    let result = sqlx::query("DELETE FROM rerand_progress WHERE epoch < $1")
+        .bind(current_epoch)
+        .execute(pool)
+        .await?;
+    Ok(result.rows_affected())
+}
+
 /// Returns the highest epoch that has any rerand_progress rows.
 pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
     let row: (Option<i32>,) = sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
@@ -413,8 +355,7 @@ pub async fn get_current_epoch(pool: &PgPool) -> Result<Option<i32>> {
 /// Build the rerand sync state from the local `rerand_progress` table.
 ///
 /// Returns `Ok(None)` when the `rerand_progress` table does not exist yet
-/// (rolling deploy before migration). Returns `Err` for real DB failures
-/// so callers can distinguish "not migrated" from "broken".
+/// (rolling deploy before migration). Returns `Err` for real DB failures.
 pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<Option<RerandSyncState>> {
     let epoch = match get_current_epoch(pool).await {
         Ok(e) => e.unwrap_or(0),
@@ -425,10 +366,10 @@ pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<Option<RerandSyncS
             return Err(e);
         }
     };
-    let max_confirmed = get_max_confirmed_chunk(pool, epoch).await?.unwrap_or(-1);
+    let max_applied = get_max_applied_chunk_for_epoch(pool, epoch).await?.unwrap_or(-1);
     Ok(Some(RerandSyncState {
         epoch,
-        max_confirmed_chunk: max_confirmed,
+        max_applied_chunk: max_applied,
     }))
 }
 
@@ -436,8 +377,7 @@ fn is_undefined_table(err: &eyre::Report) -> bool {
     if let Some(db_err) = err.root_cause().downcast_ref::<sqlx::Error>() {
         return is_undefined_table_sqlx(db_err);
     }
-    // Also check the direct error (not just root cause).
-    format!("{:?}", err).contains("42P01")
+    false
 }
 
 fn is_undefined_table_sqlx(err: &sqlx::Error) -> bool {
@@ -447,398 +387,377 @@ fn is_undefined_table_sqlx(err: &sqlx::Error) -> bool {
     false
 }
 
-/// Check whether all locally confirmed chunks have been applied to live.
-///
-/// Returns `Ok(true)` when no confirmed-but-unapplied chunks remain,
-/// `Ok(true)` when the `rerand_progress` table doesn't exist yet
-/// (rolling deploy), and `Err` on real DB failures.
-async fn check_pending_chunks_applied(conn: &mut sqlx::PgConnection) -> Result<bool> {
-    let pending: (i64,) = match sqlx::query_as(
-        "SELECT COUNT(*) FROM rerand_progress \
-         WHERE all_confirmed = TRUE AND live_applied = FALSE",
+
+// ---------------------------------------------------------------------------
+// Freeze protocol: coordinated pause of the rerand worker during startup
+// ---------------------------------------------------------------------------
+
+const FREEZE_TIMEOUT: Duration = Duration::from_secs(120);
+const FREEZE_POLL: Duration = Duration::from_secs(2);
+
+fn rerand_control_exists(err: &sqlx::Error) -> bool {
+    !is_undefined_table_sqlx(err)
+}
+
+/// Request the rerand worker to freeze. Writes a unique `freeze_generation`
+/// to `rerand_control`. Returns the generation token.
+pub async fn request_rerand_freeze(pool: &PgPool) -> Result<Option<String>> {
+    let generation = uuid::Uuid::new_v4().to_string();
+    match sqlx::query(
+        "UPDATE rerand_control SET freeze_requested = TRUE, freeze_generation = $1, frozen_generation = NULL WHERE id = 1",
     )
-    .fetch_one(&mut *conn)
+    .bind(&generation)
+    .execute(pool)
     .await
     {
-        Ok(row) => row,
-        Err(e) if is_undefined_table_sqlx(&e) => return Ok(true),
-        Err(e) => return Err(e.into()),
-    };
-    Ok(pending.0 == 0)
+        Ok(_) => Ok(Some(generation)),
+        Err(e) if !rerand_control_exists(&e) => {
+            tracing::info!("rerand_control table missing; skipping freeze (pre-migration)");
+            Ok(None)
+        }
+        Err(e) => Err(e.into()),
+    }
 }
 
-/// Highest `(epoch, chunk_id)` where `live_applied = TRUE`.
-/// Returns `None` when no chunks have been applied yet.
-async fn get_applied_watermark(conn: &mut sqlx::PgConnection) -> Result<Option<(i32, i32)>> {
-    let row: Option<(i32, i32)> = match sqlx::query_as(
-        "SELECT epoch, chunk_id FROM rerand_progress \
-         WHERE live_applied = TRUE \
-         ORDER BY epoch DESC, chunk_id DESC \
-         LIMIT 1",
+/// Wait until the rerand worker acknowledges the freeze by writing
+/// `frozen_generation = generation`. Fails closed on timeout.
+pub async fn wait_for_rerand_frozen(pool: &PgPool, generation: &str) -> Result<()> {
+    let deadline = tokio::time::Instant::now() + FREEZE_TIMEOUT;
+    loop {
+        let row: Option<(Option<String>,)> = sqlx::query_as(
+            "SELECT frozen_generation FROM rerand_control WHERE id = 1",
+        )
+        .fetch_optional(pool)
+        .await?;
+
+        if let Some((Some(frozen_gen),)) = row {
+            if frozen_gen == generation {
+                tracing::info!("Rerand worker confirmed freeze (generation={})", generation);
+                return Ok(());
+            }
+        }
+
+        if tokio::time::Instant::now() >= deadline {
+            eyre::bail!(
+                "Rerand worker did not acknowledge freeze after {:?} (generation={}). \
+                 Ensure the rerand worker is running and healthy.",
+                FREEZE_TIMEOUT,
+                generation,
+            );
+        }
+        tokio::time::sleep(FREEZE_POLL).await;
+    }
+}
+
+/// Called by the rerand worker between chunks. If a freeze is requested,
+/// acknowledge it and block until the freeze is lifted. Returns `true` if
+/// the worker should continue, `false` if cancelled while frozen.
+pub async fn check_and_handle_freeze(
+    pool: &PgPool,
+    cancel: Option<&tokio_util::sync::CancellationToken>,
+) -> Result<bool> {
+    let row: Option<(bool, Option<String>)> = match sqlx::query_as(
+        "SELECT freeze_requested, freeze_generation FROM rerand_control WHERE id = 1",
     )
-    .fetch_optional(&mut *conn)
+    .fetch_optional(pool)
     .await
     {
-        Ok(row) => row,
-        Err(e) if is_undefined_table_sqlx(&e) => return Ok(None),
+        Ok(r) => r,
+        Err(e) if !rerand_control_exists(&e) => return Ok(true),
         Err(e) => return Err(e.into()),
     };
-    Ok(row)
-}
 
-/// Highest (epoch, max_confirmed_chunk) reported by any peer in the
-/// startup snapshot. Returns `None` when no peer has rerand state
-/// (pre-migration rolling deploy).
-fn peer_rerand_target(sync_result: &SyncResult) -> Option<(i32, i32)> {
-    sync_result
-        .all_states
-        .iter()
-        .filter_map(|s| s.rerand_state.as_ref())
-        .map(|s| (s.epoch, s.max_confirmed_chunk))
-        .max() // lexicographic: epoch first, then chunk
-}
-
-/// Returns `Ok(())` if the peer snapshot is within protocol tolerance
-/// and `Err` if fatally desynchronized (gap > 1).
-fn validate_rerand_sync_inner(sync_result: &SyncResult) -> Result<()> {
-    let my_state = match sync_result.my_state.rerand_state.as_ref() {
-        Some(s) => s,
-        None => return Ok(()),
+    let Some((true, Some(generation))) = row else {
+        return Ok(true);
     };
-    let my_epoch = my_state.epoch;
-    let my_chunk = my_state.max_confirmed_chunk;
 
-    for s in sync_result
-        .all_states
-        .iter()
-        .filter_map(|s| s.rerand_state.as_ref())
-    {
-        let epoch_diff = s.epoch - my_epoch;
-        match epoch_diff {
-            0 => {
-                let chunk_diff = s.max_confirmed_chunk - my_chunk;
-                if chunk_diff > 1 {
-                    eyre::bail!(
-                        "Fatal chunk desync: peer confirmed chunk {} but local is at {} \
-                         (max possible difference is 1)",
-                        s.max_confirmed_chunk,
-                        my_chunk
-                    );
-                }
+    tracing::info!("Rerand freeze requested (generation={}), pausing...", generation);
+
+    // Acknowledge the freeze.
+    sqlx::query("UPDATE rerand_control SET frozen_generation = $1 WHERE id = 1")
+        .bind(&generation)
+        .execute(pool)
+        .await?;
+
+    let mut current_gen = generation.to_string();
+
+    // Block until freeze is lifted. Re-read freeze_generation each iteration
+    // so that if the requesting server crashes and restarts with a new
+    // generation, we re-acknowledge instead of deadlocking.
+    loop {
+        if cancel.is_some_and(|c| c.is_cancelled()) {
+            return Ok(false);
+        }
+
+        let row: Option<(bool, Option<String>)> = sqlx::query_as(
+            "SELECT freeze_requested, freeze_generation FROM rerand_control WHERE id = 1",
+        )
+        .fetch_optional(pool)
+        .await?;
+
+        match row {
+            Some((false, _)) | None => {
+                tracing::info!("Rerand freeze lifted, resuming");
+                return Ok(true);
             }
-            1 => {}
-            -1 => {}
-            _ => {
-                eyre::bail!(
-                    "Fatal epoch desync: local epoch is {}, but peer is on epoch {}",
-                    my_epoch,
-                    s.epoch
+            Some((true, Some(ref new_gen))) if *new_gen != current_gen => {
+                tracing::info!(
+                    "Rerand freeze generation changed ({} -> {}), re-acknowledging",
+                    current_gen,
+                    new_gen
                 );
+                sqlx::query("UPDATE rerand_control SET frozen_generation = $1 WHERE id = 1")
+                    .bind(new_gen)
+                    .execute(pool)
+                    .await?;
+                current_gen = new_gen.clone();
             }
+            _ => {}
         }
-    }
 
-    Ok(())
+        tokio::time::sleep(FREEZE_POLL).await;
+    }
 }
 
-const RERAND_READY_TIMEOUT: Duration = Duration::from_secs(60);
-const RERAND_READY_POLL: Duration = Duration::from_secs(2);
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-enum StartupReadiness {
-    Ready,
-    Behind,
-    Ahead {
-        local_applied: (i32, i32),
-        target: (i32, i32),
-    },
+/// Lift the freeze and clear the generation. Called after `load_iris_db`.
+/// Retries on transient DB errors to avoid leaving the worker permanently frozen.
+/// Silently succeeds if the `rerand_control` table doesn't exist (pre-migration).
+pub async fn release_rerand_freeze(pool: &PgPool) -> Result<()> {
+    for attempt in 0..5 {
+        match sqlx::query(
+            "UPDATE rerand_control SET freeze_requested = FALSE, freeze_generation = NULL, frozen_generation = NULL WHERE id = 1",
+        )
+        .execute(pool)
+        .await
+        {
+            Ok(_) => {
+                tracing::info!("Rerand freeze released");
+                return Ok(());
+            }
+            Err(e) if !rerand_control_exists(&e) => {
+                return Ok(());
+            }
+            Err(e) => {
+                tracing::warn!(
+                    "Failed to release rerand freeze (attempt {}): {:?}",
+                    attempt + 1,
+                    e
+                );
+                tokio::time::sleep(FREEZE_POLL).await;
+            }
+        }
+    }
+    eyre::bail!("Failed to release rerand freeze after 5 attempts — worker may be stuck frozen");
 }
 
-fn classify_startup_readiness_for_target(
-    local_applied: (i32, i32),
-    target: (i32, i32),
-) -> StartupReadiness {
-    if local_applied.cmp(&target) == Ordering::Greater {
-        return StartupReadiness::Ahead {
-            local_applied,
-            target,
-        };
-    }
+/// Acquire `RERAND_APPLY_LOCK` on a detached connection. The lock is held
+/// through `load_iris_db` to prevent any concurrent rerand applies (belt
+/// and suspenders — the freeze should already have paused the worker).
+pub async fn acquire_apply_lock(pool: &PgPool) -> Result<Option<sqlx::PgConnection>> {
+    let mut conn = pool.acquire().await?;
 
-    if target.1 < 0 {
-        // No confirmed chunks exist in target_epoch yet.
-        return StartupReadiness::Ready;
+    // If rerand tables don't exist yet, skip.
+    match sqlx::query_as::<_, (i64,)>(
+        "SELECT COUNT(*) FROM rerand_progress LIMIT 1",
+    )
+    .fetch_one(&mut *conn)
+    .await
+    {
+        Err(e) if is_undefined_table_sqlx(&e) => return Ok(None),
+        Err(e) => return Err(e.into()),
+        Ok(_) => {}
     }
 
-    if local_applied == target {
-        StartupReadiness::Ready
-    } else {
-        StartupReadiness::Behind
-    }
+    sqlx::query("SELECT pg_advisory_lock($1)")
+        .bind(RERAND_APPLY_LOCK)
+        .execute(&mut *conn)
+        .await?;
+
+    Ok(Some(conn.detach()))
 }
 
-async fn get_startup_readiness(
-    conn: &mut sqlx::PgConnection,
-    target: Option<(i32, i32)>,
-) -> Result<StartupReadiness> {
-    if !check_pending_chunks_applied(conn).await? {
-        return Ok(StartupReadiness::Behind);
+/// Release the advisory lock and close the connection.
+pub async fn release_apply_lock(lock_conn: Option<sqlx::PgConnection>) -> Result<()> {
+    if let Some(mut conn) = lock_conn {
+        let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
+            .bind(RERAND_APPLY_LOCK)
+            .execute(&mut conn)
+            .await;
+        drop(conn);
+        tracing::info!("RERAND_APPLY_LOCK released after DB load");
     }
+    Ok(())
+}
 
-    let Some(target) = target else {
-        return Ok(StartupReadiness::Ready);
+/// Get the local applied watermark: `(epoch, max_chunk_id)` where
+/// `live_applied = TRUE`. Returns `None` pre-migration or if no chunks
+/// have been applied.
+pub async fn get_applied_watermark_from_pool(pool: &PgPool) -> Result<Option<(i32, i32)>> {
+    let row: Option<(i32, i32)> = match sqlx::query_as(
+        "SELECT epoch, chunk_id FROM rerand_progress \
+         WHERE live_applied = TRUE \
+         ORDER BY epoch DESC, chunk_id DESC \
+         LIMIT 1",
+    )
+    .fetch_optional(pool)
+    .await
+    {
+        Ok(row) => row,
+        Err(e) if is_undefined_table_sqlx(&e) => return Ok(None),
+        Err(e) => return Err(e.into()),
     };
-
-    let local_applied = get_applied_watermark(conn).await?.unwrap_or((-1, -1));
-    Ok(classify_startup_readiness_for_target(local_applied, target))
+    Ok(row)
 }
 
-/// Wait for local rerand progress to reach the startup snapshot target,
-/// then hold `RERAND_APPLY_LOCK` through DB load.
-///
-/// The loop is lock-first:
-/// 1. acquire `pg_advisory_lock(RERAND_APPLY_LOCK)`,
-/// 2. check readiness while applies are frozen,
-/// 3. if behind, unlock and retry after a short sleep.
+async fn fetch_peer_watermark(host: &str, port: usize) -> Result<Option<(i32, i32)>> {
+    let url = format!("http://{}:{}/rerand-watermark", host, port);
+    let client = reqwest::Client::builder()
+        .timeout(Duration::from_secs(10))
+        .build()?;
+    let resp = client
+        .get(&url)
+        .send()
+        .await
+        .map_err(|e| eyre::eyre!("Failed to reach {} for watermark: {}", url, e))?;
+    if !resp.status().is_success() {
+        eyre::bail!(
+            "Peer {} returned HTTP {} for watermark",
+            url,
+            resp.status()
+        );
+    }
+    let body = resp
+        .text()
+        .await
+        .map_err(|e| eyre::eyre!("Failed to read watermark from {}: {}", url, e))?;
+
+    if body.trim() == "null" {
+        return Ok(None);
+    }
+    let v: serde_json::Value = serde_json::from_str(&body)
+        .map_err(|e| eyre::eyre!("Failed to parse watermark from {}: {}", url, e))?;
+    Ok(Some((
+        v["epoch"]
+            .as_i64()
+            .ok_or_else(|| eyre::eyre!("Missing epoch in watermark from {}", url))?
+            as i32,
+        v["max_applied_chunk"]
+            .as_i64()
+            .ok_or_else(|| eyre::eyre!("Missing max_applied_chunk in watermark from {}", url))?
+            as i32,
+    )))
+}
+
+/// Freeze the local rerand worker, then verify all peers report the exact
+/// same applied watermark. If this party is behind, release the freeze
+/// briefly so the worker can catch up, then re-freeze and re-check.
+/// If this party is at the max, stay frozen and wait for peers to catch up.
 ///
-/// This avoids startup/apply races without a separate startup-cap table.
-pub async fn rerand_validate_and_lock(
+/// Guarantees: when this returns `Ok(())`, the local worker is frozen and
+/// all parties have the same `(epoch, max_applied_chunk)`.
+/// On any error, the freeze is released before the error propagates.
+pub async fn freeze_and_verify_watermarks(
     pool: &PgPool,
-    sync_result: &SyncResult,
-) -> Result<Option<sqlx::pool::PoolConnection<sqlx::Postgres>>> {
-    if sync_result.my_state.rerand_state.is_none() {
-        tracing::info!("Rerand startup lock: skipped (rerand tables not yet migrated)");
-        return Ok(None);
+    peers: &[(&str, usize)],
+) -> Result<()> {
+    if peers.is_empty() {
+        eyre::bail!("freeze_and_verify_watermarks called with no peers");
     }
 
-    // One-shot fatal desync check (gap > 1 -> bail).
-    validate_rerand_sync_inner(sync_result)?;
+    let result = freeze_and_verify_inner(pool, peers).await;
+    if result.is_err() {
+        if let Err(release_err) = release_rerand_freeze(pool).await {
+            tracing::error!(
+                "Failed to release rerand freeze during error cleanup: {:?}. \
+                 Worker may be stuck frozen until next successful startup.",
+                release_err
+            );
+        }
+    }
+    result
+}
 
-    let target = peer_rerand_target(sync_result);
-    let deadline = tokio::time::Instant::now() + RERAND_READY_TIMEOUT;
+async fn freeze_and_verify_inner(
+    pool: &PgPool,
+    peers: &[(&str, usize)],
+) -> Result<()> {
+    let deadline = tokio::time::Instant::now() + FREEZE_TIMEOUT;
 
     loop {
-        let mut conn = pool.acquire().await?;
-        let got_lock: (bool,) = sqlx::query_as("SELECT pg_try_advisory_lock($1)")
-            .bind(RERAND_APPLY_LOCK)
-            .fetch_one(&mut *conn)
-            .await?;
-        if !got_lock.0 {
-            drop(conn);
+        let gen = match request_rerand_freeze(pool).await? {
+            Some(g) => g,
+            None => return Ok(()), // pre-migration, no rerand tables
+        };
+        wait_for_rerand_frozen(pool, &gen).await?;
+
+        loop {
             if tokio::time::Instant::now() >= deadline {
+                release_rerand_freeze(pool).await?;
                 eyre::bail!(
-                    "Rerand lock not available after {:?} (target={:?}); \
-                     ensure the rerand worker is healthy.",
-                    RERAND_READY_TIMEOUT,
-                    target
+                    "Rerand watermark convergence timeout after {:?}. \
+                     Ensure all rerand workers and main servers are healthy.",
+                    FREEZE_TIMEOUT,
                 );
             }
-            tokio::time::sleep(RERAND_READY_POLL).await;
-            continue;
-        }
 
-        let readiness = match get_startup_readiness(&mut conn, target).await {
-            Ok(readiness) => readiness,
-            Err(e) => {
-                let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
-                    .bind(RERAND_APPLY_LOCK)
-                    .execute(&mut *conn)
-                    .await;
-                drop(conn);
-                return Err(e);
+            let local = get_applied_watermark_from_pool(pool).await?;
+            let mut all_equal = true;
+            let mut max_wm = local;
+
+            for (host, port) in peers {
+                let peer = fetch_peer_watermark(host, *port).await?;
+                if peer != local {
+                    all_equal = false;
+                }
+                if peer > max_wm {
+                    max_wm = peer;
+                }
             }
-        };
 
-        match readiness {
-            StartupReadiness::Ready => return Ok(Some(conn)),
-            StartupReadiness::Ahead {
-                local_applied,
-                target,
-            } => {
-                let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
-                    .bind(RERAND_APPLY_LOCK)
-                    .execute(&mut *conn)
-                    .await;
-                drop(conn);
-                eyre::bail!(
-                    "Rerand advanced past startup snapshot target: local_applied={:?}, target={:?}. \
-                     Restart and retry startup.",
-                    local_applied,
-                    target,
+            if all_equal {
+                tracing::info!(
+                    "Rerand watermark equality confirmed across all parties: {:?}",
+                    local
                 );
+                return Ok(());
             }
-            StartupReadiness::Behind => {
-                let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
-                    .bind(RERAND_APPLY_LOCK)
-                    .execute(&mut *conn)
-                    .await;
-                drop(conn);
+
+            if local < max_wm {
+                tracing::info!(
+                    "Local watermark {:?} behind max {:?}, releasing freeze to catch up",
+                    local,
+                    max_wm
+                );
+                release_rerand_freeze(pool).await?;
+                tokio::time::sleep(Duration::from_secs(5)).await;
+                break; // outer loop will re-freeze and re-check
             }
-        }
 
-        if tokio::time::Instant::now() >= deadline {
-            eyre::bail!(
-                "Rerand not caught up after {:?} (target={:?}); \
-                 ensure the rerand worker is running.",
-                RERAND_READY_TIMEOUT,
-                target
+            tracing::info!(
+                "Local watermark {:?} at max, waiting for peers to catch up...",
+                local
             );
+            tokio::time::sleep(FREEZE_POLL).await;
         }
-
-        tracing::info!(
-            "Waiting for rerand worker catch-up (target={:?}, {:.0}s left)...",
-            target,
-            deadline
-                .saturating_duration_since(tokio::time::Instant::now())
-                .as_secs_f64(),
-        );
-        tokio::time::sleep(RERAND_READY_POLL).await;
     }
 }
 
-/// Release the advisory lock and close the connection.
-///
-/// Explicit release keeps the lock lifecycle clear in logs and avoids
-/// returning a locked connection to the pool.
-pub async fn release_rerand_lock(
-    lock_conn: Option<sqlx::pool::PoolConnection<sqlx::Postgres>>,
-) -> Result<()> {
-    if let Some(mut conn) = lock_conn {
-        let _ = sqlx::query("SELECT pg_advisory_unlock($1)")
-            .bind(RERAND_APPLY_LOCK)
-            .execute(&mut *conn)
-            .await;
-        drop(conn);
-        tracing::info!("Rerand advisory lock released after DB load");
-    }
-    Ok(())
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
-    use iris_mpc_common::config::CommonConfig;
-    use iris_mpc_common::helpers::sync::SyncState;
-
-    fn dummy_sync_state(epoch: i32, max_confirmed_chunk: i32) -> SyncState {
-        SyncState {
-            db_len: 100,
-            modifications: vec![],
-            next_sns_sequence_num: None,
-            common_config: CommonConfig::default(),
-            rerand_state: Some(RerandSyncState {
-                epoch,
-                max_confirmed_chunk,
-            }),
-        }
-    }
 
     #[test]
-    fn test_validate_peer_one_chunk_ahead_ok() {
-        let p0 = dummy_sync_state(1, 4);
-        let p1 = dummy_sync_state(1, 4);
-        let p2 = dummy_sync_state(1, 5);
-        let sync_result = SyncResult {
-            my_state: p0.clone(),
-            all_states: vec![p0, p1, p2],
-        };
-        assert!(validate_rerand_sync_inner(&sync_result).is_ok());
+    fn test_staging_schema_name() {
+        assert_eq!(staging_schema_name("public"), "public_rerand_staging");
     }
 
     #[test]
-    fn test_validate_all_same_ok() {
-        let p0 = dummy_sync_state(1, 5);
-        let p1 = dummy_sync_state(1, 5);
-        let p2 = dummy_sync_state(1, 5);
-        let sync_result = SyncResult {
-            my_state: p0.clone(),
-            all_states: vec![p0, p1, p2],
-        };
-        assert!(validate_rerand_sync_inner(&sync_result).is_ok());
+    fn test_validate_identifier_ok() {
+        assert!(validate_identifier("public_rerand_staging").is_ok());
     }
 
     #[test]
-    fn test_validate_peer_epoch_ahead_ok() {
-        let p0 = dummy_sync_state(0, 5);
-        let p1 = dummy_sync_state(1, 0);
-        let p2 = dummy_sync_state(0, 5);
-        let sync_result = SyncResult {
-            my_state: p0.clone(),
-            all_states: vec![p0, p1, p2],
-        };
-        assert!(validate_rerand_sync_inner(&sync_result).is_ok());
-    }
-
-    #[test]
-    fn test_validate_peer_epoch_behind_ok() {
-        let p0 = dummy_sync_state(1, 2);
-        let p1 = dummy_sync_state(0, 10);
-        let p2 = dummy_sync_state(1, 2);
-        let sync_result = SyncResult {
-            my_state: p0.clone(),
-            all_states: vec![p0, p1, p2],
-        };
-        assert!(validate_rerand_sync_inner(&sync_result).is_ok());
-    }
-
-    #[test]
-    fn test_validate_fatal_chunk_desync() {
-        let p0 = dummy_sync_state(1, 2);
-        let p1 = dummy_sync_state(1, 4);
-        let p2 = dummy_sync_state(1, 2);
-        let sync_result = SyncResult {
-            my_state: p0.clone(),
-            all_states: vec![p0, p1, p2],
-        };
-        assert!(validate_rerand_sync_inner(&sync_result).is_err());
-    }
-
-    #[test]
-    fn test_validate_fatal_epoch_desync() {
-        let p0 = dummy_sync_state(1, 2);
-        let p1 = dummy_sync_state(3, 10);
-        let p2 = dummy_sync_state(1, 2);
-        let sync_result = SyncResult {
-            my_state: p0.clone(),
-            all_states: vec![p0, p1, p2],
-        };
-        assert!(validate_rerand_sync_inner(&sync_result).is_err());
-    }
-
-    #[test]
-    fn test_classify_target_chunk_minus_one_previous_epoch_applied_is_ready() {
-        let readiness = classify_startup_readiness_for_target((0, 42), (1, -1));
-        assert_eq!(readiness, StartupReadiness::Ready);
-    }
-
-    #[test]
-    fn test_classify_target_chunk_minus_one_same_epoch_applied_is_ahead() {
-        let readiness = classify_startup_readiness_for_target((1, 0), (1, -1));
-        assert_eq!(
-            readiness,
-            StartupReadiness::Ahead {
-                local_applied: (1, 0),
-                target: (1, -1)
-            }
-        );
-    }
-
-    #[test]
-    fn test_classify_target_positive_behind_ready_ahead() {
-        assert_eq!(
-            classify_startup_readiness_for_target((1, 2), (1, 3)),
-            StartupReadiness::Behind
-        );
-        assert_eq!(
-            classify_startup_readiness_for_target((1, 3), (1, 3)),
-            StartupReadiness::Ready
-        );
-        assert_eq!(
-            classify_startup_readiness_for_target((1, 4), (1, 3)),
-            StartupReadiness::Ahead {
-                local_applied: (1, 4),
-                target: (1, 3)
-            }
-        );
+    fn test_validate_identifier_rejects_injection() {
+        assert!(validate_identifier("public; DROP TABLE irises").is_err());
     }
 }
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 84e8430cab..436ed1d54a 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -4,9 +4,11 @@ use bytemuck::cast_slice;
 use eyre::Result;
 use futures::StreamExt;
 use iris_mpc_store::rerand::{
-    delete_staging_chunk, fenced_apply_chunk, get_current_epoch, get_rerand_progress,
-    get_staging_version_map, insert_staging_irises, set_all_confirmed, set_staging_written,
-    staging_schema_name, upsert_rerand_progress, StagingIrisEntry,
+    apply_confirmed_chunk, check_and_handle_freeze, delete_staging_chunk,
+    delete_staging_for_old_epochs, delete_rerand_progress_for_old_epochs, get_current_epoch,
+    get_max_applied_chunk_for_epoch, get_rerand_progress, get_staging_version_map,
+    insert_staging_irises, set_all_confirmed, set_staging_written, staging_schema_name,
+    upsert_rerand_progress, StagingIrisEntry,
 };
 use iris_mpc_store::Store;
 use sqlx::PgPool;
@@ -31,6 +33,13 @@ pub async fn run_continuous_rerand(
     store: &Store,
     cancel: Option<&CancellationToken>,
 ) -> Result<()> {
+    if config.chunk_size == 0 {
+        eyre::bail!("chunk_size must be > 0");
+    }
+    if config.s3_poll_interval_ms == 0 {
+        eyre::bail!("s3_poll_interval_ms must be > 0");
+    }
+
     let pool = &store.pool;
     let staging_schema = staging_schema_name(&store.schema_name);
     let poll_interval = Duration::from_millis(config.s3_poll_interval_ms);
@@ -41,6 +50,10 @@ pub async fn run_continuous_rerand(
             return Ok(());
         }
 
+        if !check_and_handle_freeze(pool, cancel).await? {
+            return Ok(());
+        }
+
         let epoch_hint = get_current_epoch(pool).await?.map(|e| e as u32);
         let active_epoch = epoch::determine_active_epoch(s3, &config.s3_bucket, epoch_hint).await?;
         tracing::info!("Active epoch: {}", active_epoch);
@@ -65,25 +78,49 @@ pub async fn run_continuous_rerand(
             manifest.max_id_inclusive
         );
 
-        let mut chunk_id: u32 = 0;
+        let cleaned =
+            delete_staging_for_old_epochs(pool, &staging_schema, active_epoch as i32).await?;
+        if cleaned > 0 {
+            tracing::info!(
+                "Epoch {}: cleaned {} orphaned staging rows from prior epochs",
+                active_epoch,
+                cleaned
+            );
+        }
+        let cleaned_progress =
+            delete_rerand_progress_for_old_epochs(pool, active_epoch as i32).await?;
+        if cleaned_progress > 0 {
+            tracing::info!(
+                "Epoch {}: cleaned {} rerand_progress rows from prior epochs",
+                active_epoch,
+                cleaned_progress
+            );
+        }
+
+        let start_chunk_id = get_max_applied_chunk_for_epoch(pool, active_epoch as i32)
+            .await?
+            .map(|max_chunk| (max_chunk + 1) as u32)
+            .unwrap_or(0);
+
+        let mut chunk_id: u32 = start_chunk_id;
         loop {
             if is_cancelled(cancel) {
                 return Ok(());
             }
 
+            // Honor startup freeze requests between chunks.
+            if !check_and_handle_freeze(pool, cancel).await? {
+                return Ok(());
+            }
+
             if manifest.chunk_is_empty(chunk_id) {
                 break;
             }
 
             let progress = get_rerand_progress(pool, active_epoch as i32, chunk_id as i32).await?;
-
-            if progress.as_ref().is_some_and(|p| p.live_applied) {
-                chunk_id += 1;
-                continue;
-            }
-
             upsert_rerand_progress(pool, active_epoch as i32, chunk_id as i32).await?;
 
+            // --- Stage ---
             if !progress.as_ref().is_some_and(|p| p.staging_written) {
                 process_chunk_staging(
                     pool,
@@ -96,11 +133,10 @@ pub async fn run_continuous_rerand(
                     &manifest,
                 )
                 .await?;
-
                 set_staging_written(pool, active_epoch as i32, chunk_id as i32).await?;
             }
 
-            // Upload version map + staged marker (both idempotent).
+            // --- Upload version map + staged marker (both idempotent) ---
             if !progress.as_ref().is_some_and(|p| p.all_confirmed) {
                 let version_map = get_staging_version_map(
                     pool,
@@ -137,6 +173,7 @@ pub async fn run_continuous_rerand(
                 return Ok(());
             }
 
+            // --- Wait for all parties to confirm staging ---
             if !progress.as_ref().is_some_and(|p| p.all_confirmed) {
                 s3_coordination::poll_chunk_staged_all(
                     s3,
@@ -146,7 +183,6 @@ pub async fn run_continuous_rerand(
                     poll_interval,
                 )
                 .await?;
-
                 set_all_confirmed(pool, active_epoch as i32, chunk_id as i32).await?;
                 tracing::info!(
                     "Epoch {} chunk {}: all parties confirmed",
@@ -159,9 +195,10 @@ pub async fn run_continuous_rerand(
                 return Ok(());
             }
 
-            // --- Modification fence ---
-            // 1. Compute cross-party version_id disagreements (before lock)
-            let cross_party_divergent = s3_coordination::compute_cross_party_divergent_ids(
+            // --- Apply ---
+            // 1. Compute staging-time cross-party disagreements from version maps.
+            //    This is pure S3 reads — no DB lock held.
+            let staging_divergent = s3_coordination::compute_cross_party_divergent_ids(
                 s3,
                 &config.s3_bucket,
                 active_epoch,
@@ -170,23 +207,25 @@ pub async fn run_continuous_rerand(
             )
             .await?;
 
-            // 2-6. Lock, check, prune, apply, unlock — helper guarantees
-            //      unlock on all error paths.
-            let (rows, skip_count) = fenced_apply_chunk(
+            // 2. Apply under lock. The function acquires RERAND_MODIFY_LOCK +
+            //    RERAND_APPLY_LOCK, deletes staging_divergent, applies via
+            //    version_id CAS, cleans up staging, and commits.
+            //    No S3 I/O happens while the lock is held.
+            let rows = apply_confirmed_chunk(
                 pool,
                 &staging_schema,
                 active_epoch as i32,
                 chunk_id as i32,
-                cross_party_divergent,
+                &staging_divergent,
             )
             .await?;
 
             tracing::info!(
-                "Epoch {} chunk {}: applied to live DB ({} rows updated, {} skipped)",
+                "Epoch {} chunk {}: applied to live DB ({} rows updated, {} staging-divergent skipped)",
                 active_epoch,
                 chunk_id,
                 rows,
-                skip_count,
+                staging_divergent.len(),
             );
 
             chunk_id += 1;
@@ -196,12 +235,15 @@ pub async fn run_continuous_rerand(
             }
         }
 
-        if chunk_id == 0 && chunk_delay > Duration::ZERO {
+        if chunk_id == 0 {
+            let empty_epoch_sleep = chunk_delay.max(Duration::from_secs(30));
             tracing::info!(
-                "Epoch {} is empty, sleeping to avoid spinning",
-                active_epoch
+                "Epoch {} is empty (max_id_inclusive={}), sleeping {:.0}s to avoid spinning",
+                active_epoch,
+                manifest.max_id_inclusive,
+                empty_epoch_sleep.as_secs_f64(),
             );
-            sleep(chunk_delay).await;
+            sleep(empty_epoch_sleep).await;
         }
 
         epoch::complete_epoch(
@@ -234,15 +276,25 @@ async fn get_or_create_manifest(
             .await;
     }
 
-    if config.party_id == 0 {
-        let local_max = store.get_max_serial_id().await? as u64;
-        s3_coordination::upload_max_id(s3, &config.s3_bucket, epoch, 0, local_max).await?;
+    let local_max = store.get_max_serial_id().await? as u64;
+    s3_coordination::upload_max_id(s3, &config.s3_bucket, epoch, config.party_id, local_max)
+        .await?;
 
+    if config.party_id == 0 {
         let all_max_ids =
             s3_coordination::download_all_max_ids(s3, &config.s3_bucket, epoch, poll_interval)
                 .await?;
         let min_max = *all_max_ids.iter().min().unwrap();
         let max_id_inclusive = min_max.saturating_sub(config.safety_buffer_ids);
+        if max_id_inclusive == 0 {
+            tracing::warn!(
+                "Epoch {}: max_id_inclusive is 0 (min_max={}, safety_buffer_ids={}). \
+                 Epoch will be empty.",
+                epoch,
+                min_max,
+                config.safety_buffer_ids
+            );
+        }
 
         let manifest = Manifest {
             epoch,
@@ -258,10 +310,6 @@ async fn get_or_create_manifest(
         );
         Ok(manifest)
     } else {
-        let local_max = store.get_max_serial_id().await? as u64;
-        s3_coordination::upload_max_id(s3, &config.s3_bucket, epoch, config.party_id, local_max)
-            .await?;
-
         s3_coordination::download_manifest(s3, &config.s3_bucket, epoch, poll_interval).await
     }
 }
@@ -277,8 +325,6 @@ async fn process_chunk_staging(
     chunk_id: u32,
     manifest: &Manifest,
 ) -> Result<()> {
-    // Delete any leftover rows from a previous partial run so all rows in
-    // staging come from one read pass (prevents mixed-snapshot version_ids).
     delete_staging_chunk(pool, staging_schema, epoch as i32, chunk_id as i32).await?;
 
     let (start, end) = manifest.chunk_range(chunk_id);
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
index fe483002d3..1df60cebb5 100644
--- a/iris-mpc-upgrade/src/s3_coordination.rs
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -1,5 +1,6 @@
 use aws_sdk_s3::Client as S3Client;
 use eyre::{eyre, Result};
+use futures::future::try_join_all;
 use serde::{Deserialize, Serialize};
 use std::time::Duration;
 use tokio::time::{sleep, Instant};
@@ -26,7 +27,7 @@ impl Manifest {
     /// IDs are 1-based.
     pub fn chunk_range(&self, chunk_id: u32) -> (u64, u64) {
         let start = 1 + (chunk_id as u64) * self.chunk_size;
-        let end = std::cmp::min(start + self.chunk_size, self.max_id_inclusive + 1);
+        let end = std::cmp::min(start + self.chunk_size, self.max_id_inclusive.saturating_add(1));
         (start, end)
     }
 
@@ -187,13 +188,20 @@ pub async fn download_all_max_ids(
     epoch: u32,
     poll_interval: Duration,
 ) -> Result<[u64; 3]> {
+    let keys: Vec<String> = (0..NUM_PARTIES)
+        .map(|party| format!("{}/max-id", epoch_party_prefix(epoch, party)))
+        .collect();
+
+    for key in &keys {
+        poll_until_marker_exists(s3, bucket, key, poll_interval).await?;
+    }
+
+    let all_bytes: Vec<Vec<u8>> =
+        try_join_all(keys.iter().map(|key| download_marker(s3, bucket, key))).await?;
     let mut ids = [0u64; 3];
-    for party in 0..NUM_PARTIES {
-        let key = format!("{}/max-id", epoch_party_prefix(epoch, party));
-        poll_until_marker_exists(s3, bucket, &key, poll_interval).await?;
-        let bytes = download_marker(s3, bucket, &key).await?;
+    for (party, bytes) in all_bytes.into_iter().enumerate() {
         let s = String::from_utf8(bytes)?;
-        ids[party as usize] = s
+        ids[party] = s
             .trim()
             .parse()
             .map_err(|e| eyre!("Failed to parse max-id from party {}: {}", party, e))?;
@@ -332,9 +340,9 @@ async fn download_chunk_version_map(
 /// Compare version maps across all 3 parties and return IDs where any
 /// party disagrees on the `original_version_id`.
 ///
-/// Fast path: download only the 32-byte blake3 hashes. If all match,
-/// return empty (no disagreements). Slow path (hash mismatch): download
-/// the full maps and compute the exact disagreement set.
+/// Fast path: download only the 32-byte blake3 hashes concurrently. If all
+/// match, return empty (no disagreements). Slow path (hash mismatch):
+/// download the full maps concurrently and compute the exact disagreement set.
 pub async fn compute_cross_party_divergent_ids(
     s3: &S3Client,
     bucket: &str,
@@ -342,12 +350,11 @@ pub async fn compute_cross_party_divergent_ids(
     chunk_id: u32,
     poll_interval: Duration,
 ) -> Result<Vec<i64>> {
-    let mut hashes = Vec::new();
-    for party in 0..NUM_PARTIES {
-        hashes.push(
-            download_chunk_version_hash(s3, bucket, epoch, party, chunk_id, poll_interval).await?,
-        );
-    }
+    let hashes: Vec<[u8; 32]> = try_join_all((0..NUM_PARTIES).map(|party| {
+        download_chunk_version_hash(s3, bucket, epoch, party, chunk_id, poll_interval)
+    }))
+    .await?;
+
     if hashes[0] == hashes[1] && hashes[1] == hashes[2] {
         return Ok(Vec::new());
     }
@@ -359,12 +366,14 @@ pub async fn compute_cross_party_divergent_ids(
     );
 
     use std::collections::HashMap;
-    let mut all_maps: Vec<HashMap<i64, i16>> = Vec::new();
-    for party in 0..NUM_PARTIES {
-        let map =
-            download_chunk_version_map(s3, bucket, epoch, party, chunk_id, poll_interval).await?;
-        all_maps.push(map.into_iter().collect());
-    }
+    let all_maps: Vec<HashMap<i64, i16>> =
+        try_join_all((0..NUM_PARTIES).map(|party| {
+            download_chunk_version_map(s3, bucket, epoch, party, chunk_id, poll_interval)
+        }))
+        .await?
+        .into_iter()
+        .map(|v| v.into_iter().collect::<HashMap<_, _>>())
+        .collect();
 
     let mut divergent = Vec::new();
     let all_ids: std::collections::BTreeSet<i64> =
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index 7387c4a310..7d794bc721 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -2,9 +2,7 @@
 
 use eyre::Result;
 use iris_mpc_common::{
-    config::CommonConfig,
     galois_engine::degree4::FullGaloisRingIrisCodeShare,
-    helpers::sync::{SyncResult, SyncState},
     iris_db::iris::IrisCode,
     postgres::{AccessMode, PostgresClient},
 };
@@ -412,36 +410,16 @@ pub async fn wait_chunks_staged(harness: &TestHarness, epoch: i32, n: i32) -> Re
 // ---- Server simulation ----
 
 pub async fn simulate_server_startup(harness: &TestHarness, party: usize) -> Result<()> {
-    let sync_result = build_test_sync_result(harness, party).await?;
     let pool = &harness.parties[party].store.pool;
-    let lock_conn = rerand_store::rerand_validate_and_lock(pool, &sync_result).await?;
+    let lock_conn = rerand_store::acquire_apply_lock(pool).await?;
     let query_result: Result<(i64,), sqlx::Error> = sqlx::query_as("SELECT COUNT(*) FROM irises")
         .fetch_one(pool)
         .await;
-    rerand_store::release_rerand_lock(lock_conn).await?;
+    rerand_store::release_apply_lock(lock_conn).await?;
     let _count = query_result?;
     Ok(())
 }
 
-async fn build_test_sync_result(harness: &TestHarness, party: usize) -> Result<SyncResult> {
-    let mut all_states = Vec::new();
-    for p in &harness.parties {
-        let rerand_state = rerand_store::build_rerand_sync_state(&p.store.pool).await?;
-        all_states.push(SyncState {
-            db_len: p.store.count_irises().await? as u64,
-            modifications: vec![],
-            next_sns_sequence_num: None,
-            common_config: CommonConfig::default(),
-            rerand_state,
-        });
-    }
-    let my_state = all_states[party].clone();
-    Ok(SyncResult {
-        my_state,
-        all_states,
-    })
-}
-
 pub async fn assert_consistent_rerand_epoch(
     harness: &TestHarness,
     skip_ids: &[i64],
diff --git a/iris-mpc/Cargo.toml b/iris-mpc/Cargo.toml
index 1f56c59d4f..70037d2e42 100644
--- a/iris-mpc/Cargo.toml
+++ b/iris-mpc/Cargo.toml
@@ -27,12 +27,14 @@ iris-mpc-common = { path = "../iris-mpc-common" }
 iris-mpc-store = { path = "../iris-mpc-store" }
 itertools.workspace = true
 metrics.workspace = true
+futures.workspace = true
 serde = { version = "1.0.214", features = ["derive"] }
 iris-mpc-cpu.workspace = true
 chrono.workspace = true
 sqlx.workspace = true
 bincode.workspace = true
 pprof = { version = "0.15.0", features = ["flamegraph", "prost-codec"] }
+axum.workspace = true
 ampc-anon-stats.workspace = true
 ampc-server-utils.workspace = true
 
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index 53976743fb..f42086d490 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -13,8 +13,8 @@ use ampc_server_utils::batch_sync::{CURRENT_BATCH_SHA, CURRENT_BATCH_VALID_ENTRI
 use ampc_server_utils::shutdown_handler::ShutdownHandler;
 use ampc_server_utils::{
     delete_messages_until_sequence_num, get_next_sns_seq_num, get_others_sync_state,
-    init_heartbeat_task, set_node_ready, start_coordination_server, wait_for_others_ready,
-    wait_for_others_unready, BatchSyncSharedState, TaskMonitor,
+    init_heartbeat_task, set_node_ready, start_coordination_server_with_extra_routes,
+    wait_for_others_ready, wait_for_others_unready, BatchSyncSharedState, TaskMonitor,
 };
 use chrono::Utc;
 use eyre::{bail, eyre, Report, Result};
@@ -93,13 +93,46 @@ pub async fn server_main(config: Config) -> Result<()> {
         server_coord_config.healthcheck_ports
     );
 
-    // Start coordination server
-    let (is_ready_flag, verified_peers, my_uuid) = start_coordination_server(
+    // Build a /rerand-watermark route that queries the DB live on each request.
+    let rerand_watermark_route = {
+        let pool = iris_store.pool.clone();
+        axum::Router::new().route(
+            "/rerand-watermark",
+            axum::routing::get(move || {
+                let pool = pool.clone();
+                async move {
+                    let wm = rerand_store::get_applied_watermark_from_pool(&pool).await;
+                    match wm {
+                        Ok(Some((epoch, chunk))) => (
+                            axum::http::StatusCode::OK,
+                            serde_json::to_string(&serde_json::json!({
+                                "epoch": epoch,
+                                "max_applied_chunk": chunk,
+                            }))
+                            .unwrap(),
+                        ),
+                        Ok(None) => (axum::http::StatusCode::OK, "null".to_string()),
+                        Err(e) => {
+                            tracing::warn!("rerand-watermark query failed: {:?}", e);
+                            (
+                                axum::http::StatusCode::INTERNAL_SERVER_ERROR,
+                                format!("DB error: {}", e),
+                            )
+                        }
+                    }
+                }
+            }),
+        )
+    };
+
+    // Start coordination server with the live watermark route injected.
+    let (is_ready_flag, verified_peers, my_uuid) = start_coordination_server_with_extra_routes(
         &server_coord_config,
         &mut background_tasks,
         &shutdown_handler,
         &my_state,
         Some(batch_sync_shared_state.clone()),
+        Some(rerand_watermark_route),
     )
     .await;
     tracing::info!("Coordination server started");
@@ -139,44 +172,77 @@ pub async fn server_main(config: Config) -> Result<()> {
 
     sync_sqs_queues(&config, &sync_result, &aws_clients).await?;
 
-    let rerand_lock_conn =
-        rerand_store::rerand_validate_and_lock(&iris_store.pool, &sync_result).await?;
-
-    if shutdown_handler.is_shutting_down() {
-        tracing::warn!("Shutting down has been triggered");
-        rerand_store::release_rerand_lock(rerand_lock_conn).await?;
-        return Ok(());
+    // --- Coordinated rerand freeze with watermark convergence ---
+    {
+        let sc = config.server_coordination.as_ref().unwrap();
+        eyre::ensure!(
+            sc.node_hostnames.len() == sc.healthcheck_ports.len(),
+            "node_hostnames ({}) and healthcheck_ports ({}) must have the same length",
+            sc.node_hostnames.len(),
+            sc.healthcheck_ports.len(),
+        );
+        let peer_addrs: Vec<(&str, usize)> = sc
+            .node_hostnames
+            .iter()
+            .zip(sc.healthcheck_ports.iter())
+            .enumerate()
+            .filter(|(i, _)| *i != config.party_id)
+            .map(|(_, (h, p))| -> eyre::Result<_> {
+                Ok((h.as_str(), p.parse::<usize>()?))
+            })
+            .collect::<eyre::Result<Vec<_>>>()?;
+        rerand_store::freeze_and_verify_watermarks(&iris_store.pool, &peer_addrs).await?;
     }
+    // Worker is now frozen with verified equal watermarks.
+    // Everything from here until freeze release must be wrapped so that
+    // errors always release the freeze.
+    let frozen_result = async {
+        let rerand_lock_conn = rerand_store::acquire_apply_lock(&iris_store.pool).await?;
+
+        if shutdown_handler.is_shutting_down() {
+            rerand_store::release_apply_lock(rerand_lock_conn).await?;
+            return Ok::<_, eyre::Report>(None);
+        }
+
+        let startup_result = async {
+            let mut hawk_actor = init_hawk_actor(&config, &shutdown_handler).await?;
 
-    let startup_result = async {
-        let mut hawk_actor = init_hawk_actor(&config, &shutdown_handler).await?;
-
-        if let Some(url) = config.get_anon_stats_db_url() {
-            let schema = config.get_anon_stats_db_schema();
-            let anon_client =
-                AnonStatsPgClient::new(&url, &schema, AnonStatsAccessMode::ReadWrite).await?;
-            let anon_store = AnonStatsStore::new(&anon_client).await?;
-            hawk_actor.set_anon_stats_store(Some(anon_store));
-        } else {
-            tracing::warn!(
+            if let Some(url) = config.get_anon_stats_db_url() {
+                let schema = config.get_anon_stats_db_schema();
+                let anon_client =
+                    AnonStatsPgClient::new(&url, &schema, AnonStatsAccessMode::ReadWrite).await?;
+                let anon_store = AnonStatsStore::new(&anon_client).await?;
+                hawk_actor.set_anon_stats_store(Some(anon_store));
+            } else {
+                tracing::warn!(
                     "Anon stats persistence enabled but no anon stats database configured; skipping DB writes"
                 );
+            }
+
+            load_database(
+                &config,
+                &iris_store,
+                &graph_store,
+                &shutdown_handler,
+                &mut hawk_actor,
+            )
+            .await?;
+            Ok::<_, eyre::Report>(hawk_actor)
         }
+        .await;
 
-        load_database(
-            &config,
-            &iris_store,
-            &graph_store,
-            &shutdown_handler,
-            &mut hawk_actor,
-        )
-        .await?;
-        Ok::<_, eyre::Report>(hawk_actor)
+        rerand_store::release_apply_lock(rerand_lock_conn).await?;
+        Ok(Some(startup_result))
     }
     .await;
 
-    rerand_store::release_rerand_lock(rerand_lock_conn).await?;
-    let hawk_actor = startup_result?;
+    // Always release freeze, even on error.
+    rerand_store::release_rerand_freeze(&iris_store.pool).await?;
+
+    let hawk_actor = match frozen_result? {
+        None => return Ok(()),
+        Some(r) => r?,
+    };
 
     background_tasks.check_tasks();
 
diff --git a/iris-mpc/src/services/processors/batch.rs b/iris-mpc/src/services/processors/batch.rs
index b3d159d0eb..61ce29ba3b 100644
--- a/iris-mpc/src/services/processors/batch.rs
+++ b/iris-mpc/src/services/processors/batch.rs
@@ -401,8 +401,6 @@ impl<'a> BatchProcessor<'a> {
             .string_value()
             .ok_or(ReceiveRequestError::NoMessageTypeAttribute)?;
 
-        self.delete_message(&sqs_message).await?;
-
         let res = match request_type {
             IDENTITY_DELETION_MESSAGE_TYPE => {
                 self.process_identity_deletion(&message, batch_metadata)
@@ -417,6 +415,8 @@ impl<'a> BatchProcessor<'a> {
                 if !self.config.hawk_server_recovery_enabled {
                     metrics::counter!("request.skipped", "type" => "recovery_check").increment(1);
                     tracing::warn!("Recovery checks are disabled, skipping recovery check request");
+                    self.delete_message(&sqs_message).await?;
+                    self.msg_counter += 1;
                     return Ok(());
                 }
                 self.process_identity_match_check_request(
@@ -430,6 +430,8 @@ impl<'a> BatchProcessor<'a> {
                 if !self.config.hawk_server_resets_enabled {
                     metrics::counter!("request.skipped", "type" => "reset_check").increment(1);
                     tracing::warn!("Resets are disabled, skipping reset request");
+                    self.delete_message(&sqs_message).await?;
+                    self.msg_counter += 1;
                     return Ok(());
                 }
                 self.process_identity_match_check_request(
@@ -445,12 +447,19 @@ impl<'a> BatchProcessor<'a> {
             }
             _ => {
                 tracing::error!("Error: {}", ReceiveRequestError::InvalidMessageType);
-                Ok(())
+                self.delete_message(&sqs_message).await?;
+                self.msg_counter += 1;
+                return Ok(());
             }
         };
 
+        // Only delete from SQS after the message has been successfully
+        // processed and the modification row is durably persisted. If we
+        // crash before this point, SQS will redeliver the message.
+        res?;
+        self.delete_message(&sqs_message).await?;
         self.msg_counter += 1;
-        res
+        Ok(())
     }
 
     async fn process_identity_deletion(
diff --git a/iris-mpc/src/services/processors/job.rs b/iris-mpc/src/services/processors/job.rs
index 333d6091bd..6ddb61df46 100644
--- a/iris-mpc/src/services/processors/job.rs
+++ b/iris-mpc/src/services/processors/job.rs
@@ -290,6 +290,13 @@ pub async fn process_job_result(
     let persist_total_start = Instant::now();
     let mut iris_tx = store.tx().await?;
 
+    if !config.disable_persistence {
+        sqlx::query("SELECT pg_advisory_xact_lock($1)")
+            .bind(iris_mpc_store::rerand::RERAND_MODIFY_LOCK)
+            .execute(&mut *iris_tx)
+            .await?;
+    }
+
     if !codes_and_masks.is_empty() && !config.disable_persistence {
         let step_start = Instant::now();
         let db_serial_ids = store.insert_irises(&mut iris_tx, &codes_and_masks).await?;
diff --git a/iris-mpc/src/services/processors/modifications_sync.rs b/iris-mpc/src/services/processors/modifications_sync.rs
index cb3e742029..eafa9e990b 100644
--- a/iris-mpc/src/services/processors/modifications_sync.rs
+++ b/iris-mpc/src/services/processors/modifications_sync.rs
@@ -39,113 +39,170 @@ pub async fn sync_modifications(
     // Sort modifications in id order
     to_update.sort_by_key(|m| m.id);
 
-    // Update node_id for each modification and collect &refs
-    let to_update_refs: Vec<&Modification> = to_update
-        .iter_mut()
-        .map(|modification| {
-            if let Err(e) = modification.update_result_message_node_id(config.party_id) {
-                tracing::error!("Failed to update modification node_id: {:?}", e);
-            }
-            &*modification
-        })
-        .collect();
+    // Update node_id for each modification (mutable pass)
+    for modification in &mut to_update {
+        if let Err(e) = modification.update_result_message_node_id(config.party_id) {
+            tracing::error!("Failed to update modification node_id: {:?}", e);
+        }
+    }
 
-    let mut iris_tx = store.tx().await?;
+    // Prefetch shares in bounded batches before taking the modification lock.
+    // This avoids holding `RERAND_MODIFY_LOCK` across remote I/O while also
+    // preventing unbounded memory growth if `to_update` is large.
+    const PREFETCH_BATCH_SIZE: usize = 128;
 
-    // Acquire the modification lock to serialize with rerand apply.
-    // Uses xact lock so it auto-releases on commit/rollback.
-    sqlx::query("SELECT pg_advisory_xact_lock($1)")
-        .bind(iris_mpc_store::rerand::RERAND_MODIFY_LOCK)
-        .execute(&mut *iris_tx)
-        .await?;
+    let semaphore = Arc::new(Semaphore::new(MAX_CONCURRENT_REQUESTS));
 
-    // Persist changes into modifications table
-    store
-        .update_modifications(&mut iris_tx, &to_update_refs)
-        .await?;
-    store.delete_modifications(&mut iris_tx, &to_delete).await?;
+    // Apply deletions even if there are no updates.
+    if !to_delete.is_empty() {
+        let mut iris_tx = store.tx().await?;
+        store.delete_modifications(&mut iris_tx, &to_delete).await?;
+        iris_tx.commit().await?;
+    }
 
-    let semaphore = Arc::new(Semaphore::new(MAX_CONCURRENT_REQUESTS));
-    let mut graph_mutations = Vec::new();
-
-    // Persist changes into iris and graph tables
-    for modification in &to_update {
-        if !modification.persisted {
-            tracing::debug!(
-                "Skip writing non-persisted modification to iris table: {:?}",
-                modification
-            );
-            continue;
+    if to_update.is_empty() {
+        return Ok(());
+    }
+
+    // Ensure all modification rows exist locally before the update loop.
+    // Recovered modifications (completed on peers but missing locally) need
+    // to be inserted. We set persisted=false so the batch loop below fetches
+    // shares from S3 and writes them to iris; only then does
+    // update_modifications mark persisted=true after the iris write succeeds.
+    {
+        let mut tx = store.tx().await?;
+        for m in &to_update {
+            let mut staging = m.clone();
+            staging.persisted = false;
+            store
+                .upsert_recovered_modification(&mut tx, &staging)
+                .await?;
         }
+        tx.commit().await?;
+    }
 
-        tracing::warn!("Applying modification to local node: {:?}", modification);
-        metrics::counter!("db.modifications.rollforward").increment(1);
-
-        let (lc, lm, rc, rm) = match modification.request_type.as_str() {
-            IDENTITY_DELETION_MESSAGE_TYPE => (
-                dummy_shares_for_deletions.clone().0,
-                dummy_shares_for_deletions.clone().1,
-                dummy_shares_for_deletions.clone().0,
-                dummy_shares_for_deletions.clone().1,
-            ),
-            REAUTH_MESSAGE_TYPE | RESET_UPDATE_MESSAGE_TYPE | UNIQUENESS_MESSAGE_TYPE => {
-                let (left_shares, right_shares) = get_iris_shares_parse_task(
-                    config.party_id,
-                    shares_encryption_key_pair.clone(),
-                    Arc::clone(&semaphore),
-                    aws_clients.s3_client.clone(),
-                    config.shares_bucket_name.clone(),
-                    modification.clone().s3_url.unwrap(),
-                )?
-                .await?
-                .unwrap();
-                (
-                    left_shares.code,
-                    left_shares.mask,
-                    right_shares.code,
-                    right_shares.mask,
-                )
-            }
-            _ => {
-                panic!("Unknown modification type: {:?}", modification);
+    for batch in to_update.chunks(PREFETCH_BATCH_SIZE) {
+        struct PrefetchedShareData {
+            serial_id: i64,
+            left_code: Vec<u16>,
+            left_mask: Vec<u16>,
+            right_code: Vec<u16>,
+            right_mask: Vec<u16>,
+        }
+
+        // Kick off S3 fetches concurrently for this batch.
+        let fetched: Vec<Option<PrefetchedShareData>> = futures::future::try_join_all(
+            batch.iter().map(|modification| {
+                let semaphore = Arc::clone(&semaphore);
+                let s3_client = aws_clients.s3_client.clone();
+                let bucket_name = config.shares_bucket_name.clone();
+                let party_id = config.party_id;
+                let shares_encryption_key_pair = shares_encryption_key_pair.clone();
+                let dummy_shares_for_deletions = dummy_shares_for_deletions.clone();
+                async move {
+                    if !modification.persisted {
+                        return Ok::<_, Report>(None);
+                    }
+
+                    tracing::warn!("Applying modification to local node: {:?}", modification);
+                    metrics::counter!("db.modifications.rollforward").increment(1);
+
+                    let serial_id = modification.serial_id.ok_or_else(|| {
+                        eyre!("Modification has no serial_id: {:?}", modification)
+                    })?;
+
+                    let (left_code, left_mask, right_code, right_mask) =
+                        match modification.request_type.as_str() {
+                            IDENTITY_DELETION_MESSAGE_TYPE => (
+                                dummy_shares_for_deletions.0.coefs.to_vec(),
+                                dummy_shares_for_deletions.1.coefs.to_vec(),
+                                dummy_shares_for_deletions.0.coefs.to_vec(),
+                                dummy_shares_for_deletions.1.coefs.to_vec(),
+                            ),
+                        REAUTH_MESSAGE_TYPE | RESET_UPDATE_MESSAGE_TYPE | UNIQUENESS_MESSAGE_TYPE => {
+                            let s3_url = modification.s3_url.clone().ok_or_else(|| {
+                                eyre!("Persisted modification missing s3_url: {:?}", modification)
+                            })?;
+                                let (left_shares, right_shares) = get_iris_shares_parse_task(
+                                    party_id,
+                                    shares_encryption_key_pair.clone(),
+                                    Arc::clone(&semaphore),
+                                    s3_client.clone(),
+                                    bucket_name.clone(),
+                                    s3_url,
+                                )?
+                                .await??;
+                                (
+                                    left_shares.code.coefs.to_vec(),
+                                    left_shares.mask.coefs.to_vec(),
+                                    right_shares.code.coefs.to_vec(),
+                                    right_shares.mask.coefs.to_vec(),
+                                )
+                            }
+                            _ => {
+                            return Err(eyre!("Unknown modification type: {:?}", modification));
+                            }
+                        };
+
+                    Ok(Some(PrefetchedShareData {
+                        serial_id,
+                        left_code,
+                        left_mask,
+                        right_code,
+                        right_mask,
+                    }))
+                }
+            }),
+        )
+        .await?;
+
+        // Decode graph mutations (small) outside the DB lock window.
+        let mut batch_graph_mutations = Vec::new();
+        for modification in batch.iter().filter(|m| m.persisted) {
+            if let Some(serialized) = &modification.graph_mutation {
+                let single_mutation: SingleHawkMutation = bincode::deserialize::<SingleHawkMutation>(serialized)
+                    .map_err(|e| eyre!("Failed to deserialize SingleHawkMutation: {}", e))?;
+                batch_graph_mutations.push(single_mutation.clone());
             }
-        };
-
-        let iris_ref = StoredIrisRef {
-            id: modification
-                .serial_id
-                .ok_or_else(|| eyre!("Modification has no serial_id: {:?}", modification))?,
-            left_code: &lc.coefs,
-            left_mask: &lm.coefs,
-            right_code: &rc.coefs,
-            right_mask: &rm.coefs,
-        };
-
-        store
-            .insert_irises_overriding(&mut iris_tx, &[iris_ref])
+        }
+
+        // Now acquire the modification lock and write this batch atomically.
+        let mut iris_tx = store.tx().await?;
+        sqlx::query("SELECT pg_advisory_xact_lock($1)")
+            .bind(iris_mpc_store::rerand::RERAND_MODIFY_LOCK)
+            .execute(&mut *iris_tx)
             .await?;
 
-        if let Some(serialized) = &modification.graph_mutation {
-            let single_mutation: SingleHawkMutation =
-                bincode::deserialize::<SingleHawkMutation>(serialized)
-                    .expect("Failed to deserialize SingleHawkMutation");
-            graph_mutations.push(single_mutation.clone());
+        let batch_refs: Vec<&Modification> = batch.iter().collect();
+        store.update_modifications(&mut iris_tx, &batch_refs).await?;
+
+        let prefetched_rows: Vec<PrefetchedShareData> = fetched.into_iter().flatten().collect();
+        if !prefetched_rows.is_empty() {
+            let iris_refs: Vec<StoredIrisRef<'_>> = prefetched_rows
+                .iter()
+                .map(|row| StoredIrisRef {
+                    id: row.serial_id,
+                    left_code: &row.left_code,
+                    left_mask: &row.left_mask,
+                    right_code: &row.right_code,
+                    right_mask: &row.right_mask,
+                })
+                .collect();
+            store
+                .insert_irises_overriding(&mut iris_tx, &iris_refs)
+                .await?;
         }
-    }
 
-    if let Some(graph_store) = graph_store {
-        let mut graph_tx = graph_store.tx_wrap(iris_tx);
-        if !graph_mutations.is_empty() {
-            tracing::info!("Applying {} graph mutations", graph_mutations.len());
-            let hawk_mutation = HawkMutation(graph_mutations);
-            hawk_mutation.persist(&mut graph_tx).await?;
+        if let Some(graph_store) = graph_store {
+            let mut graph_tx = graph_store.tx_wrap(iris_tx);
+            if !batch_graph_mutations.is_empty() {
+                let hawk_mutation = HawkMutation(batch_graph_mutations);
+                hawk_mutation.persist(&mut graph_tx).await?;
+            }
+            graph_tx.tx.commit().await?;
         } else {
-            tracing::info!("No graph mutations to apply");
+            iris_tx.commit().await?;
         }
-        graph_tx.tx.commit().await?;
-    } else {
-        tracing::warn!("Graph store is not available, skipping graph mutations");
-        iris_tx.commit().await?;
     }
 
     Ok(())
diff --git a/migrations/20260226000004_create_rerand_control.down.sql b/migrations/20260226000004_create_rerand_control.down.sql
new file mode 100644
index 0000000000..831afe63fe
--- /dev/null
+++ b/migrations/20260226000004_create_rerand_control.down.sql
@@ -0,0 +1 @@
+DROP TABLE IF EXISTS rerand_control;
diff --git a/migrations/20260226000004_create_rerand_control.up.sql b/migrations/20260226000004_create_rerand_control.up.sql
new file mode 100644
index 0000000000..a970f19231
--- /dev/null
+++ b/migrations/20260226000004_create_rerand_control.up.sql
@@ -0,0 +1,8 @@
+CREATE TABLE IF NOT EXISTS rerand_control (
+    id              INTEGER PRIMARY KEY DEFAULT 1 CHECK (id = 1),
+    freeze_requested    BOOLEAN NOT NULL DEFAULT FALSE,
+    freeze_generation   TEXT,
+    frozen_generation   TEXT
+);
+
+INSERT INTO rerand_control (id) VALUES (1) ON CONFLICT DO NOTHING;

From 71b3dbd19cc1a031c0471e4feeec700d79522d01 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sat, 28 Feb 2026 23:35:56 +0100
Subject: [PATCH 11/76] clean up

---
 iris-mpc-bins/bin/iris-mpc/server.rs          |  5 +---
 iris-mpc-store/src/rerand.rs                  | 29 +++++++++----------
 iris-mpc-upgrade/src/s3_coordination.rs       |  7 -----
 iris-mpc/src/services/processors/job.rs       |  5 +---
 .../services/processors/modifications_sync.rs |  5 +---
 5 files changed, 16 insertions(+), 35 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 2de58dcd1e..76fd150475 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -1774,10 +1774,7 @@ async fn server_main(config: Config) -> Result<()> {
             let mut tx = store_bg.tx().await?;
 
             if !config_bg.disable_persistence {
-                sqlx::query("SELECT pg_advisory_xact_lock($1)")
-                    .bind(iris_mpc_store::rerand::RERAND_MODIFY_LOCK)
-                    .execute(&mut *tx)
-                    .await?;
+                iris_mpc_store::rerand::acquire_modify_lock(&mut tx).await?;
             }
 
             store_bg
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 137564d26b..d39d316d94 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -7,6 +7,18 @@ use sqlx::PgPool;
 pub const RERAND_APPLY_LOCK: i64 = 0x5245_5241_4E44;
 pub const RERAND_MODIFY_LOCK: i64 = 0x5245_4D4F_4446;
 
+/// Acquire `RERAND_MODIFY_LOCK` as a transaction-level advisory lock.
+/// Auto-released on commit/rollback.
+pub async fn acquire_modify_lock(
+    tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
+) -> Result<()> {
+    sqlx::query("SELECT pg_advisory_xact_lock($1)")
+        .bind(RERAND_MODIFY_LOCK)
+        .execute(&mut **tx)
+        .await?;
+    Ok(())
+}
+
 pub struct StagingIrisEntry {
     pub epoch: i32,
     pub id: i64,
@@ -168,10 +180,7 @@ pub async fn apply_confirmed_chunk(
     validate_identifier(staging_schema)?;
     let mut tx = pool.begin().await?;
 
-    sqlx::query("SELECT pg_advisory_xact_lock($1)")
-        .bind(RERAND_MODIFY_LOCK)
-        .execute(&mut *tx)
-        .await?;
+    acquire_modify_lock(&mut tx).await?;
     sqlx::query("SELECT pg_advisory_xact_lock($1)")
         .bind(RERAND_APPLY_LOCK)
         .execute(&mut *tx)
@@ -286,18 +295,6 @@ pub async fn get_rerand_progress(
     Ok(row)
 }
 
-/// Returns the highest chunk_id where all_confirmed = TRUE for a given epoch,
-/// or None if no chunks are confirmed.
-pub async fn get_max_confirmed_chunk(pool: &PgPool, epoch: i32) -> Result<Option<i32>> {
-    let row: (Option<i32>,) = sqlx::query_as(
-        "SELECT MAX(chunk_id) FROM rerand_progress WHERE epoch = $1 AND all_confirmed = TRUE",
-    )
-    .bind(epoch)
-    .fetch_one(pool)
-    .await?;
-    Ok(row.0)
-}
-
 /// Returns the highest `chunk_id` where `live_applied = TRUE` for a given
 /// epoch, or `None` if no chunks have been applied in that epoch yet.
 pub async fn get_max_applied_chunk_for_epoch(
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
index 1df60cebb5..747a69bba6 100644
--- a/iris-mpc-upgrade/src/s3_coordination.rs
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -16,13 +16,6 @@ pub struct Manifest {
 }
 
 impl Manifest {
-    pub fn num_chunks(&self) -> u32 {
-        if self.max_id_inclusive == 0 {
-            return 0;
-        }
-        self.max_id_inclusive.div_ceil(self.chunk_size) as u32
-    }
-
     /// Returns (start_id_inclusive, end_id_exclusive) for a given chunk_id.
     /// IDs are 1-based.
     pub fn chunk_range(&self, chunk_id: u32) -> (u64, u64) {
diff --git a/iris-mpc/src/services/processors/job.rs b/iris-mpc/src/services/processors/job.rs
index 6ddb61df46..64d0b095df 100644
--- a/iris-mpc/src/services/processors/job.rs
+++ b/iris-mpc/src/services/processors/job.rs
@@ -291,10 +291,7 @@ pub async fn process_job_result(
     let mut iris_tx = store.tx().await?;
 
     if !config.disable_persistence {
-        sqlx::query("SELECT pg_advisory_xact_lock($1)")
-            .bind(iris_mpc_store::rerand::RERAND_MODIFY_LOCK)
-            .execute(&mut *iris_tx)
-            .await?;
+        iris_mpc_store::rerand::acquire_modify_lock(&mut iris_tx).await?;
     }
 
     if !codes_and_masks.is_empty() && !config.disable_persistence {
diff --git a/iris-mpc/src/services/processors/modifications_sync.rs b/iris-mpc/src/services/processors/modifications_sync.rs
index eafa9e990b..3363a598eb 100644
--- a/iris-mpc/src/services/processors/modifications_sync.rs
+++ b/iris-mpc/src/services/processors/modifications_sync.rs
@@ -168,10 +168,7 @@ pub async fn sync_modifications(
 
         // Now acquire the modification lock and write this batch atomically.
         let mut iris_tx = store.tx().await?;
-        sqlx::query("SELECT pg_advisory_xact_lock($1)")
-            .bind(iris_mpc_store::rerand::RERAND_MODIFY_LOCK)
-            .execute(&mut *iris_tx)
-            .await?;
+        iris_mpc_store::rerand::acquire_modify_lock(&mut iris_tx).await?;
 
         let batch_refs: Vec<&Modification> = batch.iter().collect();
         store.update_modifications(&mut iris_tx, &batch_refs).await?;

From 7cf445daa317a310e9a5252afb9a3d078081a93c Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sat, 28 Feb 2026 23:47:48 +0100
Subject: [PATCH 12/76] simplify

---
 Cargo.lock                                    |   1 -
 iris-mpc/Cargo.toml                           |   1 -
 .../services/processors/modifications_sync.rs | 242 +++++++-----------
 3 files changed, 96 insertions(+), 148 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 03166ad567..c1e82f9a33 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2926,7 +2926,6 @@ dependencies = [
  "chrono",
  "clap",
  "eyre",
- "futures",
  "iris-mpc-common",
  "iris-mpc-cpu",
  "iris-mpc-store",
diff --git a/iris-mpc/Cargo.toml b/iris-mpc/Cargo.toml
index 70037d2e42..e1cd8e4524 100644
--- a/iris-mpc/Cargo.toml
+++ b/iris-mpc/Cargo.toml
@@ -27,7 +27,6 @@ iris-mpc-common = { path = "../iris-mpc-common" }
 iris-mpc-store = { path = "../iris-mpc-store" }
 itertools.workspace = true
 metrics.workspace = true
-futures.workspace = true
 serde = { version = "1.0.214", features = ["derive"] }
 iris-mpc-cpu.workspace = true
 chrono.workspace = true
diff --git a/iris-mpc/src/services/processors/modifications_sync.rs b/iris-mpc/src/services/processors/modifications_sync.rs
index 3363a598eb..841f513664 100644
--- a/iris-mpc/src/services/processors/modifications_sync.rs
+++ b/iris-mpc/src/services/processors/modifications_sync.rs
@@ -46,160 +46,112 @@ pub async fn sync_modifications(
         }
     }
 
-    // Prefetch shares in bounded batches before taking the modification lock.
-    // This avoids holding `RERAND_MODIFY_LOCK` across remote I/O while also
-    // preventing unbounded memory growth if `to_update` is large.
-    const PREFETCH_BATCH_SIZE: usize = 128;
-
-    let semaphore = Arc::new(Semaphore::new(MAX_CONCURRENT_REQUESTS));
-
-    // Apply deletions even if there are no updates.
-    if !to_delete.is_empty() {
-        let mut iris_tx = store.tx().await?;
-        store.delete_modifications(&mut iris_tx, &to_delete).await?;
-        iris_tx.commit().await?;
+    let mut iris_tx = store.tx().await?;
+
+    // Acquire the modification lock to serialize with rerand apply.
+    iris_mpc_store::rerand::acquire_modify_lock(&mut iris_tx).await?;
+
+    // Ensure recovered modification rows exist locally (completed on peers
+    // but missing here). Inserted with persisted=false so the loop below
+    // fetches shares and writes iris data before marking persisted=true.
+    for m in &to_update {
+        let mut staging = m.clone();
+        staging.persisted = false;
+        store
+            .upsert_recovered_modification(&mut iris_tx, &staging)
+            .await?;
     }
 
-    if to_update.is_empty() {
-        return Ok(());
-    }
-
-    // Ensure all modification rows exist locally before the update loop.
-    // Recovered modifications (completed on peers but missing locally) need
-    // to be inserted. We set persisted=false so the batch loop below fetches
-    // shares from S3 and writes them to iris; only then does
-    // update_modifications mark persisted=true after the iris write succeeds.
-    {
-        let mut tx = store.tx().await?;
-        for m in &to_update {
-            let mut staging = m.clone();
-            staging.persisted = false;
-            store
-                .upsert_recovered_modification(&mut tx, &staging)
-                .await?;
-        }
-        tx.commit().await?;
-    }
-
-    for batch in to_update.chunks(PREFETCH_BATCH_SIZE) {
-        struct PrefetchedShareData {
-            serial_id: i64,
-            left_code: Vec<u16>,
-            left_mask: Vec<u16>,
-            right_code: Vec<u16>,
-            right_mask: Vec<u16>,
-        }
-
-        // Kick off S3 fetches concurrently for this batch.
-        let fetched: Vec<Option<PrefetchedShareData>> = futures::future::try_join_all(
-            batch.iter().map(|modification| {
-                let semaphore = Arc::clone(&semaphore);
-                let s3_client = aws_clients.s3_client.clone();
-                let bucket_name = config.shares_bucket_name.clone();
-                let party_id = config.party_id;
-                let shares_encryption_key_pair = shares_encryption_key_pair.clone();
-                let dummy_shares_for_deletions = dummy_shares_for_deletions.clone();
-                async move {
-                    if !modification.persisted {
-                        return Ok::<_, Report>(None);
-                    }
-
-                    tracing::warn!("Applying modification to local node: {:?}", modification);
-                    metrics::counter!("db.modifications.rollforward").increment(1);
-
-                    let serial_id = modification.serial_id.ok_or_else(|| {
-                        eyre!("Modification has no serial_id: {:?}", modification)
-                    })?;
-
-                    let (left_code, left_mask, right_code, right_mask) =
-                        match modification.request_type.as_str() {
-                            IDENTITY_DELETION_MESSAGE_TYPE => (
-                                dummy_shares_for_deletions.0.coefs.to_vec(),
-                                dummy_shares_for_deletions.1.coefs.to_vec(),
-                                dummy_shares_for_deletions.0.coefs.to_vec(),
-                                dummy_shares_for_deletions.1.coefs.to_vec(),
-                            ),
-                        REAUTH_MESSAGE_TYPE | RESET_UPDATE_MESSAGE_TYPE | UNIQUENESS_MESSAGE_TYPE => {
-                            let s3_url = modification.s3_url.clone().ok_or_else(|| {
-                                eyre!("Persisted modification missing s3_url: {:?}", modification)
-                            })?;
-                                let (left_shares, right_shares) = get_iris_shares_parse_task(
-                                    party_id,
-                                    shares_encryption_key_pair.clone(),
-                                    Arc::clone(&semaphore),
-                                    s3_client.clone(),
-                                    bucket_name.clone(),
-                                    s3_url,
-                                )?
-                                .await??;
-                                (
-                                    left_shares.code.coefs.to_vec(),
-                                    left_shares.mask.coefs.to_vec(),
-                                    right_shares.code.coefs.to_vec(),
-                                    right_shares.mask.coefs.to_vec(),
-                                )
-                            }
-                            _ => {
-                            return Err(eyre!("Unknown modification type: {:?}", modification));
-                            }
-                        };
-
-                    Ok(Some(PrefetchedShareData {
-                        serial_id,
-                        left_code,
-                        left_mask,
-                        right_code,
-                        right_mask,
-                    }))
-                }
-            }),
-        )
+    // Persist changes into modifications table
+    let to_update_refs: Vec<&Modification> = to_update.iter().collect();
+    store
+        .update_modifications(&mut iris_tx, &to_update_refs)
         .await?;
+    store.delete_modifications(&mut iris_tx, &to_delete).await?;
 
-        // Decode graph mutations (small) outside the DB lock window.
-        let mut batch_graph_mutations = Vec::new();
-        for modification in batch.iter().filter(|m| m.persisted) {
-            if let Some(serialized) = &modification.graph_mutation {
-                let single_mutation: SingleHawkMutation = bincode::deserialize::<SingleHawkMutation>(serialized)
-                    .map_err(|e| eyre!("Failed to deserialize SingleHawkMutation: {}", e))?;
-                batch_graph_mutations.push(single_mutation.clone());
-            }
+    let semaphore = Arc::new(Semaphore::new(MAX_CONCURRENT_REQUESTS));
+    let mut graph_mutations = Vec::new();
+
+    // Persist changes into iris and graph tables
+    for modification in &to_update {
+        if !modification.persisted {
+            tracing::debug!(
+                "Skip writing non-persisted modification to iris table: {:?}",
+                modification
+            );
+            continue;
         }
 
-        // Now acquire the modification lock and write this batch atomically.
-        let mut iris_tx = store.tx().await?;
-        iris_mpc_store::rerand::acquire_modify_lock(&mut iris_tx).await?;
-
-        let batch_refs: Vec<&Modification> = batch.iter().collect();
-        store.update_modifications(&mut iris_tx, &batch_refs).await?;
-
-        let prefetched_rows: Vec<PrefetchedShareData> = fetched.into_iter().flatten().collect();
-        if !prefetched_rows.is_empty() {
-            let iris_refs: Vec<StoredIrisRef<'_>> = prefetched_rows
-                .iter()
-                .map(|row| StoredIrisRef {
-                    id: row.serial_id,
-                    left_code: &row.left_code,
-                    left_mask: &row.left_mask,
-                    right_code: &row.right_code,
-                    right_mask: &row.right_mask,
-                })
-                .collect();
-            store
-                .insert_irises_overriding(&mut iris_tx, &iris_refs)
-                .await?;
+        tracing::warn!("Applying modification to local node: {:?}", modification);
+        metrics::counter!("db.modifications.rollforward").increment(1);
+
+        let (lc, lm, rc, rm) = match modification.request_type.as_str() {
+            IDENTITY_DELETION_MESSAGE_TYPE => (
+                dummy_shares_for_deletions.clone().0,
+                dummy_shares_for_deletions.clone().1,
+                dummy_shares_for_deletions.clone().0,
+                dummy_shares_for_deletions.clone().1,
+            ),
+            REAUTH_MESSAGE_TYPE | RESET_UPDATE_MESSAGE_TYPE | UNIQUENESS_MESSAGE_TYPE => {
+                let s3_url = modification.s3_url.clone().ok_or_else(|| {
+                    eyre!("Persisted modification missing s3_url: {:?}", modification)
+                })?;
+                let (left_shares, right_shares) = get_iris_shares_parse_task(
+                    config.party_id,
+                    shares_encryption_key_pair.clone(),
+                    Arc::clone(&semaphore),
+                    aws_clients.s3_client.clone(),
+                    config.shares_bucket_name.clone(),
+                    s3_url,
+                )?
+                .await??;
+                (
+                    left_shares.code,
+                    left_shares.mask,
+                    right_shares.code,
+                    right_shares.mask,
+                )
+            }
+            _ => {
+                return Err(eyre!("Unknown modification type: {:?}", modification));
+            }
+        };
+
+        let iris_ref = StoredIrisRef {
+            id: modification
+                .serial_id
+                .ok_or_else(|| eyre!("Modification has no serial_id: {:?}", modification))?,
+            left_code: &lc.coefs,
+            left_mask: &lm.coefs,
+            right_code: &rc.coefs,
+            right_mask: &rm.coefs,
+        };
+
+        store
+            .insert_irises_overriding(&mut iris_tx, &[iris_ref])
+            .await?;
+
+        if let Some(serialized) = &modification.graph_mutation {
+            let single_mutation: SingleHawkMutation =
+                bincode::deserialize::<SingleHawkMutation>(serialized)
+                    .map_err(|e| eyre!("Failed to deserialize SingleHawkMutation: {}", e))?;
+            graph_mutations.push(single_mutation.clone());
         }
+    }
 
-        if let Some(graph_store) = graph_store {
-            let mut graph_tx = graph_store.tx_wrap(iris_tx);
-            if !batch_graph_mutations.is_empty() {
-                let hawk_mutation = HawkMutation(batch_graph_mutations);
-                hawk_mutation.persist(&mut graph_tx).await?;
-            }
-            graph_tx.tx.commit().await?;
+    if let Some(graph_store) = graph_store {
+        let mut graph_tx = graph_store.tx_wrap(iris_tx);
+        if !graph_mutations.is_empty() {
+            tracing::info!("Applying {} graph mutations", graph_mutations.len());
+            let hawk_mutation = HawkMutation(graph_mutations);
+            hawk_mutation.persist(&mut graph_tx).await?;
         } else {
-            iris_tx.commit().await?;
+            tracing::info!("No graph mutations to apply");
         }
+        graph_tx.tx.commit().await?;
+    } else {
+        tracing::warn!("Graph store is not available, skipping graph mutations");
+        iris_tx.commit().await?;
     }
 
     Ok(())
@@ -222,7 +174,6 @@ pub async fn send_last_modifications_to_sns(
     let recovery_check_message_attributes =
         create_message_type_attribute_map(RECOVERY_CHECK_MESSAGE_TYPE);
 
-    // Fetch the last modifications from the database
     let last_modifications = store.last_modifications(lookback).await?;
     tracing::info!(
         "Replaying last {} modification results to SNS",
@@ -234,7 +185,6 @@ pub async fn send_last_modifications_to_sns(
         return Ok(());
     }
 
-    // Collect messages by type
     let mut deletion_messages = Vec::new();
     let mut reauth_messages = Vec::new();
     let mut reset_update_messages = Vec::new();

From d7e02c9657ee35be7ff99d2e23a3120c02210781 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sun, 1 Mar 2026 00:05:30 +0100
Subject: [PATCH 13/76] update e2e

---
 .../tests/continuous_rerand_e2e.rs            | 242 +++++++++++++++++-
 iris-mpc-upgrade/tests/test_utils.rs          |  72 ++++++
 2 files changed, 310 insertions(+), 4 deletions(-)

diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index 1bc49ff8a3..08fb570394 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -4,6 +4,10 @@ mod test_utils;
 
 use eyre::Result;
 use std::sync::Mutex;
+use iris_mpc_store::rerand as rerand_store;
+use serde_json::json;
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+use tokio::net::TcpListener;
 use test_utils::*;
 
 const STACK_SIZE: usize = 16 * 1024 * 1024;
@@ -33,6 +37,97 @@ fn run_async(f: impl std::future::Future<Output = Result<()>> + Send + 'static)
     result.unwrap();
 }
 
+async fn set_live_applied_chunk(pool: &sqlx::PgPool, epoch: i32, max_chunk: i32) -> Result<()> {
+    for chunk in 0..=max_chunk {
+        rerand_store::upsert_rerand_progress(pool, epoch, chunk).await?;
+        sqlx::query(
+            "UPDATE rerand_progress SET live_applied = TRUE WHERE epoch = $1 AND chunk_id = $2",
+        )
+        .bind(epoch)
+        .bind(chunk)
+        .execute(pool)
+        .await?;
+    }
+    Ok(())
+}
+
+fn spawn_checking_worker(pool: sqlx::PgPool) -> tokio::task::JoinHandle<()> {
+    tokio::spawn(async move {
+        loop {
+            match rerand_store::check_and_handle_freeze(&pool, None).await {
+                Ok(true) => {}
+                Ok(false) | Err(_) => break,
+            }
+        }
+    })
+}
+
+async fn simulate_server_startup_with_freeze(
+    pool: &sqlx::PgPool,
+    peer_addrs: &[(&str, usize)],
+) -> Result<()> {
+    rerand_store::freeze_and_verify_watermarks(pool, peer_addrs).await?;
+
+    // Mimic startup DB load behind apply lock.
+    let startup_lock = rerand_store::acquire_apply_lock(pool).await?;
+    let _: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM irises")
+        .fetch_one(pool)
+        .await?;
+    rerand_store::release_apply_lock(startup_lock).await?;
+
+    Ok(())
+}
+
+async fn start_peer_watermark_server(
+    pool: &sqlx::PgPool,
+) -> Result<(usize, tokio::task::JoinHandle<()>)> {
+    let listener = TcpListener::bind("127.0.0.1:0").await?;
+    let port = listener.local_addr()?.port() as usize;
+    let pool = pool.clone();
+    let handle = tokio::spawn(async move {
+        loop {
+            let (mut socket, _) = match listener.accept().await {
+                Ok(value) => value,
+                Err(_) => return,
+            };
+
+            let pool = pool.clone();
+            tokio::spawn(async move {
+                let mut buf = vec![0u8; 2048];
+                let _ = socket.read(&mut buf).await;
+
+                let wm = match rerand_store::get_applied_watermark_from_pool(&pool).await {
+                    Ok(Some((epoch, chunk_id))) => json!({
+                        "epoch": epoch,
+                        "max_applied_chunk": chunk_id,
+                    })
+                    .to_string(),
+                    Ok(None) => "null".to_string(),
+                    Err(e) => {
+                        let body = format!("{{\"error\":\"{}\"}}", e);
+                        let response = format!(
+                            "HTTP/1.1 500 Internal Server Error\r\ncontent-type: application/json\r\ncontent-length: {}\r\n\r\n{}",
+                            body.len(),
+                            body
+                        );
+                        let _ = socket.write_all(response.as_bytes()).await;
+                        return;
+                    }
+                };
+
+                let response = format!(
+                    "HTTP/1.1 200 OK\r\ncontent-type: application/json\r\ncontent-length: {}\r\n\r\n{}",
+                    wm.len(),
+                    wm
+                );
+                let _ = socket.write_all(response.as_bytes()).await;
+            });
+        }
+    });
+
+    Ok((port, handle))
+}
+
 // ============================================================================
 // Phase 1: Clean epoch -- run one full epoch, verify crypto correctness
 // ============================================================================
@@ -276,7 +371,7 @@ fn phase7_startup_validation() {
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (2, 0, TRUE, TRUE, TRUE)")
             .execute(&env.harness.parties[0].store.pool).await.unwrap();
 
-        let r_fatal = simulate_server_startup(&env.harness, 1).await;
+        let r_fatal = simulate_server_startup_with_rerand_validation(&env.harness, 1).await;
         assert!(r_fatal.is_err(), "Fatal epoch gap should bail immediately");
 
         // In-sync → startup succeeds immediately
@@ -290,7 +385,7 @@ fn phase7_startup_validation() {
                 .execute(pool).await.unwrap();
         }
 
-        let r_ok = simulate_server_startup(&env.harness, 0).await;
+        let r_ok = simulate_server_startup_with_rerand_validation(&env.harness, 0).await;
         assert!(r_ok.is_ok(), "In-sync startup should succeed");
 
         println!("[phase 7] PASSED");
@@ -325,7 +420,7 @@ fn phase8_reject_desync() {
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (2, 0, TRUE, TRUE, FALSE)")
 .execute(&env.harness.parties[2].store.pool).await.unwrap();
 
-        let r1 = simulate_server_startup(&env.harness, 1).await;
+        let r1 = simulate_server_startup_with_rerand_validation(&env.harness, 1).await;
         assert!(
             r1.is_err(),
             "P1 startup should have failed due to large epoch gap"
@@ -347,7 +442,7 @@ fn phase8_reject_desync() {
         sqlx::query("INSERT INTO rerand_progress (epoch, chunk_id, staging_written, all_confirmed, live_applied) VALUES (3, 2, TRUE, TRUE, FALSE)")
 .execute(&env.harness.parties[0].store.pool).await.unwrap();
 
-        let r1_chunk_desync = simulate_server_startup(&env.harness, 1).await;
+        let r1_chunk_desync = simulate_server_startup_with_rerand_validation(&env.harness, 1).await;
         assert!(
             r1_chunk_desync.is_err(),
             "P1 startup should have failed due to large chunk gap"
@@ -422,3 +517,142 @@ fn phase9_asymmetric_modification_consistency() {
         env.teardown().await
     });
 }
+
+// ============================================================================
+// Phase 10: Startup freeze catchup path — local party is behind peers and
+// advances while freeze is released and re-acquired.
+// ============================================================================
+
+#[test]
+fn phase10_startup_freeze_local_catchup() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 10] Startup freeze catchup...");
+
+        let p0_pool = &env.harness.parties[0].store.pool;
+        let p1_pool = &env.harness.parties[1].store.pool;
+        let p2_pool = &env.harness.parties[2].store.pool;
+
+        // Local is behind peers in this epoch.
+        set_live_applied_chunk(p0_pool, 0, 0).await?;
+        set_live_applied_chunk(p1_pool, 0, 4).await?;
+        set_live_applied_chunk(p2_pool, 0, 4).await?;
+
+        let (p1_port, p1_server) = start_peer_watermark_server(p1_pool).await?;
+        let (p2_port, p2_server) = start_peer_watermark_server(p2_pool).await?;
+        let worker = spawn_checking_worker(p0_pool.clone());
+
+        // Simulate a main-server startup sequence where this party releases freeze
+        // so catchup can happen, then re-enters freeze logic.
+        let catchup = tokio::spawn({
+            let p0_pool = p0_pool.clone();
+            async move {
+                loop {
+                    let (freeze_requested,): (bool,) =
+                        sqlx::query_as("SELECT freeze_requested FROM rerand_control WHERE id = 1")
+                            .fetch_one(&p0_pool)
+                            .await?;
+                    if freeze_requested {
+                        set_live_applied_chunk(&p0_pool, 0, 4).await?;
+                        return Ok::<_, eyre::Report>(());
+                    }
+                    tokio::time::sleep(std::time::Duration::from_millis(25)).await;
+                }
+            }
+        });
+
+        let startup = tokio::time::timeout(
+            std::time::Duration::from_secs(25),
+            simulate_server_startup_with_freeze(
+                p0_pool,
+                &[("127.0.0.1", p1_port), ("127.0.0.1", p2_port)],
+            ),
+        )
+        .await;
+        assert!(startup.is_ok(), "startup freeze converge timed out");
+        startup.unwrap()?;
+
+        assert_eq!(rerand_store::get_applied_watermark_from_pool(p0_pool).await?, Some((0, 4)));
+        rerand_store::release_rerand_freeze(p0_pool).await?;
+        catchup.await?.unwrap();
+
+        let control = sqlx::query_as::<_, (bool, Option<String>)>(
+            "SELECT freeze_requested, freeze_generation FROM rerand_control WHERE id = 1",
+        )
+        .fetch_one(p0_pool)
+        .await?;
+        assert!(!control.0, "freeze should be released after startup converge");
+        assert!(control.1.is_none(), "stale freeze generation should be cleared");
+
+        worker.abort();
+        p1_server.abort();
+        p2_server.abort();
+        env.teardown().await
+    });
+}
+
+// ============================================================================
+// Phase 11: Startup freeze wait path — local party is at max and peers catch up.
+// ============================================================================
+
+#[test]
+fn phase11_startup_freeze_waits_for_peers() {
+    run_async(async {
+        let _ = tracing_subscriber::fmt::try_init();
+        let env = TestEnv::setup().await?;
+        println!("[phase 11] Startup freeze peer catchup...");
+
+        let p0_pool = &env.harness.parties[0].store.pool;
+        let p1_pool = &env.harness.parties[1].store.pool;
+        let p2_pool = &env.harness.parties[2].store.pool;
+
+        // Local is fully caught up initially; peers lag at chunk 0.
+        set_live_applied_chunk(p0_pool, 0, 4).await?;
+        set_live_applied_chunk(p1_pool, 0, 0).await?;
+        set_live_applied_chunk(p2_pool, 0, 0).await?;
+
+        let (p1_port, p1_server) = start_peer_watermark_server(p1_pool).await?;
+        let (p2_port, p2_server) = start_peer_watermark_server(p2_pool).await?;
+        let worker = spawn_checking_worker(p0_pool.clone());
+
+        let advance_peers = tokio::spawn({
+            let p1_pool = p1_pool.clone();
+            let p2_pool = p2_pool.clone();
+            async move {
+                tokio::time::sleep(std::time::Duration::from_millis(300)).await;
+                set_live_applied_chunk(&p1_pool, 0, 4).await?;
+                set_live_applied_chunk(&p2_pool, 0, 4).await?;
+                Result::<(), eyre::Report>::Ok(())
+            }
+        });
+
+        let startup = tokio::time::timeout(
+            std::time::Duration::from_secs(25),
+            simulate_server_startup_with_freeze(
+                p0_pool,
+                &[("127.0.0.1", p1_port), ("127.0.0.1", p2_port)],
+            ),
+        )
+        .await;
+        assert!(startup.is_ok(), "startup freeze converge timed out");
+        startup.unwrap()?;
+
+        assert_eq!(rerand_store::get_applied_watermark_from_pool(p0_pool).await?, Some((0, 4)));
+        rerand_store::release_rerand_freeze(p0_pool).await?;
+        advance_peers.await??;
+
+        let control = sqlx::query_as::<_, (bool, Option<String>)>(
+            "SELECT freeze_requested, freeze_generation FROM rerand_control WHERE id = 1",
+        )
+        .fetch_one(p0_pool)
+        .await?;
+        assert!(!control.0, "freeze should be released after startup converge");
+        assert!(control.1.is_none(), "stale freeze generation should be cleared");
+
+        worker.abort();
+        p1_server.abort();
+        p2_server.abort();
+        env.teardown().await
+    });
+}
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index 7d794bc721..f946a717fa 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -420,6 +420,78 @@ pub async fn simulate_server_startup(harness: &TestHarness, party: usize) -> Res
     Ok(())
 }
 
+pub async fn simulate_server_startup_with_rerand_validation(
+    harness: &TestHarness,
+    party: usize,
+) -> Result<()> {
+    simulate_server_startup(harness, party).await?;
+    validate_rerand_startup_safety(harness).await
+}
+
+async fn validate_rerand_startup_safety(harness: &TestHarness) -> Result<()> {
+    let mut epochs = Vec::with_capacity(harness.parties.len());
+    let mut confirmed_chunks = Vec::with_capacity(harness.parties.len());
+
+    for party in &harness.parties {
+        let (epoch,): (Option<i32>,) =
+            sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
+                .fetch_one(&party.store.pool)
+                .await?;
+        let epoch = epoch.unwrap_or(0);
+
+        let (max_confirmed_chunk,): (Option<i32>,) =
+            sqlx::query_as(
+                "SELECT MAX(chunk_id) FROM rerand_progress WHERE epoch = $1 AND all_confirmed = TRUE",
+            )
+            .bind(epoch)
+            .fetch_one(&party.store.pool)
+            .await?;
+
+        epochs.push(epoch);
+        confirmed_chunks.push(max_confirmed_chunk.unwrap_or(-1));
+    }
+
+    let min_epoch = *epochs
+        .iter()
+        .min()
+        .ok_or_else(|| eyre::eyre!("No parties found for rerand startup validation"))?;
+    let max_epoch = *epochs
+        .iter()
+        .max()
+        .ok_or_else(|| eyre::eyre!("No parties found for rerand startup validation"))?;
+
+    if max_epoch - min_epoch > 1 {
+        eyre::bail!(
+            "Startup cannot proceed: rerand epoch gap is too large (min={}, max={}).",
+            min_epoch,
+            max_epoch
+        );
+    }
+
+    let max_epoch_parties: Vec<_> = epochs
+        .iter()
+        .zip(confirmed_chunks.iter())
+        .filter(|(e, _)| **e == max_epoch)
+        .map(|(_, c)| *c)
+        .collect();
+
+    if let (Some(min_chunk), Some(max_chunk)) = (
+        max_epoch_parties.iter().min().cloned(),
+        max_epoch_parties.iter().max().cloned(),
+    ) {
+        if max_chunk - min_chunk > 1 {
+            eyre::bail!(
+                "Startup cannot proceed: rerand confirmed-chunk gap is too large at epoch {} (min={}, max={}).",
+                max_epoch,
+                min_chunk,
+                max_chunk
+            );
+        }
+    }
+
+    Ok(())
+}
+
 pub async fn assert_consistent_rerand_epoch(
     harness: &TestHarness,
     skip_ids: &[i64],

From 53da85f4d2a804ec25f55703c60c50dde190fe1b Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sun, 1 Mar 2026 00:24:56 +0100
Subject: [PATCH 14/76] up spec

---
 docs/specs/rerandomization.md | 39 ++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index 0ff6a712d4..6c495a2ead 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -8,15 +8,15 @@ Key design decision: in-memory shares are less likely to be exfiltrated, so only
 
 ## Critical assumption: reliable modification delivery
 
-The correctness of this protocol depends on **every modification (reauth, deletion, reset) eventually arriving at every party via SQS**. This is a pre-existing system invariant — without it, the MPC shares diverge regardless of rerandomization. Rerandomization does not weaken this guarantee, but it does create a new transient inconsistency window (see [Post-staging modifications](#post-staging-modifications-transient-inconsistency)) that relies on modification delivery to self-correct.
+The correctness of this protocol depends on **every modification (reauth, deletion, reset) eventually arriving at every party via SQS**. This is a pre-existing system invariant — without it, the MPC shares diverge regardless of rerandomization. The prior system already depended on this assumption; this design makes the dependency explicit and continues to enforce the same safety boundary. Rerandomization does not weaken this guarantee, but it does create a new transient inconsistency window (see [Post-staging modifications](#post-staging-modifications-transient-inconsistency)) that relies on modification delivery to self-correct.
 
-Three mechanisms enforce this invariant:
+The protocol enforces this in two active mechanisms, with one residual coverage gap:
 
-1. **SQS delete after persist** — the SQS message is only deleted *after* the modification row is durably written to the DB. If the process crashes between receiving and persisting, SQS redelivers the message. This eliminates the window where a message could be lost between delete and persist.
+1. **SQS delete after persist** — the SQS message is only deleted *after* the modification row is durably written to the DB. If the process crashes between receiving and persisting, SQS redelivers the message. This eliminates the window where a message could be lost between delete and persist. This behavior is implemented in this branch; it is safer than main’s previous delete-before-process behavior.
 
-2. **Startup reconciliation recovers missing modifications** — `sync_modifications` compares modification state across all three parties. If a modification is completed on peers but missing locally (e.g., from a historical race before the delete-after-persist fix), it is now recovered from the peer's copy and applied locally. Large modification-ID drift across nodes is logged as an error but does not crash the process, allowing best-effort reconciliation.
+2. **Startup reconciliation recovers missing modifications** — `sync_modifications` compares modification state across all three parties. In this branch, `compare_modifications` was strengthened to emit missing completed rows, and `sync_modifications` now stages them with `upsert_recovered_modification` so they are fully replayed locally. This closes local startup drift paths that were only partially handled before and is linked with lock ordering around rerand apply/state freeze. It still fails closed on lookback overrun.
 
-3. **Remaining gap**: `sync_modifications` is a startup procedure, not a continuous background loop. A running node that permanently lost a modification (and never restarts) will stay inconsistent for the affected row until the next epoch re-randomizes it. Periodic rolling restarts or a future continuous reconciliation loop would close this gap entirely.
+3. **Residual gap**: `sync_modifications` is a startup procedure, not a continuous background loop. A running node that permanently lost a modification (and never restarts) will stay inconsistent for the affected row until the next epoch re-randomizes it. Periodic rolling restarts or a future continuous reconciliation loop would close this gap entirely.
 
 ## Architecture
 
@@ -231,7 +231,7 @@ At startup, before `load_iris_db`:
       - **All equal** → proceed to DB load.
       - **Local is behind max(peers)** → release the local freeze so the worker can catch up (apply the pending chunk), sleep briefly, then re-freeze and re-check from step (a).
       - **Local is at or ahead of max(peers)** → stay frozen and re-poll peers after a short sleep. The behind parties' startups will release their own freezes, letting their workers catch up.
-   e. This loop converges because: only behind parties release their freeze, leading parties stay frozen (can't advance), and the S3 barrier limits the gap to at most 1 chunk. Timeout after 2 minutes if convergence doesn't happen (indicates a stuck worker).
+   e. This loop converges by repeatedly releasing/re-freezing until all parties report matching `(epoch, max_applied_chunk)` watermarks. Timeout after 2 minutes if convergence doesn't happen (indicates a stuck worker). Only behind parties release their freeze, while at-max parties stay frozen and wait for peers.
 3. **New**: acquire `RERAND_APPLY_LOCK` on a dedicated connection (belt-and-suspenders with the freeze).
 4. **Existing**: `load_iris_db` — loads from live DB into GPU/HNSW memory. Both the freeze and the advisory lock are held, so the rerand server cannot apply new chunks.
 5. Release `RERAND_APPLY_LOCK`.
@@ -387,17 +387,22 @@ sequenceDiagram
     MS->>Peer2: GET /rerand-watermark
     Peer2-->>MS: {epoch: 3, max_applied_chunk: 7}
 
-    alt All watermarks equal
-        MS->>DB: pg_advisory_lock(APPLY_LOCK)
-        MS->>DB: load_iris_db (full DB snapshot into memory)
-        MS->>DB: pg_advisory_unlock(APPLY_LOCK)
-        MS->>DB: SET freeze_requested=FALSE
-        Note over RW: Poll sees freeze_requested=FALSE
-        RW->>RW: Resume chunk processing
-    else Watermark mismatch
-        MS->>DB: SET freeze_requested=FALSE
-        Note over RW: Resume chunk processing
-        Note over MS: ABORT startup (fail closed)
+    loop Convergence
+        alt All watermarks equal
+            MS->>DB: pg_advisory_lock(APPLY_LOCK)
+            MS->>DB: load_iris_db (full DB snapshot into memory)
+            MS->>DB: pg_advisory_unlock(APPLY_LOCK)
+            MS->>DB: SET freeze_requested=FALSE
+            Note over RW: Poll sees freeze_requested=FALSE
+            RW->>RW: Resume chunk processing
+            break
+        else Local behind max
+            MS->>DB: SET freeze_requested=FALSE
+            Note over RW: Resume to catch up
+            MS->>MS: sleep + re-freeze with new request
+        else Local at max, peers behind
+            MS->>MS: sleep briefly
+        end
     end
 ```
 

From 68a593cc5c7193100148254f1b9a011686ead49a Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sun, 1 Mar 2026 00:35:48 +0100
Subject: [PATCH 15/76] fmt + clippy

---
 iris-mpc-bins/bin/iris-mpc/server.rs          | 63 ++++++++++--------
 iris-mpc-common/src/helpers/sync.rs           |  6 +-
 iris-mpc-store/src/rerand.rs                  | 65 +++++++------------
 iris-mpc-upgrade/src/continuous_rerand.rs     |  4 +-
 iris-mpc-upgrade/src/s3_coordination.rs       | 20 +++---
 .../tests/continuous_rerand_e2e.rs            | 41 ++++++++----
 iris-mpc-upgrade/tests/test_utils.rs          | 20 +++---
 iris-mpc/src/server/mod.rs                    |  4 +-
 8 files changed, 112 insertions(+), 111 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 76fd150475..4b1fd15316 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -159,6 +159,24 @@ pub fn receive_batch_stream(
     rx
 }
 
+async fn delete_message_from_sqs(
+    client: &Client,
+    queue_url: &str,
+    sqs_message: &aws_sdk_sqs::types::Message,
+) -> Result<(), ReceiveRequestError> {
+    let receipt_handle = sqs_message.receipt_handle.as_deref().ok_or_else(|| {
+        ReceiveRequestError::FailedToMarkRequestAsDeleted(eyre!("Missing receipt handle"))
+    })?;
+    client
+        .delete_message()
+        .queue_url(queue_url)
+        .receipt_handle(receipt_handle)
+        .send()
+        .await
+        .map_err(ReceiveRequestError::from)?;
+    Ok(())
+}
+
 #[allow(clippy::too_many_arguments)]
 async fn receive_batch(
     party_id: usize,
@@ -242,7 +260,7 @@ async fn receive_batch(
                                 identity_deletion_request.serial_id,
                                 identity_deletion_request,
                             );
-                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                            delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                             continue;
                         }
                         let modification = store
@@ -252,7 +270,7 @@ async fn receive_batch(
                                 None,
                             )
                             .await?;
-                        client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                        delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                         batch_query.modifications.insert(
                             RequestSerialId(identity_deletion_request.serial_id),
                             modification,
@@ -296,7 +314,7 @@ async fn receive_batch(
                                 Some(uniqueness_request.s3_key.as_str()),
                             )
                             .await?;
-                        client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                        delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                         batch_query.modifications.insert(
                             RequestId(uniqueness_request.signup_id.clone()),
                             modification,
@@ -392,7 +410,7 @@ async fn receive_batch(
                                 "Received a reauth request with use_or_rule set to true, but LUC \
                                  is not enabled. Skipping request."
                             );
-                                client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                                delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                                 continue;
                             }
 
@@ -405,7 +423,7 @@ async fn receive_batch(
                                 reauth_request.serial_id,
                                 reauth_request,
                             );
-                                client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                                delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                                 continue;
                             }
 
@@ -418,7 +436,7 @@ async fn receive_batch(
                                     Some(reauth_request.s3_key.as_str()),
                                 )
                                 .await?;
-                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                            delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                             batch_query
                                 .modifications
                                 .insert(RequestSerialId(reauth_request.serial_id), modification);
@@ -476,7 +494,7 @@ async fn receive_batch(
                             handles.push(handle);
                         } else {
                             tracing::warn!("Reauth is disabled, skipping reauth request");
-                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                            delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                         }
                     }
 
@@ -491,10 +509,10 @@ async fn receive_batch(
                                 )
                             })?;
 
-                        if !is_enabled(&request_type, &config) {
+                        if !is_enabled(request_type, config) {
                             metrics::counter!("request.skipped", "type" => request_type.to_string()).increment(1);
                             tracing::warn!("{} is disabled, skipping request", request_type);
-                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                            delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                             continue;
                         }
                         metrics::counter!("request.received", "type" => request_type.to_string())
@@ -505,11 +523,11 @@ async fn receive_batch(
                         let modification = store
                             .insert_modification(
                                 None,
-                                &request_type,
+                                request_type,
                                 Some(identity_match_check_request.s3_key.as_str()),
                             )
                             .await?;
-                        client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                        delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                         batch_query.modifications.insert(
                             RequestId(identity_match_check_request.request_id.clone()),
                             modification,
@@ -524,7 +542,7 @@ async fn receive_batch(
                         batch_query.push_matching_request(
                             sns_message_id,
                             identity_match_check_request.request_id.clone(),
-                            &request_type,
+                            request_type,
                             batch_metadata,
                             vec![], // use AND rule for identity match check requests
                             false,  // skip_persistence is only used for uniqueness requests
@@ -600,7 +618,7 @@ async fn receive_batch(
                                 reset_update_request.serial_id,
                                 reset_update_request,
                             );
-                                client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                                delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                                 continue;
                             }
 
@@ -611,7 +629,7 @@ async fn receive_batch(
                                     Some(reset_update_request.s3_key.as_str()),
                                 )
                                 .await?;
-                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                            delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                             batch_query.modifications.insert(
                                 RequestSerialId(reset_update_request.serial_id),
                                 modification,
@@ -630,18 +648,12 @@ async fn receive_batch(
                             );
                         } else {
                             tracing::warn!("Reset is disabled, skipping reset update request");
-                            client.delete_message().queue_url(queue_url).receipt_handle(sqs_message.receipt_handle.unwrap()).send().await.map_err(ReceiveRequestError::from)?;
+                            delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                         }
                     }
 
                     _ => {
-                        client
-                            .delete_message()
-                            .queue_url(queue_url)
-                            .receipt_handle(sqs_message.receipt_handle.unwrap())
-                            .send()
-                            .await
-                            .map_err(ReceiveRequestError::from)?;
+                        delete_message_from_sqs(client, queue_url, &sqs_message).await?;
                         tracing::error!("Error: {}", ReceiveRequestError::InvalidMessageType);
                     }
                 }
@@ -1357,9 +1369,7 @@ async fn server_main(config: Config) -> Result<()> {
             .zip(server_coord_config.healthcheck_ports.iter())
             .enumerate()
             .filter(|(i, _)| *i != config.party_id)
-            .map(|(_, (h, p))| -> eyre::Result<_> {
-                Ok((h.as_str(), p.parse::<usize>()?))
-            })
+            .map(|(_, (h, p))| -> eyre::Result<_> { Ok((h.as_str(), p.parse::<usize>()?)) })
             .collect::<eyre::Result<Vec<_>>>()?;
         rerand_store::freeze_and_verify_watermarks(&store.pool, &peer_addrs).await?;
     }
@@ -1425,8 +1435,7 @@ async fn server_main(config: Config) -> Result<()> {
                                 "Initialize iris db: Loading from DB (parallelism: {})",
                                 parallelism
                             );
-                            let download_shutdown_handler =
-                                Arc::clone(&download_shutdown_handler);
+                            let download_shutdown_handler = Arc::clone(&download_shutdown_handler);
 
                             tokio::runtime::Handle::current().block_on(async {
                                 load_iris_db(
diff --git a/iris-mpc-common/src/helpers/sync.rs b/iris-mpc-common/src/helpers/sync.rs
index 78fe36ea49..0dde7ae2a3 100644
--- a/iris-mpc-common/src/helpers/sync.rs
+++ b/iris-mpc-common/src/helpers/sync.rs
@@ -254,11 +254,7 @@ impl SyncResult {
                     "Modification ID difference across nodes ({}) exceeds lookback ({}): {:?}. \
                      Min: {:?}, Max: {:?}. Cannot safely reconcile. \
                      Bump max_modifications_lookback or investigate drift.",
-                    mod_id_diff,
-                    lookback,
-                    completed_max_mod_ids,
-                    min_id,
-                    max_id
+                    mod_id_diff, lookback, completed_max_mod_ids, min_id, max_id
                 );
             }
         }
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index d39d316d94..d91b232c0b 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -9,9 +9,7 @@ pub const RERAND_MODIFY_LOCK: i64 = 0x5245_4D4F_4446;
 
 /// Acquire `RERAND_MODIFY_LOCK` as a transaction-level advisory lock.
 /// Auto-released on commit/rollback.
-pub async fn acquire_modify_lock(
-    tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
-) -> Result<()> {
+pub async fn acquire_modify_lock(tx: &mut sqlx::Transaction<'_, sqlx::Postgres>) -> Result<()> {
     sqlx::query("SELECT pg_advisory_xact_lock($1)")
         .bind(RERAND_MODIFY_LOCK)
         .execute(&mut **tx)
@@ -297,10 +295,7 @@ pub async fn get_rerand_progress(
 
 /// Returns the highest `chunk_id` where `live_applied = TRUE` for a given
 /// epoch, or `None` if no chunks have been applied in that epoch yet.
-pub async fn get_max_applied_chunk_for_epoch(
-    pool: &PgPool,
-    epoch: i32,
-) -> Result<Option<i32>> {
+pub async fn get_max_applied_chunk_for_epoch(pool: &PgPool, epoch: i32) -> Result<Option<i32>> {
     let row: (Option<i32>,) = sqlx::query_as(
         "SELECT MAX(chunk_id) FROM rerand_progress WHERE epoch = $1 AND live_applied = TRUE",
     )
@@ -321,15 +316,15 @@ pub async fn delete_staging_for_old_epochs(
         r#"DELETE FROM "{}".irises WHERE epoch < $1"#,
         staging_schema
     );
-    let result = sqlx::query(&sql)
-        .bind(current_epoch)
-        .execute(pool)
-        .await?;
+    let result = sqlx::query(&sql).bind(current_epoch).execute(pool).await?;
     Ok(result.rows_affected())
 }
 
 /// Delete rerand progress rows for epochs older than `current_epoch`.
-pub async fn delete_rerand_progress_for_old_epochs(pool: &PgPool, current_epoch: i32) -> Result<u64> {
+pub async fn delete_rerand_progress_for_old_epochs(
+    pool: &PgPool,
+    current_epoch: i32,
+) -> Result<u64> {
     let result = sqlx::query("DELETE FROM rerand_progress WHERE epoch < $1")
         .bind(current_epoch)
         .execute(pool)
@@ -363,7 +358,9 @@ pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<Option<RerandSyncS
             return Err(e);
         }
     };
-    let max_applied = get_max_applied_chunk_for_epoch(pool, epoch).await?.unwrap_or(-1);
+    let max_applied = get_max_applied_chunk_for_epoch(pool, epoch)
+        .await?
+        .unwrap_or(-1);
     Ok(Some(RerandSyncState {
         epoch,
         max_applied_chunk: max_applied,
@@ -384,7 +381,6 @@ fn is_undefined_table_sqlx(err: &sqlx::Error) -> bool {
     false
 }
 
-
 // ---------------------------------------------------------------------------
 // Freeze protocol: coordinated pause of the rerand worker during startup
 // ---------------------------------------------------------------------------
@@ -421,11 +417,10 @@ pub async fn request_rerand_freeze(pool: &PgPool) -> Result<Option<String>> {
 pub async fn wait_for_rerand_frozen(pool: &PgPool, generation: &str) -> Result<()> {
     let deadline = tokio::time::Instant::now() + FREEZE_TIMEOUT;
     loop {
-        let row: Option<(Option<String>,)> = sqlx::query_as(
-            "SELECT frozen_generation FROM rerand_control WHERE id = 1",
-        )
-        .fetch_optional(pool)
-        .await?;
+        let row: Option<(Option<String>,)> =
+            sqlx::query_as("SELECT frozen_generation FROM rerand_control WHERE id = 1")
+                .fetch_optional(pool)
+                .await?;
 
         if let Some((Some(frozen_gen),)) = row {
             if frozen_gen == generation {
@@ -468,7 +463,10 @@ pub async fn check_and_handle_freeze(
         return Ok(true);
     };
 
-    tracing::info!("Rerand freeze requested (generation={}), pausing...", generation);
+    tracing::info!(
+        "Rerand freeze requested (generation={}), pausing...",
+        generation
+    );
 
     // Acknowledge the freeze.
     sqlx::query("UPDATE rerand_control SET frozen_generation = $1 WHERE id = 1")
@@ -554,11 +552,9 @@ pub async fn acquire_apply_lock(pool: &PgPool) -> Result<Option<sqlx::PgConnecti
     let mut conn = pool.acquire().await?;
 
     // If rerand tables don't exist yet, skip.
-    match sqlx::query_as::<_, (i64,)>(
-        "SELECT COUNT(*) FROM rerand_progress LIMIT 1",
-    )
-    .fetch_one(&mut *conn)
-    .await
+    match sqlx::query_as::<_, (i64,)>("SELECT COUNT(*) FROM rerand_progress LIMIT 1")
+        .fetch_one(&mut *conn)
+        .await
     {
         Err(e) if is_undefined_table_sqlx(&e) => return Ok(None),
         Err(e) => return Err(e.into()),
@@ -617,11 +613,7 @@ async fn fetch_peer_watermark(host: &str, port: usize) -> Result<Option<(i32, i3
         .await
         .map_err(|e| eyre::eyre!("Failed to reach {} for watermark: {}", url, e))?;
     if !resp.status().is_success() {
-        eyre::bail!(
-            "Peer {} returned HTTP {} for watermark",
-            url,
-            resp.status()
-        );
+        eyre::bail!("Peer {} returned HTTP {} for watermark", url, resp.status());
     }
     let body = resp
         .text()
@@ -636,8 +628,7 @@ async fn fetch_peer_watermark(host: &str, port: usize) -> Result<Option<(i32, i3
     Ok(Some((
         v["epoch"]
             .as_i64()
-            .ok_or_else(|| eyre::eyre!("Missing epoch in watermark from {}", url))?
-            as i32,
+            .ok_or_else(|| eyre::eyre!("Missing epoch in watermark from {}", url))? as i32,
         v["max_applied_chunk"]
             .as_i64()
             .ok_or_else(|| eyre::eyre!("Missing max_applied_chunk in watermark from {}", url))?
@@ -653,10 +644,7 @@ async fn fetch_peer_watermark(host: &str, port: usize) -> Result<Option<(i32, i3
 /// Guarantees: when this returns `Ok(())`, the local worker is frozen and
 /// all parties have the same `(epoch, max_applied_chunk)`.
 /// On any error, the freeze is released before the error propagates.
-pub async fn freeze_and_verify_watermarks(
-    pool: &PgPool,
-    peers: &[(&str, usize)],
-) -> Result<()> {
+pub async fn freeze_and_verify_watermarks(pool: &PgPool, peers: &[(&str, usize)]) -> Result<()> {
     if peers.is_empty() {
         eyre::bail!("freeze_and_verify_watermarks called with no peers");
     }
@@ -674,10 +662,7 @@ pub async fn freeze_and_verify_watermarks(
     result
 }
 
-async fn freeze_and_verify_inner(
-    pool: &PgPool,
-    peers: &[(&str, usize)],
-) -> Result<()> {
+async fn freeze_and_verify_inner(pool: &PgPool, peers: &[(&str, usize)]) -> Result<()> {
     let deadline = tokio::time::Instant::now() + FREEZE_TIMEOUT;
 
     loop {
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 436ed1d54a..1f69cd0468 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -4,8 +4,8 @@ use bytemuck::cast_slice;
 use eyre::Result;
 use futures::StreamExt;
 use iris_mpc_store::rerand::{
-    apply_confirmed_chunk, check_and_handle_freeze, delete_staging_chunk,
-    delete_staging_for_old_epochs, delete_rerand_progress_for_old_epochs, get_current_epoch,
+    apply_confirmed_chunk, check_and_handle_freeze, delete_rerand_progress_for_old_epochs,
+    delete_staging_chunk, delete_staging_for_old_epochs, get_current_epoch,
     get_max_applied_chunk_for_epoch, get_rerand_progress, get_staging_version_map,
     insert_staging_irises, set_all_confirmed, set_staging_written, staging_schema_name,
     upsert_rerand_progress, StagingIrisEntry,
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
index 747a69bba6..a409436182 100644
--- a/iris-mpc-upgrade/src/s3_coordination.rs
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -20,7 +20,10 @@ impl Manifest {
     /// IDs are 1-based.
     pub fn chunk_range(&self, chunk_id: u32) -> (u64, u64) {
         let start = 1 + (chunk_id as u64) * self.chunk_size;
-        let end = std::cmp::min(start + self.chunk_size, self.max_id_inclusive.saturating_add(1));
+        let end = std::cmp::min(
+            start + self.chunk_size,
+            self.max_id_inclusive.saturating_add(1),
+        );
         (start, end)
     }
 
@@ -359,14 +362,13 @@ pub async fn compute_cross_party_divergent_ids(
     );
 
     use std::collections::HashMap;
-    let all_maps: Vec<HashMap<i64, i16>> =
-        try_join_all((0..NUM_PARTIES).map(|party| {
-            download_chunk_version_map(s3, bucket, epoch, party, chunk_id, poll_interval)
-        }))
-        .await?
-        .into_iter()
-        .map(|v| v.into_iter().collect::<HashMap<_, _>>())
-        .collect();
+    let all_maps: Vec<HashMap<i64, i16>> = try_join_all((0..NUM_PARTIES).map(|party| {
+        download_chunk_version_map(s3, bucket, epoch, party, chunk_id, poll_interval)
+    }))
+    .await?
+    .into_iter()
+    .map(|v| v.into_iter().collect::<HashMap<_, _>>())
+    .collect();
 
     let mut divergent = Vec::new();
     let all_ids: std::collections::BTreeSet<i64> =
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index 08fb570394..e999c57b72 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -3,12 +3,12 @@
 mod test_utils;
 
 use eyre::Result;
-use std::sync::Mutex;
 use iris_mpc_store::rerand as rerand_store;
 use serde_json::json;
+use std::sync::Mutex;
+use test_utils::*;
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tokio::net::TcpListener;
-use test_utils::*;
 
 const STACK_SIZE: usize = 16 * 1024 * 1024;
 
@@ -53,12 +53,7 @@ async fn set_live_applied_chunk(pool: &sqlx::PgPool, epoch: i32, max_chunk: i32)
 
 fn spawn_checking_worker(pool: sqlx::PgPool) -> tokio::task::JoinHandle<()> {
     tokio::spawn(async move {
-        loop {
-            match rerand_store::check_and_handle_freeze(&pool, None).await {
-                Ok(true) => {}
-                Ok(false) | Err(_) => break,
-            }
-        }
+        while let Ok(true) = rerand_store::check_and_handle_freeze(&pool, None).await {}
     })
 }
 
@@ -573,7 +568,10 @@ fn phase10_startup_freeze_local_catchup() {
         assert!(startup.is_ok(), "startup freeze converge timed out");
         startup.unwrap()?;
 
-        assert_eq!(rerand_store::get_applied_watermark_from_pool(p0_pool).await?, Some((0, 4)));
+        assert_eq!(
+            rerand_store::get_applied_watermark_from_pool(p0_pool).await?,
+            Some((0, 4))
+        );
         rerand_store::release_rerand_freeze(p0_pool).await?;
         catchup.await?.unwrap();
 
@@ -582,8 +580,14 @@ fn phase10_startup_freeze_local_catchup() {
         )
         .fetch_one(p0_pool)
         .await?;
-        assert!(!control.0, "freeze should be released after startup converge");
-        assert!(control.1.is_none(), "stale freeze generation should be cleared");
+        assert!(
+            !control.0,
+            "freeze should be released after startup converge"
+        );
+        assert!(
+            control.1.is_none(),
+            "stale freeze generation should be cleared"
+        );
 
         worker.abort();
         p1_server.abort();
@@ -638,7 +642,10 @@ fn phase11_startup_freeze_waits_for_peers() {
         assert!(startup.is_ok(), "startup freeze converge timed out");
         startup.unwrap()?;
 
-        assert_eq!(rerand_store::get_applied_watermark_from_pool(p0_pool).await?, Some((0, 4)));
+        assert_eq!(
+            rerand_store::get_applied_watermark_from_pool(p0_pool).await?,
+            Some((0, 4))
+        );
         rerand_store::release_rerand_freeze(p0_pool).await?;
         advance_peers.await??;
 
@@ -647,8 +654,14 @@ fn phase11_startup_freeze_waits_for_peers() {
         )
         .fetch_one(p0_pool)
         .await?;
-        assert!(!control.0, "freeze should be released after startup converge");
-        assert!(control.1.is_none(), "stale freeze generation should be cleared");
+        assert!(
+            !control.0,
+            "freeze should be released after startup converge"
+        );
+        assert!(
+            control.1.is_none(),
+            "stale freeze generation should be cleared"
+        );
 
         worker.abort();
         p1_server.abort();
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index f946a717fa..e0fafbe979 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -433,19 +433,17 @@ async fn validate_rerand_startup_safety(harness: &TestHarness) -> Result<()> {
     let mut confirmed_chunks = Vec::with_capacity(harness.parties.len());
 
     for party in &harness.parties {
-        let (epoch,): (Option<i32>,) =
-            sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
-                .fetch_one(&party.store.pool)
-                .await?;
-        let epoch = epoch.unwrap_or(0);
-
-        let (max_confirmed_chunk,): (Option<i32>,) =
-            sqlx::query_as(
-                "SELECT MAX(chunk_id) FROM rerand_progress WHERE epoch = $1 AND all_confirmed = TRUE",
-            )
-            .bind(epoch)
+        let (epoch,): (Option<i32>,) = sqlx::query_as("SELECT MAX(epoch) FROM rerand_progress")
             .fetch_one(&party.store.pool)
             .await?;
+        let epoch = epoch.unwrap_or(0);
+
+        let (max_confirmed_chunk,): (Option<i32>,) = sqlx::query_as(
+            "SELECT MAX(chunk_id) FROM rerand_progress WHERE epoch = $1 AND all_confirmed = TRUE",
+        )
+        .bind(epoch)
+        .fetch_one(&party.store.pool)
+        .await?;
 
         epochs.push(epoch);
         confirmed_chunks.push(max_confirmed_chunk.unwrap_or(-1));
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index f42086d490..467f72556f 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -187,9 +187,7 @@ pub async fn server_main(config: Config) -> Result<()> {
             .zip(sc.healthcheck_ports.iter())
             .enumerate()
             .filter(|(i, _)| *i != config.party_id)
-            .map(|(_, (h, p))| -> eyre::Result<_> {
-                Ok((h.as_str(), p.parse::<usize>()?))
-            })
+            .map(|(_, (h, p))| -> eyre::Result<_> { Ok((h.as_str(), p.parse::<usize>()?)) })
             .collect::<eyre::Result<Vec<_>>>()?;
         rerand_store::freeze_and_verify_watermarks(&iris_store.pool, &peer_addrs).await?;
     }

From f0479f9a2434b63145d234bb4def6a5f5408d9aa Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sun, 1 Mar 2026 11:41:29 +0100
Subject: [PATCH 16/76] run in ci

---
 .../continuous-rerand-e2e-tests.yaml          | 60 +++++++++++++++++++
 .../iris-mpc-upgrade/run-rerand-e2e-tests.sh  | 17 ++++--
 .../tests/continuous_rerand_e2e.rs            | 11 ++++
 3 files changed, 84 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/continuous-rerand-e2e-tests.yaml

diff --git a/.github/workflows/continuous-rerand-e2e-tests.yaml b/.github/workflows/continuous-rerand-e2e-tests.yaml
new file mode 100644
index 0000000000..3446cd1f70
--- /dev/null
+++ b/.github/workflows/continuous-rerand-e2e-tests.yaml
@@ -0,0 +1,60 @@
+name: Continuous Rerand E2E Tests
+
+on:
+  pull_request:
+
+concurrency:
+  group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"
+  cancel-in-progress: true
+
+jobs:
+  rerand-e2e:
+    timeout-minutes: 30
+    runs-on:
+      labels: ubuntu-22.04-16core
+    permissions:
+      contents: read
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+
+      - name: Get all test, doc and src files that have changed
+        id: changed-files-yaml
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62
+        with:
+          files_yaml: |
+            src:
+              - Dockerfile*
+              - Cargo.lock
+              - Cargo.toml
+              - deny.toml
+              - rust-toolchain.toml
+              - iris-*/**
+              - iris-mpc-upgrade/**
+              - iris-mpc-store/**
+              - iris-mpc-common/**
+              - docs/specs/rerandomization.md
+              - migrations/**
+              - scripts/**
+              - iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
+              - iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml
+              - .github/workflows/continuous-rerand-e2e-tests.yaml
+
+      - name: Cache Rust build
+        if: steps.changed-files-yaml.outputs.src_any_changed == 'true'
+        uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb
+        id: cache-rust
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: rust-build-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            rust-build-${{ runner.os }}-
+
+      - name: Run rerandomization e2e tests
+        if: steps.changed-files-yaml.outputs.src_any_changed == 'true'
+        run: |
+          bash iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh b/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
index 167f2682ae..0cdf4e10bb 100755
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
@@ -13,15 +13,24 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
 COMPOSE_FILE="$SCRIPT_DIR/docker-compose.rand.yaml"
 
+if command -v docker-compose >/dev/null 2>&1; then
+    COMPOSE=(docker-compose)
+elif docker compose version >/dev/null 2>&1; then
+    COMPOSE=(docker compose)
+else
+    echo "Neither docker-compose nor docker compose is available."
+    exit 1
+fi
+
 cleanup() {
     echo "=== Tearing down containers ==="
-    docker-compose -f "$COMPOSE_FILE" down --remove-orphans -v 2>/dev/null || true
+    "${COMPOSE[@]}" -f "$COMPOSE_FILE" down --remove-orphans -v 2>/dev/null || true
 }
 trap cleanup EXIT
 
 echo "=== Starting Postgres + localstack ==="
-docker-compose -f "$COMPOSE_FILE" down --remove-orphans -v 2>/dev/null || true
-docker-compose -f "$COMPOSE_FILE" up -d
+"${COMPOSE[@]}" -f "$COMPOSE_FILE" down --remove-orphans -v 2>/dev/null || true
+"${COMPOSE[@]}" -f "$COMPOSE_FILE" up -d
 
 echo "Waiting for services to be ready..."
 for i in $(seq 1 30); do
@@ -48,6 +57,6 @@ AWS_SECRET_ACCESS_KEY=test \
 AWS_DEFAULT_REGION=us-east-1 \
 AWS_ENDPOINT_URL=http://127.0.0.1:4566 \
 ENVIRONMENT=testing \
-    cargo test -p iris-mpc-upgrade --test continuous_rerand_e2e --features db_dependent -- --nocapture
+    cargo test -p iris-mpc-upgrade --test continuous_rerand_e2e --features db_dependent -- --include-ignored --nocapture
 
 echo "=== All tests passed ==="
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index e999c57b72..2d916bc88e 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -128,6 +128,7 @@ async fn start_peer_watermark_server(
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase1_clean_epoch() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -152,6 +153,7 @@ fn phase1_clean_epoch() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase2_kill_and_resume() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -185,6 +187,7 @@ fn phase2_kill_and_resume() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase3_concurrent_modifications() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -247,6 +250,7 @@ fn phase3_concurrent_modifications() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase4_server_restart_during_rerand() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -279,6 +283,7 @@ fn phase4_server_restart_during_rerand() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase5_staggered_restart() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -319,6 +324,7 @@ fn phase5_staggered_restart() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase6_multiple_epochs() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -351,6 +357,7 @@ fn phase6_multiple_epochs() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase7_startup_validation() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -394,6 +401,7 @@ fn phase7_startup_validation() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase8_reject_desync() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -457,6 +465,7 @@ fn phase8_reject_desync() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase9_asymmetric_modification_consistency() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -519,6 +528,7 @@ fn phase9_asymmetric_modification_consistency() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase10_startup_freeze_local_catchup() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();
@@ -601,6 +611,7 @@ fn phase10_startup_freeze_local_catchup() {
 // ============================================================================
 
 #[test]
+#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
 fn phase11_startup_freeze_waits_for_peers() {
     run_async(async {
         let _ = tracing_subscriber::fmt::try_init();

From 4ccf85e307d5c36795d70aa8622ee44e1764bcd9 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sun, 1 Mar 2026 11:48:15 +0100
Subject: [PATCH 17/76] up

---
 .github/workflows/continuous-rerand-e2e-tests.yaml     | 10 ++++++++++
 .../bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh       |  6 ++++++
 2 files changed, 16 insertions(+)

diff --git a/.github/workflows/continuous-rerand-e2e-tests.yaml b/.github/workflows/continuous-rerand-e2e-tests.yaml
index 3446cd1f70..eeca3808a8 100644
--- a/.github/workflows/continuous-rerand-e2e-tests.yaml
+++ b/.github/workflows/continuous-rerand-e2e-tests.yaml
@@ -54,6 +54,16 @@ jobs:
           restore-keys: |
             rust-build-${{ runner.os }}-
 
+      - name: Install protobuf compiler
+        if: steps.changed-files-yaml.outputs.src_any_changed == 'true'
+        run: |
+          if command -v protoc > /dev/null; then
+            echo "protoc already installed: $(command -v protoc)"
+          else
+            sudo apt-get update
+            sudo apt-get install -y protobuf-compiler
+          fi
+
       - name: Run rerandomization e2e tests
         if: steps.changed-files-yaml.outputs.src_any_changed == 'true'
         run: |
diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh b/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
index 0cdf4e10bb..e85a7cbd07 100755
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/run-rerand-e2e-tests.sh
@@ -22,6 +22,12 @@ else
     exit 1
 fi
 
+if ! command -v protoc >/dev/null 2>&1; then
+    echo "protoc not found. Install protobuf compiler (protobuf-compiler) before running these tests."
+    echo "In GitHub Actions this workflow installs it automatically."
+    exit 1
+fi
+
 cleanup() {
     echo "=== Tearing down containers ==="
     "${COMPOSE[@]}" -f "$COMPOSE_FILE" down --remove-orphans -v 2>/dev/null || true

From dc94b35a4db57ebfaf21145173d37a4cfd5a3a72 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Sun, 1 Mar 2026 12:16:14 +0100
Subject: [PATCH 18/76] fix mermaid

---
 docs/specs/rerandomization.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index 6c495a2ead..6d029ab92b 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -395,7 +395,7 @@ sequenceDiagram
             MS->>DB: SET freeze_requested=FALSE
             Note over RW: Poll sees freeze_requested=FALSE
             RW->>RW: Resume chunk processing
-            break
+            Note over MS: Convergence reached; startup continues
         else Local behind max
             MS->>DB: SET freeze_requested=FALSE
             Note over RW: Resume to catch up
@@ -421,7 +421,7 @@ sequenceDiagram
     RW->>DB: SET frozen_generation=G1
     Note over RW: Blocked in freeze loop
 
-    MS1-xMS1: CRASH (freeze_requested still TRUE)
+    MS1->>MS1: CRASH (freeze_requested still TRUE)
 
     Note over MS2: Restart
 

From 022d8f0d8a5f679288e5def3cfa4c75d19f41e72 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Mon, 2 Mar 2026 10:58:48 +0100
Subject: [PATCH 19/76] fix mermaid

---
 docs/specs/rerandomization.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index 6d029ab92b..9470a9e545 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -396,11 +396,13 @@ sequenceDiagram
             Note over RW: Poll sees freeze_requested=FALSE
             RW->>RW: Resume chunk processing
             Note over MS: Convergence reached; startup continues
-        else Local behind max
+        else
+            Note over MS: Local behind max
             MS->>DB: SET freeze_requested=FALSE
             Note over RW: Resume to catch up
             MS->>MS: sleep + re-freeze with new request
-        else Local at max, peers behind
+        else
+            Note over MS: Local at max, peers behind
             MS->>MS: sleep briefly
         end
     end
@@ -530,7 +532,7 @@ sequenceDiagram
     Note over MSA,MSC: All parties proceed with DB load
 ```
 
-### Post-staging modification: transient DB inconsistency
+### Post-staging modification: transient DB inconsistency (same as before)
 
 When a modification arrives at one party between staging and apply, but hasn't yet propagated to the others, the version_id CAS causes asymmetric application. The DB shares are temporarily inconsistent but self-correct when the modification propagates. In-memory shares (used for live queries) are unaffected.
 

From 3a4385911497247b72e6516a3be1554cbab39622 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Mon, 2 Mar 2026 11:00:45 +0100
Subject: [PATCH 20/76] more

---
 docs/specs/rerandomization.md | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/docs/specs/rerandomization.md b/docs/specs/rerandomization.md
index 9470a9e545..135721db99 100644
--- a/docs/specs/rerandomization.md
+++ b/docs/specs/rerandomization.md
@@ -341,7 +341,8 @@ sequenceDiagram
     P0->>S3: Download 3 version-map hashes
     alt All hashes match (fast path)
         Note over P0: staging_divergent = empty
-    else Hash mismatch (slow path)
+    else
+        Note over P0: Hash mismatch (slow path)
         P0->>S3: Download 3 full version maps
         Note over P0: staging_divergent = differing IDs
     end
@@ -393,17 +394,18 @@ sequenceDiagram
             MS->>DB: load_iris_db (full DB snapshot into memory)
             MS->>DB: pg_advisory_unlock(APPLY_LOCK)
             MS->>DB: SET freeze_requested=FALSE
-            Note over RW: Poll sees freeze_requested=FALSE
-            RW->>RW: Resume chunk processing
-            Note over MS: Convergence reached; startup continues
+            MS->>RW: Poll sees freeze_requested=FALSE
+            RW->>RW: Resume chunk processing (startup continues)
         else
-            Note over MS: Local behind max
-            MS->>DB: SET freeze_requested=FALSE
-            Note over RW: Resume to catch up
-            MS->>MS: sleep + re-freeze with new request
-        else
-            Note over MS: Local at max, peers behind
-            MS->>MS: sleep briefly
+            alt Local behind max
+                MS->>MS: Local behind max
+                MS->>DB: SET freeze_requested=FALSE
+                MS->>RW: Resume to catch up
+                MS->>MS: sleep + re-freeze with new request
+            else
+                Note over MS: Local at max, peers behind
+                MS->>MS: sleep briefly
+            end
         end
     end
 ```

From e78e96b51dca4140d5899db87b5b8de2af6635d8 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Wed, 4 Mar 2026 12:29:07 +0100
Subject: [PATCH 21/76] pr feedback

---
 Cargo.toml                                    |   2 +-
 iris-mpc-common/src/helpers/sync.rs           |   5 +-
 iris-mpc-store/src/rerand.rs                  |   6 +-
 iris-mpc-upgrade/src/continuous_rerand.rs     |  31 ++--
 iris-mpc-upgrade/src/epoch.rs                 |  12 +-
 iris-mpc-upgrade/src/s3_coordination.rs       |  31 +++-
 .../tests/continuous_rerand_e2e.rs            | 132 +++++++++++++++++-
 iris-mpc-upgrade/tests/test_utils.rs          |  38 ++++-
 iris-mpc/src/server/mod.rs                    |   3 +
 9 files changed, 225 insertions(+), 35 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index fe8e6ea1ae..93dba2155f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -77,7 +77,7 @@ thiserror = "1"
 tokio = { version = "=1.49", features = ["full", "rt-multi-thread"] }
 tokio-util = "0.7.15"
 toml = { version = "0.8.23", features = ["preserve_order"] }
-uuid = { version = "1", features = ["v4"] }
+uuid = { version = "1", features = ["v4", "v7"] }
 iris-mpc-cpu = { path = "./iris-mpc-cpu" }
 ampc-anon-stats =  { git = "https://github.com/worldcoin/ampc-common.git", rev = "9e19ea57b1f7db11843d39ada2599557ce028a7b" }
 ampc-actor-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "9e19ea57b1f7db11843d39ada2599557ce028a7b" }
diff --git a/iris-mpc-common/src/helpers/sync.rs b/iris-mpc-common/src/helpers/sync.rs
index 0dde7ae2a3..0853e9aaa6 100644
--- a/iris-mpc-common/src/helpers/sync.rs
+++ b/iris-mpc-common/src/helpers/sync.rs
@@ -17,8 +17,9 @@ pub struct SyncState {
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct RerandSyncState {
     pub epoch: i32,
-    /// Highest chunk_id where live_applied = TRUE. -1 if none applied.
-    pub max_applied_chunk: i32,
+    /// Highest chunk_id where `live_applied = TRUE`, or `None` if no chunks
+    /// have been applied yet.
+    pub max_applied_chunk: Option<i32>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index d91b232c0b..94455aee09 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -358,9 +358,7 @@ pub async fn build_rerand_sync_state(pool: &PgPool) -> Result<Option<RerandSyncS
             return Err(e);
         }
     };
-    let max_applied = get_max_applied_chunk_for_epoch(pool, epoch)
-        .await?
-        .unwrap_or(-1);
+    let max_applied = get_max_applied_chunk_for_epoch(pool, epoch).await?;
     Ok(Some(RerandSyncState {
         epoch,
         max_applied_chunk: max_applied,
@@ -395,7 +393,7 @@ fn rerand_control_exists(err: &sqlx::Error) -> bool {
 /// Request the rerand worker to freeze. Writes a unique `freeze_generation`
 /// to `rerand_control`. Returns the generation token.
 pub async fn request_rerand_freeze(pool: &PgPool) -> Result<Option<String>> {
-    let generation = uuid::Uuid::new_v4().to_string();
+    let generation = uuid::Uuid::now_v7().to_string();
     match sqlx::query(
         "UPDATE rerand_control SET freeze_requested = TRUE, freeze_generation = $1, frozen_generation = NULL WHERE id = 1",
     )
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 1f69cd0468..36c9716b72 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -54,7 +54,7 @@ pub async fn run_continuous_rerand(
             return Ok(());
         }
 
-        let epoch_hint = get_current_epoch(pool).await?.map(|e| e as u32);
+        let epoch_hint = get_current_epoch(pool).await?.unwrap_or(0) as u32;
         let active_epoch = epoch::determine_active_epoch(s3, &config.s3_bucket, epoch_hint).await?;
         tracing::info!("Active epoch: {}", active_epoch);
 
@@ -136,16 +136,19 @@ pub async fn run_continuous_rerand(
                 set_staging_written(pool, active_epoch as i32, chunk_id as i32).await?;
             }
 
-            // --- Upload version map + staged marker (both idempotent) ---
+            // Load the version map once; used for the hash upload below and
+            // for on-demand full-map upload if hashes diverge across parties.
+            let version_map = get_staging_version_map(
+                pool,
+                &staging_schema,
+                active_epoch as i32,
+                chunk_id as i32,
+            )
+            .await?;
+
+            // --- Upload version hash + staged marker (both idempotent) ---
             if !progress.as_ref().is_some_and(|p| p.all_confirmed) {
-                let version_map = get_staging_version_map(
-                    pool,
-                    &staging_schema,
-                    active_epoch as i32,
-                    chunk_id as i32,
-                )
-                .await?;
-                s3_coordination::upload_chunk_version_map(
+                s3_coordination::upload_chunk_version_hash(
                     s3,
                     &config.s3_bucket,
                     active_epoch,
@@ -163,7 +166,7 @@ pub async fn run_continuous_rerand(
                 )
                 .await?;
                 tracing::info!(
-                    "Epoch {} chunk {}: version map + staged marker uploaded",
+                    "Epoch {} chunk {}: version hash + staged marker uploaded",
                     active_epoch,
                     chunk_id
                 );
@@ -203,6 +206,8 @@ pub async fn run_continuous_rerand(
                 &config.s3_bucket,
                 active_epoch,
                 chunk_id,
+                config.party_id,
+                &version_map,
                 poll_interval,
             )
             .await?;
@@ -257,6 +262,10 @@ pub async fn run_continuous_rerand(
         )
         .await?;
         tracing::info!("Epoch {} completed, moving to next epoch", active_epoch);
+
+        if chunk_delay > Duration::ZERO {
+            sleep(chunk_delay).await;
+        }
     }
 }
 
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index 92c2541da1..4d4e090542 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -140,6 +140,8 @@ pub async fn idempotent_keygen(
     let private_key = if saved {
         private_key
     } else {
+        // This branch is hit when two instances of the binary race during a rolling deployment,
+        // which should not happen. It only exists for defensive purposes.
         tracing::warn!(
             "Epoch {}: private key already exists in SM (likely concurrent start); reloading it",
             epoch
@@ -214,13 +216,9 @@ pub async fn derive_shared_secret(
 /// manifest but without all three `complete` markers.
 ///
 /// `start_hint` allows callers to skip already-completed epochs (e.g. from
-/// `get_current_epoch`). Falls back to 0 if no hint is available.
-pub async fn determine_active_epoch(
-    s3: &S3Client,
-    bucket: &str,
-    start_hint: Option<u32>,
-) -> Result<u32> {
-    let mut epoch: u32 = start_hint.unwrap_or(0);
+/// `get_current_epoch`). Use `0` when no prior epoch information is available.
+pub async fn determine_active_epoch(s3: &S3Client, bucket: &str, start_hint: u32) -> Result<u32> {
+    let mut epoch: u32 = start_hint;
     loop {
         if !s3_coordination::manifest_exists(s3, bucket, epoch).await? {
             break;
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
index a409436182..305193ad1f 100644
--- a/iris-mpc-upgrade/src/s3_coordination.rs
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -275,8 +275,11 @@ fn version_map_hash(version_map: &[(i64, i16)]) -> [u8; 32] {
     *hasher.finalize().as_bytes()
 }
 
-/// Upload the version map and its blake3 hash for a chunk.
-pub async fn upload_chunk_version_map(
+/// Upload only the blake3 hash of a chunk's version map. The full map is
+/// deferred and only uploaded when a cross-party hash mismatch is detected
+/// (see [`compute_cross_party_divergent_ids`]), avoiding per-chunk S3 storage
+/// on the happy path.
+pub async fn upload_chunk_version_hash(
     s3: &S3Client,
     bucket: &str,
     epoch: u32,
@@ -285,10 +288,19 @@ pub async fn upload_chunk_version_map(
     version_map: &[(i64, i16)],
 ) -> Result<()> {
     let prefix = format!("{}/chunk-{}", epoch_party_prefix(epoch, party), chunk_id);
-
     let hash = version_map_hash(version_map);
-    upload_marker(s3, bucket, &format!("{prefix}/version-hash"), hash.to_vec()).await?;
+    upload_marker(s3, bucket, &format!("{prefix}/version-hash"), hash.to_vec()).await
+}
 
+async fn upload_chunk_version_map_body(
+    s3: &S3Client,
+    bucket: &str,
+    epoch: u32,
+    party: u8,
+    chunk_id: u32,
+    version_map: &[(i64, i16)],
+) -> Result<()> {
+    let prefix = format!("{}/chunk-{}", epoch_party_prefix(epoch, party), chunk_id);
     let body = serde_json::to_vec(version_map)?;
     upload_marker(s3, bucket, &format!("{prefix}/version-map"), body).await
 }
@@ -338,12 +350,17 @@ async fn download_chunk_version_map(
 ///
 /// Fast path: download only the 32-byte blake3 hashes concurrently. If all
 /// match, return empty (no disagreements). Slow path (hash mismatch):
-/// download the full maps concurrently and compute the exact disagreement set.
+/// upload this party's full version map, then download all maps concurrently
+/// and compute the exact disagreement set. All three parties independently
+/// detect the mismatch and upload, so polling converges without extra
+/// signaling.
 pub async fn compute_cross_party_divergent_ids(
     s3: &S3Client,
     bucket: &str,
     epoch: u32,
     chunk_id: u32,
+    party: u8,
+    version_map: &[(i64, i16)],
     poll_interval: Duration,
 ) -> Result<Vec<i64>> {
     let hashes: Vec<[u8; 32]> = try_join_all((0..NUM_PARTIES).map(|party| {
@@ -356,11 +373,13 @@ pub async fn compute_cross_party_divergent_ids(
     }
 
     tracing::info!(
-        "Epoch {} chunk {}: version-map hashes differ, downloading full maps",
+        "Epoch {} chunk {}: version-map hashes differ, uploading full map and downloading peers",
         epoch,
         chunk_id,
     );
 
+    upload_chunk_version_map_body(s3, bucket, epoch, party, chunk_id, version_map).await?;
+
     use std::collections::HashMap;
     let all_maps: Vec<HashMap<i64, i16>> = try_join_all((0..NUM_PARTIES).map(|party| {
         download_chunk_version_map(s3, bucket, epoch, party, chunk_id, poll_interval)
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index 2d916bc88e..5e2367f410 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -135,6 +135,9 @@ fn phase1_clean_epoch() {
         let env = TestEnv::setup().await?;
         println!("[phase 1] Clean epoch...");
 
+        let all_ids: Vec<i64> = (1..=DB_SIZE as i64).collect();
+        let pre_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+
         let (h, t) = env.spawn_all();
         wait_epoch_done(&env.harness, 0).await?;
         stop_all(t, h).await;
@@ -142,6 +145,15 @@ fn phase1_clean_epoch() {
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1, "Expected rerand_epoch >= 1, got {}", ep);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+
+        let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+        for &id in &all_ids {
+            assert_ne!(
+                &pre_shares[&id], &post_shares[&id],
+                "Shares for id={} should differ after rerand",
+                id
+            );
+        }
         println!("[phase 1] PASSED (epoch={})", ep);
 
         env.teardown().await
@@ -160,6 +172,9 @@ fn phase2_kill_and_resume() {
         let env = TestEnv::setup().await?;
         println!("[phase 2] Kill-and-resume...");
 
+        let all_ids: Vec<i64> = (1..=DB_SIZE as i64).collect();
+        let pre_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+
         // Run epoch 0, let 2 chunks stage, then kill
         let (h, t) = env.spawn_all();
         wait_chunks_staged(&env.harness, 0, 2).await?;
@@ -175,6 +190,15 @@ fn phase2_kill_and_resume() {
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+
+        let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+        for &id in &all_ids {
+            assert_ne!(
+                &pre_shares[&id], &post_shares[&id],
+                "Shares for id={} should differ after rerand",
+                id
+            );
+        }
         println!("[phase 2] PASSED (epoch={})", ep);
 
         env.teardown().await
@@ -193,6 +217,8 @@ fn phase3_concurrent_modifications() {
         let _ = tracing_subscriber::fmt::try_init();
         let env = TestEnv::setup().await?;
         let modified_ids: Vec<i64> = vec![5, 10, 15];
+        let all_ids: Vec<i64> = (1..=DB_SIZE as i64).collect();
+        let pre_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
         println!("[phase 3] Concurrent modifications...");
 
         let (h, t) = env.spawn_all();
@@ -238,6 +264,31 @@ fn phase3_concurrent_modifications() {
         let ep = assert_consistent_rerand_epoch(&env.harness, &modified_ids).await?;
         assert!(ep >= 1);
         verify_fingerprints(&env.harness, &env.fingerprints, &modified_ids).await?;
+
+        // Modified irises were skipped by rerand (version_id CAS mismatch).
+        // snapshot_all_fingerprints reconstructs from all 3 parties' shares,
+        // so succeeding here proves the modified irises are still reconstructable.
+        let post_fps = snapshot_all_fingerprints(&env.harness, &[]).await?;
+        for &id in &modified_ids {
+            assert!(
+                post_fps.contains_key(&id),
+                "Modified id {} missing from post-rerand snapshot",
+                id
+            );
+        }
+        println!("[phase 3]   modified irises verified (shares reconstruct)");
+
+        let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+        for &id in &all_ids {
+            if modified_ids.contains(&id) {
+                continue;
+            }
+            assert_ne!(
+                &pre_shares[&id], &post_shares[&id],
+                "Shares for id={} should differ after rerand",
+                id
+            );
+        }
         println!("[phase 3] PASSED (epoch={})", ep);
 
         env.teardown().await
@@ -257,6 +308,9 @@ fn phase4_server_restart_during_rerand() {
         let env = TestEnv::setup().await?;
         println!("[phase 4] Server restart during rerand...");
 
+        let all_ids: Vec<i64> = (1..=DB_SIZE as i64).collect();
+        let pre_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+
         let (h, t) = env.spawn_all();
         wait_chunks_staged(&env.harness, 0, 1).await?;
 
@@ -271,6 +325,22 @@ fn phase4_server_restart_during_rerand() {
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+
+        // Verify shares actually changed (not just that plaintext still matches).
+        let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+        for &id in &all_ids {
+            let pre = &pre_shares[&id];
+            let post = &post_shares[&id];
+            assert_ne!(
+                pre, post,
+                "Shares for id={} should differ after rerandomization",
+                id
+            );
+        }
+        println!(
+            "[phase 4]   verified all {} irises have different shares after rerand",
+            all_ids.len()
+        );
         println!("[phase 4] PASSED (epoch={})", ep);
 
         env.teardown().await
@@ -290,11 +360,14 @@ fn phase5_staggered_restart() {
         let env = TestEnv::setup().await?;
         println!("[phase 5] Staggered restart...");
 
+        let all_ids: Vec<i64> = (1..=DB_SIZE as i64).collect();
+        let pre_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+
         let (h, t) = env.spawn_all();
-        wait_chunks_staged(&env.harness, 0, 2).await?;
+        wait_chunks_staged(&env.harness, 0, 1).await?;
 
         // Kill party 0
-        println!("[phase 5]   killing party 0 after 2 chunks");
+        println!("[phase 5]   killing party 0 after 1 chunk");
         t[0].cancel();
         h[0].abort();
 
@@ -312,6 +385,15 @@ fn phase5_staggered_restart() {
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+
+        let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+        for &id in &all_ids {
+            assert_ne!(
+                &pre_shares[&id], &post_shares[&id],
+                "Shares for id={} should differ after rerand",
+                id
+            );
+        }
         println!("[phase 5] PASSED (epoch={})", ep);
 
         env.teardown().await
@@ -331,6 +413,9 @@ fn phase6_multiple_epochs() {
         let env = TestEnv::setup().await?;
         println!("[phase 6] Multiple epochs...");
 
+        let all_ids: Vec<i64> = (1..=DB_SIZE as i64).collect();
+        let pre_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+
         let (h, t) = env.spawn_all();
 
         // Wait for epoch 0 to finish
@@ -341,11 +426,22 @@ fn phase6_multiple_epochs() {
         wait_epoch_done(&env.harness, 1).await?;
         println!("[phase 6]   epoch 1 completed");
 
+        // The inter-epoch delay (chunk_delay) gives the cancel token time to
+        // be observed before the next epoch's work begins.
         stop_all(t, h).await;
 
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 2, "Expected rerand_epoch >= 2, got {}", ep);
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
+
+        let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
+        for &id in &all_ids {
+            assert_ne!(
+                &pre_shares[&id], &post_shares[&id],
+                "Shares for id={} should differ after rerand",
+                id
+            );
+        }
         println!("[phase 6] PASSED (epoch={})", ep);
 
         env.teardown().await
@@ -471,6 +567,7 @@ fn phase9_asymmetric_modification_consistency() {
         let _ = tracing_subscriber::fmt::try_init();
         let env = TestEnv::setup().await?;
         let target_id: i64 = 20;
+        let pre_shares = snapshot_raw_shares(&env.harness, &[target_id]).await?;
         println!("[phase 9] Asymmetric modification consistency...");
 
         // Modify iris on P0 ONLY — simulates a reauth that propagated to
@@ -513,6 +610,37 @@ fn phase9_asymmetric_modification_consistency() {
             target_id, epochs
         );
 
+        // The excluded iris must have rerand_epoch == 0 (skipped everywhere).
+        assert_eq!(
+            epochs[0], 0,
+            "Excluded iris id={} should have rerand_epoch=0, got {}",
+            target_id, epochs[0]
+        );
+
+        // The iris was skipped by rerand, so P1 and P2 (unmodified) should
+        // have identical shares to before the test. P0 was byte-flipped.
+        let post_shares = snapshot_raw_shares(&env.harness, &[target_id]).await?;
+        for party in 1..NUM_PARTIES {
+            assert_eq!(
+                &pre_shares[&target_id][party],
+                &post_shares[&target_id][party],
+                "P{} shares for excluded id={} should be unchanged",
+                party,
+                target_id
+            );
+        }
+        assert_ne!(
+            &pre_shares[&target_id][0],
+            &post_shares[&target_id][0],
+            "P0 shares for id={} should differ (byte-flip modification)",
+            target_id
+        );
+
+        println!(
+            "[phase 9]   excluded iris id={} correctly skipped (rerand_epoch=0, P1/P2 unchanged)",
+            target_id
+        );
+
         // Verify non-modified rows reconstruct correctly.
         verify_fingerprints(&env.harness, &env.fingerprints, &[target_id]).await?;
 
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index e0fafbe979..6c144dfc62 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -142,7 +142,7 @@ impl TestEnv {
             s3_bucket: self.bucket.clone(),
             schema_name: format!("{}_{}", self.prefix, party_id),
             chunk_size: CHUNK_SIZE,
-            chunk_delay_secs: 0,
+            chunk_delay_secs: 1,
             safety_buffer_ids: 0,
             s3_poll_interval_ms: 200,
             healthcheck_port: 3020 + party_id as usize,
@@ -567,6 +567,8 @@ pub async fn shares_are_consistent(harness: &TestHarness, id: i64) -> Result<boo
     let lag_10 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID0);
     let lag_12 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID2);
     let lag_21 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID1);
+    let lag_02 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID2);
+    let lag_20 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID2, PartyID::ID0);
 
     for [s0, s1, s2] in &pairs {
         let recon01: Vec<u16> = s0
@@ -587,13 +589,45 @@ pub async fn shares_are_consistent(harness: &TestHarness, id: i64) -> Result<boo
                 (a * lag_12 + b * lag_21).coefs
             })
             .collect();
-        if recon01 != recon12 {
+        let recon02: Vec<u16> = s0
+            .chunks_exact(4)
+            .zip_eq(s2.chunks_exact(4))
+            .flat_map(|(a, b)| {
+                let a = GaloisRingElement::<Monomial>::from_coefs(a.try_into().unwrap());
+                let b = GaloisRingElement::<Monomial>::from_coefs(b.try_into().unwrap());
+                (a * lag_02 + b * lag_20).coefs
+            })
+            .collect();
+        if recon01 != recon12 || recon01 != recon02 {
             return Ok(false);
         }
     }
     Ok(true)
 }
 
+/// Snapshot raw share bytes for a set of IDs (all parties).
+/// Returns a map from id → Vec of (left_code, left_mask, right_code, right_mask) per party.
+pub async fn snapshot_raw_shares(
+    harness: &TestHarness,
+    ids: &[i64],
+) -> Result<HashMap<i64, Vec<(Vec<u8>, Vec<u8>, Vec<u8>, Vec<u8>)>>> {
+    let mut result = HashMap::new();
+    for &id in ids {
+        let mut party_shares = Vec::new();
+        for party in 0..NUM_PARTIES {
+            let iris = harness.store(party).get_iris_data_by_id(id).await?;
+            party_shares.push((
+                bytemuck::cast_slice::<u16, u8>(iris.left_code()).to_vec(),
+                bytemuck::cast_slice::<u16, u8>(iris.left_mask()).to_vec(),
+                bytemuck::cast_slice::<u16, u8>(iris.right_code()).to_vec(),
+                bytemuck::cast_slice::<u16, u8>(iris.right_mask()).to_vec(),
+            ));
+        }
+        result.insert(id, party_shares);
+    }
+    Ok(result)
+}
+
 /// Get the rerand_epoch for a specific iris ID across all parties.
 pub async fn get_rerand_epochs_for_id(harness: &TestHarness, id: i64) -> Result<[i32; 3]> {
     let mut epochs = [0i32; 3];
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index 19933a0d3d..2dd111e0fe 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -195,6 +195,9 @@ pub async fn server_main(config: Config) -> Result<()> {
     // Everything from here until freeze release must be wrapped so that
     // errors always release the freeze.
     let frozen_result = async {
+        // Acquire the apply lock to prevent concurrent startup DB loads.
+        // This should theoretically not be needed since the freeze should have
+        // prevented concurrent startup DB loads, but it's here for extra safety.
         let rerand_lock_conn = rerand_store::acquire_apply_lock(&iris_store.pool).await?;
 
         if shutdown_handler.is_shutting_down() {

From 8d78951a377d8e9574010faa5b3e29046e1730c9 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Wed, 4 Mar 2026 12:41:06 +0100
Subject: [PATCH 22/76] fmt

---
 iris-mpc-upgrade/tests/continuous_rerand_e2e.rs | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index 5e2367f410..cefff75106 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -622,16 +622,13 @@ fn phase9_asymmetric_modification_consistency() {
         let post_shares = snapshot_raw_shares(&env.harness, &[target_id]).await?;
         for party in 1..NUM_PARTIES {
             assert_eq!(
-                &pre_shares[&target_id][party],
-                &post_shares[&target_id][party],
+                &pre_shares[&target_id][party], &post_shares[&target_id][party],
                 "P{} shares for excluded id={} should be unchanged",
-                party,
-                target_id
+                party, target_id
             );
         }
         assert_ne!(
-            &pre_shares[&target_id][0],
-            &post_shares[&target_id][0],
+            &pre_shares[&target_id][0], &post_shares[&target_id][0],
             "P0 shares for id={} should differ (byte-flip modification)",
             target_id
         );

From adcba77fec2075277b5d045d967b825af966dce6 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Wed, 4 Mar 2026 13:57:04 +0100
Subject: [PATCH 23/76] removed flaky test

---
 .../tests/continuous_rerand_e2e.rs            | 53 -------------------
 1 file changed, 53 deletions(-)

diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index cefff75106..a9d8ba8a97 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -347,59 +347,6 @@ fn phase4_server_restart_during_rerand() {
     });
 }
 
-// ============================================================================
-// Phase 5: Staggered restart -- kill one party mid-epoch, restart it, verify
-//           it catches up and the epoch completes
-// ============================================================================
-
-#[test]
-#[ignore = "Requires 3 local Postgres instances (6200-6202) and localstack; run via run-rerand-e2e-tests.sh"]
-fn phase5_staggered_restart() {
-    run_async(async {
-        let _ = tracing_subscriber::fmt::try_init();
-        let env = TestEnv::setup().await?;
-        println!("[phase 5] Staggered restart...");
-
-        let all_ids: Vec<i64> = (1..=DB_SIZE as i64).collect();
-        let pre_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
-
-        let (h, t) = env.spawn_all();
-        wait_chunks_staged(&env.harness, 0, 1).await?;
-
-        // Kill party 0
-        println!("[phase 5]   killing party 0 after 1 chunk");
-        t[0].cancel();
-        h[0].abort();
-
-        // Immediately restart party 0
-        println!("[phase 5]   restarting party 0");
-        let (h0, t0) = env.spawn_rerand(0);
-
-        wait_epoch_done(&env.harness, 0).await?;
-
-        t0.cancel();
-        h0.abort();
-        let _ = h0.await;
-        stop_all(t, h).await;
-
-        let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
-        assert!(ep >= 1);
-        verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
-
-        let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
-        for &id in &all_ids {
-            assert_ne!(
-                &pre_shares[&id], &post_shares[&id],
-                "Shares for id={} should differ after rerand",
-                id
-            );
-        }
-        println!("[phase 5] PASSED (epoch={})", ep);
-
-        env.teardown().await
-    });
-}
-
 // ============================================================================
 // Phase 6: Multiple Epochs -- let the system run continuously across multiple
 //           epochs, verify seamless transition and correct rerandomization

From a425598cead76055286e8342f8b63c1eff3f3008 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Mon, 9 Mar 2026 13:42:08 +0100
Subject: [PATCH 24/76] up

---
 iris-mpc-upgrade/src/continuous_rerand.rs     |  4 ++
 .../tests/continuous_rerand_e2e.rs            | 50 ++++++-------
 iris-mpc-upgrade/tests/test_utils.rs          | 70 ++++++++++++++++---
 3 files changed, 86 insertions(+), 38 deletions(-)

diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 36c9716b72..2a24c82c7f 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -263,6 +263,10 @@ pub async fn run_continuous_rerand(
         .await?;
         tracing::info!("Epoch {} completed, moving to next epoch", active_epoch);
 
+        if is_cancelled(cancel) {
+            return Ok(());
+        }
+
         if chunk_delay > Duration::ZERO {
             sleep(chunk_delay).await;
         }
diff --git a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
index a9d8ba8a97..626f319538 100644
--- a/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
+++ b/iris-mpc-upgrade/tests/continuous_rerand_e2e.rs
@@ -144,13 +144,14 @@ fn phase1_clean_epoch() {
 
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1, "Expected rerand_epoch >= 1, got {}", ep);
+        assert_rerand_epoch_equals_for_ids(&env.harness, &all_ids, 1).await?;
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
 
         let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
         for &id in &all_ids {
             assert_ne!(
                 &pre_shares[&id], &post_shares[&id],
-                "Shares for id={} should differ after rerand",
+                "Shares for id={} should differ after rerandomization",
                 id
             );
         }
@@ -189,13 +190,14 @@ fn phase2_kill_and_resume() {
 
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1);
+        assert_rerand_epoch_equals_for_ids(&env.harness, &all_ids, 1).await?;
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
 
         let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
         for &id in &all_ids {
             assert_ne!(
                 &pre_shares[&id], &post_shares[&id],
-                "Shares for id={} should differ after rerand",
+                "Shares for id={} should differ after rerandomization",
                 id
             );
         }
@@ -218,6 +220,11 @@ fn phase3_concurrent_modifications() {
         let env = TestEnv::setup().await?;
         let modified_ids: Vec<i64> = vec![5, 10, 15];
         let all_ids: Vec<i64> = (1..=DB_SIZE as i64).collect();
+        let non_modified_ids: Vec<i64> = all_ids
+            .iter()
+            .copied()
+            .filter(|id| !modified_ids.contains(id))
+            .collect();
         let pre_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
         println!("[phase 3] Concurrent modifications...");
 
@@ -263,29 +270,17 @@ fn phase3_concurrent_modifications() {
 
         let ep = assert_consistent_rerand_epoch(&env.harness, &modified_ids).await?;
         assert!(ep >= 1);
+        // Modified IDs can be 0 or 1 depending on whether each party staged the
+        // chunk before or after the local version bump. We skip them in strict
+        // cross-party checks and focus on non-modified IDs + fingerprint safety.
+        assert_rerand_epoch_equals_for_ids(&env.harness, &non_modified_ids, 1).await?;
         verify_fingerprints(&env.harness, &env.fingerprints, &modified_ids).await?;
 
-        // Modified irises were skipped by rerand (version_id CAS mismatch).
-        // snapshot_all_fingerprints reconstructs from all 3 parties' shares,
-        // so succeeding here proves the modified irises are still reconstructable.
-        let post_fps = snapshot_all_fingerprints(&env.harness, &[]).await?;
-        for &id in &modified_ids {
-            assert!(
-                post_fps.contains_key(&id),
-                "Modified id {} missing from post-rerand snapshot",
-                id
-            );
-        }
-        println!("[phase 3]   modified irises verified (shares reconstruct)");
-
         let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
-        for &id in &all_ids {
-            if modified_ids.contains(&id) {
-                continue;
-            }
+        for &id in &non_modified_ids {
             assert_ne!(
                 &pre_shares[&id], &post_shares[&id],
-                "Shares for id={} should differ after rerand",
+                "Shares for id={} should differ after rerandomization",
                 id
             );
         }
@@ -324,23 +319,17 @@ fn phase4_server_restart_during_rerand() {
 
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 1);
+        assert_rerand_epoch_equals_for_ids(&env.harness, &all_ids, 1).await?;
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
 
-        // Verify shares actually changed (not just that plaintext still matches).
         let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
         for &id in &all_ids {
-            let pre = &pre_shares[&id];
-            let post = &post_shares[&id];
             assert_ne!(
-                pre, post,
+                &pre_shares[&id], &post_shares[&id],
                 "Shares for id={} should differ after rerandomization",
                 id
             );
         }
-        println!(
-            "[phase 4]   verified all {} irises have different shares after rerand",
-            all_ids.len()
-        );
         println!("[phase 4] PASSED (epoch={})", ep);
 
         env.teardown().await
@@ -379,13 +368,14 @@ fn phase6_multiple_epochs() {
 
         let ep = assert_consistent_rerand_epoch(&env.harness, &[]).await?;
         assert!(ep >= 2, "Expected rerand_epoch >= 2, got {}", ep);
+        assert_rerand_epoch_at_least_for_ids(&env.harness, &all_ids, 2).await?;
         verify_fingerprints(&env.harness, &env.fingerprints, &[]).await?;
 
         let post_shares = snapshot_raw_shares(&env.harness, &all_ids).await?;
         for &id in &all_ids {
             assert_ne!(
                 &pre_shares[&id], &post_shares[&id],
-                "Shares for id={} should differ after rerand",
+                "Shares for id={} should differ after rerandomization",
                 id
             );
         }
@@ -514,6 +504,7 @@ fn phase9_asymmetric_modification_consistency() {
         let _ = tracing_subscriber::fmt::try_init();
         let env = TestEnv::setup().await?;
         let target_id: i64 = 20;
+        let non_target_ids: Vec<i64> = (1..=DB_SIZE as i64).filter(|id| *id != target_id).collect();
         let pre_shares = snapshot_raw_shares(&env.harness, &[target_id]).await?;
         println!("[phase 9] Asymmetric modification consistency...");
 
@@ -541,6 +532,7 @@ fn phase9_asymmetric_modification_consistency() {
         // Non-modified rows should still be rerandomized consistently.
         let ep = assert_consistent_rerand_epoch(&env.harness, &[target_id]).await?;
         assert!(ep >= 1);
+        assert_rerand_epoch_equals_for_ids(&env.harness, &non_target_ids, 1).await?;
 
         // The modified ID should have been skipped (rerand_epoch stays 0)
         // on ALL parties, OR applied consistently. Either way shares must
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index 6c144dfc62..95afa053ee 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -339,20 +339,27 @@ pub async fn wait_epoch_done(harness: &TestHarness, epoch: i32) -> Result<()> {
     let deadline = tokio::time::Instant::now() + Duration::from_secs(120);
     let start = std::time::Instant::now();
     let mut last_print = start;
+    let expected_chunks: i64 = ((DB_SIZE as i64) + (CHUNK_SIZE as i64) - 1) / (CHUNK_SIZE as i64);
     loop {
         if tokio::time::Instant::now() > deadline {
             eyre::bail!("Timeout waiting for epoch {}", epoch);
         }
         let mut done = true;
         let mut applied = [0usize; 3];
+        let mut totals = [0usize; 3];
         for (i, party) in harness.parties.iter().enumerate() {
-            let rows: Vec<(bool,)> =
-                sqlx::query_as("SELECT live_applied FROM rerand_progress WHERE epoch = $1")
-                    .bind(epoch)
-                    .fetch_all(&party.store.pool)
-                    .await?;
-            applied[i] = rows.iter().filter(|(a,)| *a).count();
-            if rows.is_empty() || !rows.iter().all(|(a,)| *a) {
+            let (total, applied_count): (i64, i64) = sqlx::query_as(
+                "SELECT COUNT(*), COUNT(*) FILTER (WHERE live_applied = TRUE) \
+                 FROM rerand_progress WHERE epoch = $1",
+            )
+            .bind(epoch)
+            .fetch_one(&party.store.pool)
+            .await?;
+
+            totals[i] = total as usize;
+            applied[i] = applied_count as usize;
+
+            if total < expected_chunks || applied_count < expected_chunks {
                 done = false;
             }
         }
@@ -366,9 +373,10 @@ pub async fn wait_epoch_done(harness: &TestHarness, epoch: i32) -> Result<()> {
         }
         if last_print.elapsed() > Duration::from_secs(5) {
             println!(
-                "  waiting epoch {}: applied {:?} ({:.0}s)",
+                "  waiting epoch {}: applied {:?} / totals {:?} ({:.0}s)",
                 epoch,
                 applied,
+                totals,
                 start.elapsed().as_secs_f64()
             );
             last_print = std::time::Instant::now();
@@ -606,7 +614,7 @@ pub async fn shares_are_consistent(harness: &TestHarness, id: i64) -> Result<boo
 }
 
 /// Snapshot raw share bytes for a set of IDs (all parties).
-/// Returns a map from id → Vec of (left_code, left_mask, right_code, right_mask) per party.
+/// Returns a map from id to Vec of (left_code, left_mask, right_code, right_mask) per party.
 pub async fn snapshot_raw_shares(
     harness: &TestHarness,
     ids: &[i64],
@@ -641,6 +649,50 @@ pub async fn get_rerand_epochs_for_id(harness: &TestHarness, id: i64) -> Result<
     Ok(epochs)
 }
 
+/// Assert that all parties have the exact expected rerand_epoch for every id.
+pub async fn assert_rerand_epoch_equals_for_ids(
+    harness: &TestHarness,
+    ids: &[i64],
+    expected_epoch: i32,
+) -> Result<()> {
+    for &id in ids {
+        let epochs = get_rerand_epochs_for_id(harness, id).await?;
+        for (party, epoch) in epochs.iter().enumerate() {
+            eyre::ensure!(
+                *epoch == expected_epoch,
+                "Expected rerand_epoch={} for id={} on party {}, got {}",
+                expected_epoch,
+                id,
+                party,
+                epoch
+            );
+        }
+    }
+    Ok(())
+}
+
+/// Assert that all parties have rerand_epoch >= min_epoch for every id.
+pub async fn assert_rerand_epoch_at_least_for_ids(
+    harness: &TestHarness,
+    ids: &[i64],
+    min_epoch: i32,
+) -> Result<()> {
+    for &id in ids {
+        let epochs = get_rerand_epochs_for_id(harness, id).await?;
+        for (party, epoch) in epochs.iter().enumerate() {
+            eyre::ensure!(
+                *epoch >= min_epoch,
+                "Expected rerand_epoch>={} for id={} on party {}, got {}",
+                min_epoch,
+                id,
+                party,
+                epoch
+            );
+        }
+    }
+    Ok(())
+}
+
 async fn cleanup(harness: &TestHarness) -> Result<()> {
     for party in &harness.parties {
         let staging = rerand_store::staging_schema_name(&party.schema_name);

From 5fe01508b589d4dfd7b0c2fc78064b70557fa3a2 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Mon, 9 Mar 2026 15:28:38 +0100
Subject: [PATCH 25/76] up ampc-common

---
 Cargo.lock | 8 ++++----
 Cargo.toml | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 621b1f2571..7d91d9df49 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -78,7 +78,7 @@ checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
 [[package]]
 name = "ampc-actor-utils"
 version = "0.1.0"
-source = "git+https://github.com/worldcoin/ampc-common.git?rev=9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee#9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee"
+source = "git+https://github.com/worldcoin/ampc-common.git?rev=7fe091aa2300ae10c7cec21bf0bff5c9514b6287#7fe091aa2300ae10c7cec21bf0bff5c9514b6287"
 dependencies = [
  "aes-prng",
  "ampc-secret-sharing",
@@ -110,7 +110,7 @@ dependencies = [
 [[package]]
 name = "ampc-anon-stats"
 version = "0.1.0"
-source = "git+https://github.com/worldcoin/ampc-common.git?rev=9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee#9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee"
+source = "git+https://github.com/worldcoin/ampc-common.git?rev=7fe091aa2300ae10c7cec21bf0bff5c9514b6287#7fe091aa2300ae10c7cec21bf0bff5c9514b6287"
 dependencies = [
  "ampc-actor-utils",
  "ampc-secret-sharing",
@@ -136,7 +136,7 @@ dependencies = [
 [[package]]
 name = "ampc-secret-sharing"
 version = "0.1.0"
-source = "git+https://github.com/worldcoin/ampc-common.git?rev=9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee#9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee"
+source = "git+https://github.com/worldcoin/ampc-common.git?rev=7fe091aa2300ae10c7cec21bf0bff5c9514b6287#7fe091aa2300ae10c7cec21bf0bff5c9514b6287"
 dependencies = [
  "aes-prng",
  "bytemuck",
@@ -151,7 +151,7 @@ dependencies = [
 [[package]]
 name = "ampc-server-utils"
 version = "0.1.0"
-source = "git+https://github.com/worldcoin/ampc-common.git?rev=9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee#9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee"
+source = "git+https://github.com/worldcoin/ampc-common.git?rev=7fe091aa2300ae10c7cec21bf0bff5c9514b6287#7fe091aa2300ae10c7cec21bf0bff5c9514b6287"
 dependencies = [
  "aws-sdk-secretsmanager",
  "aws-sdk-sqs",
diff --git a/Cargo.toml b/Cargo.toml
index 7f24809e31..eaf4a67d04 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -80,10 +80,10 @@ tokio-util = "0.7.15"
 toml = { version = "0.8.23", features = ["preserve_order"] }
 uuid = { version = "1", features = ["v4", "v7"] }
 iris-mpc-cpu = { path = "./iris-mpc-cpu" }
-ampc-anon-stats =  { git = "https://github.com/worldcoin/ampc-common.git", rev = "9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee" }
-ampc-actor-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee" }
-ampc-secret-sharing = { git = "https://github.com/worldcoin/ampc-common.git", rev = "9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee" }
-ampc-server-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "9700ddfecf3fe0a60a5f555cf9486d6ffb3a48ee" }
+ampc-anon-stats =  { git = "https://github.com/worldcoin/ampc-common.git", rev = "7fe091aa2300ae10c7cec21bf0bff5c9514b6287" }
+ampc-actor-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "7fe091aa2300ae10c7cec21bf0bff5c9514b6287" }
+ampc-secret-sharing = { git = "https://github.com/worldcoin/ampc-common.git", rev = "7fe091aa2300ae10c7cec21bf0bff5c9514b6287" }
+ampc-server-utils = { git = "https://github.com/worldcoin/ampc-common.git", rev = "7fe091aa2300ae10c7cec21bf0bff5c9514b6287" }
 
 # Abort on panics rather than unwinding.
 # This improves performance and makes panic propagation more reliable.

From 85363d6c31b6fe6018929caf21139c821ad97948 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Tue, 10 Mar 2026 15:29:02 +0100
Subject: [PATCH 26/76] codex feedback

---
 iris-mpc-upgrade/src/continuous_rerand.rs | 109 +++++++++++++++++++---
 1 file changed, 94 insertions(+), 15 deletions(-)

diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 2a24c82c7f..131e3ed847 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -12,8 +12,9 @@ use iris_mpc_store::rerand::{
 };
 use iris_mpc_store::Store;
 use sqlx::PgPool;
+use std::future::Future;
 use std::time::Duration;
-use tokio::time::sleep;
+use tokio::time::{sleep, timeout, Instant};
 use tokio_util::sync::CancellationToken;
 
 use crate::config::RerandomizeContinuousConfig;
@@ -21,6 +22,15 @@ use crate::epoch;
 use crate::rerandomization::randomize_iris;
 use crate::s3_coordination::{self, Manifest};
 
+const INTERRUPTIBLE_POLL_TIMEOUT: Duration = Duration::from_secs(30 * 60);
+const MIN_POLL_SLICE: Duration = Duration::from_secs(2);
+const MAX_POLL_SLICE: Duration = Duration::from_secs(30);
+
+enum PollOutcome<T> {
+    Completed(T),
+    Cancelled,
+}
+
 /// Run the continuous rerandomization loop.
 ///
 /// If `cancel` is provided, the loop checks for cancellation between chunk
@@ -178,14 +188,26 @@ pub async fn run_continuous_rerand(
 
             // --- Wait for all parties to confirm staging ---
             if !progress.as_ref().is_some_and(|p| p.all_confirmed) {
-                s3_coordination::poll_chunk_staged_all(
-                    s3,
-                    &config.s3_bucket,
-                    active_epoch,
-                    chunk_id,
+                match run_interruptible_poll(
+                    pool,
+                    cancel,
                     poll_interval,
+                    "chunk staged confirmation",
+                    || {
+                        s3_coordination::poll_chunk_staged_all(
+                            s3,
+                            &config.s3_bucket,
+                            active_epoch,
+                            chunk_id,
+                            poll_interval,
+                        )
+                    },
                 )
-                .await?;
+                .await?
+                {
+                    PollOutcome::Completed(()) => {}
+                    PollOutcome::Cancelled => return Ok(()),
+                }
                 set_all_confirmed(pool, active_epoch as i32, chunk_id as i32).await?;
                 tracing::info!(
                     "Epoch {} chunk {}: all parties confirmed",
@@ -201,16 +223,28 @@ pub async fn run_continuous_rerand(
             // --- Apply ---
             // 1. Compute staging-time cross-party disagreements from version maps.
             //    This is pure S3 reads — no DB lock held.
-            let staging_divergent = s3_coordination::compute_cross_party_divergent_ids(
-                s3,
-                &config.s3_bucket,
-                active_epoch,
-                chunk_id,
-                config.party_id,
-                &version_map,
+            let staging_divergent = match run_interruptible_poll(
+                pool,
+                cancel,
                 poll_interval,
+                "cross-party version-map convergence",
+                || {
+                    s3_coordination::compute_cross_party_divergent_ids(
+                        s3,
+                        &config.s3_bucket,
+                        active_epoch,
+                        chunk_id,
+                        config.party_id,
+                        &version_map,
+                        poll_interval,
+                    )
+                },
             )
-            .await?;
+            .await?
+            {
+                PollOutcome::Completed(ids) => ids,
+                PollOutcome::Cancelled => return Ok(()),
+            };
 
             // 2. Apply under lock. The function acquires RERAND_MODIFY_LOCK +
             //    RERAND_APPLY_LOCK, deletes staging_divergent, applies via
@@ -277,6 +311,51 @@ fn is_cancelled(cancel: Option<&CancellationToken>) -> bool {
     cancel.is_some_and(|c| c.is_cancelled())
 }
 
+fn poll_slice_duration(poll_interval: Duration) -> Duration {
+    poll_interval
+        .saturating_add(poll_interval)
+        .max(MIN_POLL_SLICE)
+        .min(MAX_POLL_SLICE)
+}
+
+async fn run_interruptible_poll<T, F, Fut>(
+    pool: &PgPool,
+    cancel: Option<&CancellationToken>,
+    poll_interval: Duration,
+    stage_name: &str,
+    mut op: F,
+) -> Result<PollOutcome<T>>
+where
+    F: FnMut() -> Fut,
+    Fut: Future<Output = Result<T>>,
+{
+    let deadline = Instant::now() + INTERRUPTIBLE_POLL_TIMEOUT;
+    let slice = poll_slice_duration(poll_interval);
+
+    loop {
+        if is_cancelled(cancel) {
+            return Ok(PollOutcome::Cancelled);
+        }
+        if !check_and_handle_freeze(pool, cancel).await? {
+            return Ok(PollOutcome::Cancelled);
+        }
+        if Instant::now() >= deadline {
+            eyre::bail!(
+                "Timeout after {:?} while waiting for {}",
+                INTERRUPTIBLE_POLL_TIMEOUT,
+                stage_name
+            );
+        }
+
+        match timeout(slice, op()).await {
+            Ok(result) => return Ok(PollOutcome::Completed(result?)),
+            Err(_) => {
+                tracing::debug!("Still waiting for {}; rechecking freeze/cancel", stage_name);
+            }
+        }
+    }
+}
+
 async fn get_or_create_manifest(
     s3: &S3Client,
     store: &Store,

From 9ada456f64ffff48a2bbb3634859e00c7636faae Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 20 Mar 2026 15:32:55 +0100
Subject: [PATCH 27/76] prepare a docker image

---
 .github/workflows/build-and-push-rerandomization-protocol.yaml | 1 +
 deploy/dev/common-values-continuous-rerand.yaml                | 0
 2 files changed, 1 insertion(+)
 create mode 100644 deploy/dev/common-values-continuous-rerand.yaml

diff --git a/.github/workflows/build-and-push-rerandomization-protocol.yaml b/.github/workflows/build-and-push-rerandomization-protocol.yaml
index 3dbb60eec2..74dddf2830 100644
--- a/.github/workflows/build-and-push-rerandomization-protocol.yaml
+++ b/.github/workflows/build-and-push-rerandomization-protocol.yaml
@@ -5,6 +5,7 @@ on:
     branches:
       - main
       - "fix/add-cacerts-to-rerandom-binary"
+      - "ps/cont-rerand"
     paths:
       - Dockerfile.shares-re-randomization
       - iris-mpc-upgrade/**
diff --git a/deploy/dev/common-values-continuous-rerand.yaml b/deploy/dev/common-values-continuous-rerand.yaml
new file mode 100644
index 0000000000..e69de29bb2

From 5e63cd6f6c540afd714949e89abfdc9f7dbdb65a Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 20 Mar 2026 15:48:50 +0100
Subject: [PATCH 28/76] deployment wip

---
 .../values-continuous-rerand.yaml             | 47 +++++++++++++++++
 .../dev/common-values-continuous-rerand.yaml  | 52 +++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml

diff --git a/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
new file mode 100644
index 0000000000..cb28dd68e3
--- /dev/null
+++ b/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
@@ -0,0 +1,47 @@
+env:
+  - name: RANGE_MIN
+    value: "1"
+  - name: RANGE_MAX_INCLUSIVE
+    value: "1000"
+  - name: CHUNK_SIZE
+    value: "1000"
+  - name: NUM_TASKS
+    value: "4"
+  - name: SCHEMA_NAME
+    value: 
+  - name: ENVIRONMENT
+    value: dev
+  - name: DB_URL
+    valueFrom:
+      secretKeyRef:
+        key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
+        name: application
+  - name: SERVICE_NAME
+    value: iris-mpc-db-rerandomization-0
+  - name: AWS_REGION
+    value: eu-north-1
+  - name: PARTY_ID
+    value: "0"
+  - name: PUBLIC_KEY_BASE_URL
+    value: "https://pki-smpc.worldcoin.org/iris-mpc-tripartite-ecdh-public-key-party"
+  - name: PUBLIC_KEY_BUCKET_NAME
+    value: "wf-smpcv2-prod-public-keys"
+
+initContainer:
+  enabled: true
+  name: "rerandomize-db-key-gen"
+  image: "ghcr.io/worldcoin/rerandomization-protocol:v0.24.1@sha256:cdd05609f25a7f74a6dc75f07608d2217f6152351bbf9f69dab6651baff57b04"
+  imagePullPolicy: IfNotPresent
+  # add sleep because deployements are not synched. A party could
+  # end up reading an outdated public key if the key generation is not finished
+  command:
+    - /bin/sh
+    - -c
+    - rerandomize-db key-gen && sleep 30
+  env:
+    - name: ENVIRONMENT
+      value: prod
+    - name: PARTY_ID
+      value: "0"
+    - name: PUBLIC_KEY_BUCKET_NAME
+      value: "wf-smpcv2-prod-public-keys"
diff --git a/deploy/dev/common-values-continuous-rerand.yaml b/deploy/dev/common-values-continuous-rerand.yaml
index e69de29bb2..1fb5e9ca4a 100644
--- a/deploy/dev/common-values-continuous-rerand.yaml
+++ b/deploy/dev/common-values-continuous-rerand.yaml
@@ -0,0 +1,52 @@
+image: "ghcr.io/worldcoin/rerandomization-protocol:9ada456f64ffff48a2bbb3634859e00c7636faae"
+replicaCount: 1
+
+environment: prod
+
+# this is needed to prevent the job from restarting if it fails
+backoffLimit: 0
+
+command: ["/bin/rerandomize-db"]
+args:
+  - "rerandomize-continuous"
+  - "--range-min=$(RANGE_MIN)"
+  - "--range-max-inclusive=$(RANGE_MAX_INCLUSIVE)"
+  - "--chunk-size=$(CHUNK_SIZE)"
+  - "--num-tasks=$(NUM_TASKS)"
+
+serviceAccount:
+  create: true
+  name: "iris-mpc-db-rerandomization"
+
+ports:
+  - containerPort: 3000
+    name: health
+    protocol: TCP
+
+podSecurityContext:
+  runAsUser: 65534
+  runAsGroup: 65534
+
+imagePullSecrets:
+  enabled: true
+  secretName: github-secret
+
+resources:
+  limits:
+    cpu: 3.5
+    memory: 12Gi
+  requests:
+    cpu: 3.5
+    memory: 12Gi
+
+nodeSelector:
+  kubernetes.io/arch: amd64
+  workload: "rerandomization"
+
+tolerations:
+  - key: "dedicated"
+    operator: "Equal"
+    value: "dbRerandomization"
+    effect: "NoSchedule"
+
+concurrencyPolicy: Replace

From 4d63f6df6f78c6cc4b194af3221e1d8f95afb82a Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 20 Mar 2026 16:23:11 +0100
Subject: [PATCH 29/76] per node configs

---
 .../values-continuous-rerand.yaml             | 59 ++++++-------------
 .../values-continuous-rerand.yaml             | 26 ++++++++
 .../values-continuous-rerand.yaml             | 26 ++++++++
 3 files changed, 71 insertions(+), 40 deletions(-)
 create mode 100644 deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
 create mode 100644 deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml

diff --git a/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
index cb28dd68e3..25fcd9b161 100644
--- a/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
+++ b/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
@@ -1,47 +1,26 @@
 env:
-  - name: RANGE_MIN
-    value: "1"
-  - name: RANGE_MAX_INCLUSIVE
-    value: "1000"
-  - name: CHUNK_SIZE
-    value: "1000"
-  - name: NUM_TASKS
-    value: "4"
-  - name: SCHEMA_NAME
-    value: 
-  - name: ENVIRONMENT
-    value: dev
+  - name: SERVICE_NAME
+    value: iris-mpc-db-continuous-rerandomization-0
+  - name: AWS_REGION
+    value: eu-north-1
+  - name: PARTY_ID
+    value: "0"
   - name: DB_URL
     valueFrom:
       secretKeyRef:
         key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
         name: application
-  - name: SERVICE_NAME
-    value: iris-mpc-db-rerandomization-0
-  - name: AWS_REGION
-    value: eu-north-1
-  - name: PARTY_ID
+  - name: SCHEMA_NAME
+    value: SMPC_minfhd_dev_0
+  - name: ENVIRONMENT
+    value: dev
+  - name: RERAND_S3_BUCKET
+    value: ampc-hnsw-continuous-rerand-store-node-0-dev
+  - name: CHUNK_SIZE
+    value: "2000"
+  - name: CHUNK_DELAY_SECS
+    value: "1"
+  - name: SAFETY_BUFFER_IDS
     value: "0"
-  - name: PUBLIC_KEY_BASE_URL
-    value: "https://pki-smpc.worldcoin.org/iris-mpc-tripartite-ecdh-public-key-party"
-  - name: PUBLIC_KEY_BUCKET_NAME
-    value: "wf-smpcv2-prod-public-keys"
-
-initContainer:
-  enabled: true
-  name: "rerandomize-db-key-gen"
-  image: "ghcr.io/worldcoin/rerandomization-protocol:v0.24.1@sha256:cdd05609f25a7f74a6dc75f07608d2217f6152351bbf9f69dab6651baff57b04"
-  imagePullPolicy: IfNotPresent
-  # add sleep because deployements are not synched. A party could
-  # end up reading an outdated public key if the key generation is not finished
-  command:
-    - /bin/sh
-    - -c
-    - rerandomize-db key-gen && sleep 30
-  env:
-    - name: ENVIRONMENT
-      value: prod
-    - name: PARTY_ID
-      value: "0"
-    - name: PUBLIC_KEY_BUCKET_NAME
-      value: "wf-smpcv2-prod-public-keys"
+  - name: S3_POLL_INTERVAL_MS
+    value: "2000"
diff --git a/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
new file mode 100644
index 0000000000..135431f82e
--- /dev/null
+++ b/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
@@ -0,0 +1,26 @@
+env:
+  - name: SERVICE_NAME
+    value: iris-mpc-db-continuous-rerandomization-1
+  - name: AWS_REGION
+    value: eu-north-1
+  - name: PARTY_ID
+    value: "1"
+  - name: DB_URL
+    valueFrom:
+      secretKeyRef:
+        key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
+        name: application
+  - name: SCHEMA_NAME
+    value: SMPC_minfhd_dev_1
+  - name: ENVIRONMENT
+    value: dev
+  - name: RERAND_S3_BUCKET
+    value: ampc-hnsw-continuous-rerand-store-node-1-dev
+  - name: CHUNK_SIZE
+    value: "2000"
+  - name: CHUNK_DELAY_SECS
+    value: "1"
+  - name: SAFETY_BUFFER_IDS
+    value: "0"
+  - name: S3_POLL_INTERVAL_MS
+    value: "2000"
diff --git a/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
new file mode 100644
index 0000000000..f1c2a12b2d
--- /dev/null
+++ b/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
@@ -0,0 +1,26 @@
+env:
+  - name: SERVICE_NAME
+    value: iris-mpc-db-continuous-rerandomization-2
+  - name: AWS_REGION
+    value: eu-north-1
+  - name: PARTY_ID
+    value: "2"
+  - name: DB_URL
+    valueFrom:
+      secretKeyRef:
+        key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
+        name: application
+  - name: SCHEMA_NAME
+    value: SMPC_minfhd_dev_2
+  - name: ENVIRONMENT
+    value: dev
+  - name: RERAND_S3_BUCKET
+    value: ampc-hnsw-continuous-rerand-store-node-2-dev
+  - name: CHUNK_SIZE
+    value: "2000"
+  - name: CHUNK_DELAY_SECS
+    value: "1"
+  - name: SAFETY_BUFFER_IDS
+    value: "0"
+  - name: S3_POLL_INTERVAL_MS
+    value: "2000"

From ab259f93577aa61acd5ef129797d0fc61c284b4b Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 20 Mar 2026 17:47:10 +0100
Subject: [PATCH 30/76] unify naming

---
 deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml | 2 +-
 deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml | 2 +-
 deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml | 2 +-
 deploy/dev/common-values-continuous-rerand.yaml          | 5 +----
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
index 25fcd9b161..513d00fc51 100644
--- a/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
+++ b/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: iris-mpc-db-continuous-rerandomization-0
+    value: ampc-hnsw-continuous-rerand-0
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
diff --git a/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
index 135431f82e..1068f797b5 100644
--- a/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
+++ b/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: iris-mpc-db-continuous-rerandomization-1
+    value: ampc-hnsw-continuous-rerand-1
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
diff --git a/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
index f1c2a12b2d..00264d639f 100644
--- a/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
+++ b/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: iris-mpc-db-continuous-rerandomization-2
+    value: ampc-hnsw-continuous-rerand-2
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
diff --git a/deploy/dev/common-values-continuous-rerand.yaml b/deploy/dev/common-values-continuous-rerand.yaml
index 1fb5e9ca4a..211c770b94 100644
--- a/deploy/dev/common-values-continuous-rerand.yaml
+++ b/deploy/dev/common-values-continuous-rerand.yaml
@@ -3,9 +3,6 @@ replicaCount: 1
 
 environment: prod
 
-# this is needed to prevent the job from restarting if it fails
-backoffLimit: 0
-
 command: ["/bin/rerandomize-db"]
 args:
   - "rerandomize-continuous"
@@ -16,7 +13,7 @@ args:
 
 serviceAccount:
   create: true
-  name: "iris-mpc-db-rerandomization"
+  name: "ampc-hnsw-continuous-rerand"
 
 ports:
   - containerPort: 3000

From 13c21214f3deb685c720fec059cec1991d8a67af Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 20 Mar 2026 17:47:55 +0100
Subject: [PATCH 31/76] start running on a non-main schema

---
 deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml | 2 +-
 deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml | 2 +-
 deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
index 513d00fc51..6c01c6341a 100644
--- a/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
+++ b/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_minfhd_dev_0
+    value: SMPC_dev_0
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET
diff --git a/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
index 1068f797b5..81e5ccc501 100644
--- a/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
+++ b/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_minfhd_dev_1
+    value: SMPC_dev_1
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET
diff --git a/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml b/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
index 00264d639f..89e2b0ca9c 100644
--- a/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
+++ b/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_minfhd_dev_2
+    value: SMPC_dev_2
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET

From 661343cb2b0e8dd14927aea17885d338a6a56d37 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Mon, 23 Mar 2026 12:47:18 +0100
Subject: [PATCH 32/76] first start in iris-mpc stage, not ampc dev

---
 deploy/{dev => stage}/common-values-continuous-rerand.yaml      | 2 +-
 .../smpcv2-0-stage}/values-continuous-rerand.yaml               | 0
 .../smpcv2-1-stage}/values-continuous-rerand.yaml               | 0
 .../smpcv2-2-stage}/values-continuous-rerand.yaml               | 0
 4 files changed, 1 insertion(+), 1 deletion(-)
 rename deploy/{dev => stage}/common-values-continuous-rerand.yaml (97%)
 rename deploy/{dev/ampc-hnsw-0-dev => stage/smpcv2-0-stage}/values-continuous-rerand.yaml (100%)
 rename deploy/{dev/ampc-hnsw-1-dev => stage/smpcv2-1-stage}/values-continuous-rerand.yaml (100%)
 rename deploy/{dev/ampc-hnsw-2-dev => stage/smpcv2-2-stage}/values-continuous-rerand.yaml (100%)

diff --git a/deploy/dev/common-values-continuous-rerand.yaml b/deploy/stage/common-values-continuous-rerand.yaml
similarity index 97%
rename from deploy/dev/common-values-continuous-rerand.yaml
rename to deploy/stage/common-values-continuous-rerand.yaml
index 211c770b94..122750f593 100644
--- a/deploy/dev/common-values-continuous-rerand.yaml
+++ b/deploy/stage/common-values-continuous-rerand.yaml
@@ -1,7 +1,7 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:9ada456f64ffff48a2bbb3634859e00c7636faae"
 replicaCount: 1
 
-environment: prod
+environment: stage
 
 command: ["/bin/rerandomize-db"]
 args:
diff --git a/deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml b/deploy/stage/smpcv2-0-stage/values-continuous-rerand.yaml
similarity index 100%
rename from deploy/dev/ampc-hnsw-0-dev/values-continuous-rerand.yaml
rename to deploy/stage/smpcv2-0-stage/values-continuous-rerand.yaml
diff --git a/deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml b/deploy/stage/smpcv2-1-stage/values-continuous-rerand.yaml
similarity index 100%
rename from deploy/dev/ampc-hnsw-1-dev/values-continuous-rerand.yaml
rename to deploy/stage/smpcv2-1-stage/values-continuous-rerand.yaml
diff --git a/deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml b/deploy/stage/smpcv2-2-stage/values-continuous-rerand.yaml
similarity index 100%
rename from deploy/dev/ampc-hnsw-2-dev/values-continuous-rerand.yaml
rename to deploy/stage/smpcv2-2-stage/values-continuous-rerand.yaml

From 0248649cade82841e481fa43bc5f206f5311e6cc Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 11:51:08 +0100
Subject: [PATCH 33/76] use full service name for smpc stage deploy

---
 ...ommon-values-iris-mpc-continuous-rerandomisation.yaml} | 8 ++------
 ...ml => values-iris-mpc-continuous-rerandomisation.yaml} | 6 +++---
 ...ml => values-iris-mpc-continuous-rerandomisation.yaml} | 6 +++---
 ...ml => values-iris-mpc-continuous-rerandomisation.yaml} | 6 +++---
 4 files changed, 11 insertions(+), 15 deletions(-)
 rename deploy/stage/{common-values-continuous-rerand.yaml => common-values-iris-mpc-continuous-rerandomisation.yaml} (76%)
 rename deploy/stage/smpcv2-0-stage/{values-continuous-rerand.yaml => values-iris-mpc-continuous-rerandomisation.yaml} (75%)
 rename deploy/stage/smpcv2-1-stage/{values-continuous-rerand.yaml => values-iris-mpc-continuous-rerandomisation.yaml} (75%)
 rename deploy/stage/smpcv2-2-stage/{values-continuous-rerand.yaml => values-iris-mpc-continuous-rerandomisation.yaml} (75%)

diff --git a/deploy/stage/common-values-continuous-rerand.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomisation.yaml
similarity index 76%
rename from deploy/stage/common-values-continuous-rerand.yaml
rename to deploy/stage/common-values-iris-mpc-continuous-rerandomisation.yaml
index 122750f593..556d2732af 100644
--- a/deploy/stage/common-values-continuous-rerand.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomisation.yaml
@@ -6,14 +6,10 @@ environment: stage
 command: ["/bin/rerandomize-db"]
 args:
   - "rerandomize-continuous"
-  - "--range-min=$(RANGE_MIN)"
-  - "--range-max-inclusive=$(RANGE_MAX_INCLUSIVE)"
-  - "--chunk-size=$(CHUNK_SIZE)"
-  - "--num-tasks=$(NUM_TASKS)"
 
 serviceAccount:
   create: true
-  name: "ampc-hnsw-continuous-rerand"
+  name: "iris-mpc-continuous-rerand"
 
 ports:
   - containerPort: 3000
@@ -43,7 +39,7 @@ nodeSelector:
 tolerations:
   - key: "dedicated"
     operator: "Equal"
-    value: "dbRerandomization"
+    value: "continuousDbRerandomization"
     effect: "NoSchedule"
 
 concurrencyPolicy: Replace
diff --git a/deploy/stage/smpcv2-0-stage/values-continuous-rerand.yaml b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomisation.yaml
similarity index 75%
rename from deploy/stage/smpcv2-0-stage/values-continuous-rerand.yaml
rename to deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomisation.yaml
index 6c01c6341a..3cacae3776 100644
--- a/deploy/stage/smpcv2-0-stage/values-continuous-rerand.yaml
+++ b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomisation.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: ampc-hnsw-continuous-rerand-0
+    value: iris-mpc-continuous-rerandomisation-0
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
@@ -8,14 +8,14 @@ env:
   - name: DB_URL
     valueFrom:
       secretKeyRef:
-        key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
+        key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
     value: SMPC_dev_0
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET
-    value: ampc-hnsw-continuous-rerand-store-node-0-dev
+    value: wf-smpcv2-stage-continuous-rerandomisation
   - name: CHUNK_SIZE
     value: "2000"
   - name: CHUNK_DELAY_SECS
diff --git a/deploy/stage/smpcv2-1-stage/values-continuous-rerand.yaml b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomisation.yaml
similarity index 75%
rename from deploy/stage/smpcv2-1-stage/values-continuous-rerand.yaml
rename to deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomisation.yaml
index 81e5ccc501..96dc46aa29 100644
--- a/deploy/stage/smpcv2-1-stage/values-continuous-rerand.yaml
+++ b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomisation.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: ampc-hnsw-continuous-rerand-1
+    value: iris-mpc-continuous-rerandomisation-1
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
@@ -8,14 +8,14 @@ env:
   - name: DB_URL
     valueFrom:
       secretKeyRef:
-        key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
+        key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
     value: SMPC_dev_1
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET
-    value: ampc-hnsw-continuous-rerand-store-node-1-dev
+    value: wf-smpcv2-stage-continuous-rerandomisation
   - name: CHUNK_SIZE
     value: "2000"
   - name: CHUNK_DELAY_SECS
diff --git a/deploy/stage/smpcv2-2-stage/values-continuous-rerand.yaml b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomisation.yaml
similarity index 75%
rename from deploy/stage/smpcv2-2-stage/values-continuous-rerand.yaml
rename to deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomisation.yaml
index 89e2b0ca9c..6bc072836f 100644
--- a/deploy/stage/smpcv2-2-stage/values-continuous-rerand.yaml
+++ b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomisation.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: ampc-hnsw-continuous-rerand-2
+    value: iris-mpc-continuous-rerandomisation-2
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
@@ -8,14 +8,14 @@ env:
   - name: DB_URL
     valueFrom:
       secretKeyRef:
-        key: DATABASE_AURORA_HNSW_FROM_SNAPSHOT_URL
+        key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
     value: SMPC_dev_2
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET
-    value: ampc-hnsw-continuous-rerand-store-node-2-dev
+    value: wf-smpcv2-stage-continuous-rerandomisation
   - name: CHUNK_SIZE
     value: "2000"
   - name: CHUNK_DELAY_SECS

From 1cad16f744a0da22e5bf1c8b9114174294826fa4 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 12:33:51 +0100
Subject: [PATCH 34/76] unify naming

---
 ...=> common-values-iris-mpc-continuous-rerandomization.yaml} | 0
 ...n.yaml => values-iris-mpc-continuous-rerandomization.yaml} | 4 ++--
 ...n.yaml => values-iris-mpc-continuous-rerandomization.yaml} | 4 ++--
 ...n.yaml => values-iris-mpc-continuous-rerandomization.yaml} | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)
 rename deploy/stage/{common-values-iris-mpc-continuous-rerandomisation.yaml => common-values-iris-mpc-continuous-rerandomization.yaml} (100%)
 rename deploy/stage/smpcv2-0-stage/{values-iris-mpc-continuous-rerandomisation.yaml => values-iris-mpc-continuous-rerandomization.yaml} (82%)
 rename deploy/stage/smpcv2-1-stage/{values-iris-mpc-continuous-rerandomisation.yaml => values-iris-mpc-continuous-rerandomization.yaml} (82%)
 rename deploy/stage/smpcv2-2-stage/{values-iris-mpc-continuous-rerandomisation.yaml => values-iris-mpc-continuous-rerandomization.yaml} (82%)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomisation.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
similarity index 100%
rename from deploy/stage/common-values-iris-mpc-continuous-rerandomisation.yaml
rename to deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
diff --git a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomisation.yaml b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
similarity index 82%
rename from deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomisation.yaml
rename to deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
index 3cacae3776..257c73f1d9 100644
--- a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomisation.yaml
+++ b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: iris-mpc-continuous-rerandomisation-0
+    value: iris-mpc-continuous-rerandomization-0
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
@@ -15,7 +15,7 @@ env:
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET
-    value: wf-smpcv2-stage-continuous-rerandomisation
+    value: wf-smpcv2-stage-continuous-rerandomization
   - name: CHUNK_SIZE
     value: "2000"
   - name: CHUNK_DELAY_SECS
diff --git a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomisation.yaml b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
similarity index 82%
rename from deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomisation.yaml
rename to deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
index 96dc46aa29..04e06ddb3e 100644
--- a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomisation.yaml
+++ b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: iris-mpc-continuous-rerandomisation-1
+    value: iris-mpc-continuous-rerandomization-1
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
@@ -15,7 +15,7 @@ env:
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET
-    value: wf-smpcv2-stage-continuous-rerandomisation
+    value: wf-smpcv2-stage-continuous-rerandomization
   - name: CHUNK_SIZE
     value: "2000"
   - name: CHUNK_DELAY_SECS
diff --git a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomisation.yaml b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
similarity index 82%
rename from deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomisation.yaml
rename to deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
index 6bc072836f..1b0b497f51 100644
--- a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomisation.yaml
+++ b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -1,6 +1,6 @@
 env:
   - name: SERVICE_NAME
-    value: iris-mpc-continuous-rerandomisation-2
+    value: iris-mpc-continuous-rerandomization-2
   - name: AWS_REGION
     value: eu-north-1
   - name: PARTY_ID
@@ -15,7 +15,7 @@ env:
   - name: ENVIRONMENT
     value: dev
   - name: RERAND_S3_BUCKET
-    value: wf-smpcv2-stage-continuous-rerandomisation
+    value: wf-smpcv2-stage-continuous-rerandomization
   - name: CHUNK_SIZE
     value: "2000"
   - name: CHUNK_DELAY_SECS

From bf6847208911c2aefb0bbfb58a913f578f9c170e Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 12:34:10 +0100
Subject: [PATCH 35/76] make sure to use secrets prefixed with service name

---
 iris-mpc-upgrade/src/config.rs            |  3 ++
 iris-mpc-upgrade/src/continuous_rerand.rs |  2 ++
 iris-mpc-upgrade/src/epoch.rs             | 42 ++++++++++++++++-------
 iris-mpc-upgrade/tests/test_utils.rs      |  1 +
 4 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/iris-mpc-upgrade/src/config.rs b/iris-mpc-upgrade/src/config.rs
index 82ad7b347d..91e7320d25 100644
--- a/iris-mpc-upgrade/src/config.rs
+++ b/iris-mpc-upgrade/src/config.rs
@@ -354,6 +354,9 @@ pub struct RerandomizeContinuousConfig {
     #[clap(long, env = "ENVIRONMENT")]
     pub env: String,
 
+    #[clap(long, env = "SMPC__SERVICE__SERVICE_NAME")]
+    pub service_name: String,
+
     #[clap(long, env = "RERAND_S3_BUCKET")]
     pub s3_bucket: String,
 
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index 131e3ed847..c857904b58 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -73,6 +73,7 @@ pub async fn run_continuous_rerand(
             s3,
             &config.s3_bucket,
             &config.env,
+            &config.service_name,
             active_epoch,
             config.party_id,
             poll_interval,
@@ -290,6 +291,7 @@ pub async fn run_continuous_rerand(
             s3,
             &config.s3_bucket,
             &config.env,
+            &config.service_name,
             active_epoch,
             config.party_id,
             poll_interval,
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index 4d4e090542..6ed1ea4394 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -8,10 +8,20 @@ use std::time::Duration;
 use crate::s3_coordination;
 use crate::tripartite_dh;
 
-fn secret_id(env: &str, epoch: u32, party_id: u8) -> String {
+fn service_prefix(service_name: &str) -> &str {
+    service_name
+        .rsplit_once('-')
+        .map(|(prefix, _)| prefix)
+        .unwrap_or(service_name)
+}
+
+fn secret_id(env: &str, service_name: &str, epoch: u32, party_id: u8) -> String {
     format!(
-        "{}/iris-mpc-db-rerandomization/epoch-{}/private-key-party-{}",
-        env, epoch, party_id
+        "{}/{}-continuous-rerandomisation/epoch-{}/private-key-party-{}",
+        env,
+        service_prefix(service_name),
+        epoch,
+        party_id
     )
 }
 
@@ -19,10 +29,11 @@ fn secret_id(env: &str, epoch: u32, party_id: u8) -> String {
 async fn load_private_key_from_sm(
     sm: &SecretsManagerClient,
     env: &str,
+    service_name: &str,
     epoch: u32,
     party_id: u8,
 ) -> Result<Option<tripartite_dh::PrivateKey>> {
-    let sid = secret_id(env, epoch, party_id);
+    let sid = secret_id(env, service_name, epoch, party_id);
     match sm
         .get_secret_value()
         .secret_id(&sid)
@@ -53,11 +64,12 @@ async fn load_private_key_from_sm(
 async fn save_private_key_to_sm(
     sm: &SecretsManagerClient,
     env: &str,
+    service_name: &str,
     epoch: u32,
     party_id: u8,
     key: &tripartite_dh::PrivateKey,
 ) -> Result<bool> {
-    let sid = secret_id(env, epoch, party_id);
+    let sid = secret_id(env, service_name, epoch, party_id);
     let b64 = STANDARD.encode(key.serialize());
 
     match sm
@@ -82,10 +94,11 @@ async fn save_private_key_to_sm(
 async fn delete_private_key_from_sm(
     sm: &SecretsManagerClient,
     env: &str,
+    service_name: &str,
     epoch: u32,
     party_id: u8,
 ) -> Result<()> {
-    let sid = secret_id(env, epoch, party_id);
+    let sid = secret_id(env, service_name, epoch, party_id);
     sm.delete_secret()
         .secret_id(&sid)
         .force_delete_without_recovery(true)
@@ -108,16 +121,17 @@ pub async fn idempotent_keygen(
     s3: &S3Client,
     bucket: &str,
     env: &str,
+    service_name: &str,
     epoch: u32,
     party_id: u8,
 ) -> Result<tripartite_dh::PrivateKey> {
     if epoch > 0 {
-        if let Err(e) = delete_private_key_from_sm(sm, env, epoch - 1, party_id).await {
+        if let Err(e) = delete_private_key_from_sm(sm, env, service_name, epoch - 1, party_id).await {
             tracing::debug!("Cleanup of epoch {} key (best-effort): {}", epoch - 1, e);
         }
     }
 
-    if let Some(existing) = load_private_key_from_sm(sm, env, epoch, party_id).await? {
+    if let Some(existing) = load_private_key_from_sm(sm, env, service_name, epoch, party_id).await? {
         tracing::info!(
             "Epoch {}: private key found in SM, re-uploading public key to S3",
             epoch
@@ -136,7 +150,7 @@ pub async fn idempotent_keygen(
     let mut rng = rand::rngs::OsRng;
     let private_key = tripartite_dh::PrivateKey::random(&mut rng);
 
-    let saved = save_private_key_to_sm(sm, env, epoch, party_id, &private_key).await?;
+    let saved = save_private_key_to_sm(sm, env, service_name, epoch, party_id, &private_key).await?;
     let private_key = if saved {
         private_key
     } else {
@@ -146,12 +160,12 @@ pub async fn idempotent_keygen(
             "Epoch {}: private key already exists in SM (likely concurrent start); reloading it",
             epoch
         );
-        load_private_key_from_sm(sm, env, epoch, party_id)
+        load_private_key_from_sm(sm, env, service_name, epoch, party_id)
             .await?
             .ok_or_else(|| {
                 eyre!(
                     "Secret existed but could not be loaded: {}",
-                    secret_id(env, epoch, party_id)
+                    secret_id(env, service_name, epoch, party_id)
                 )
             })?
     };
@@ -169,11 +183,12 @@ pub async fn derive_shared_secret(
     s3: &S3Client,
     bucket: &str,
     env: &str,
+    service_name: &str,
     epoch: u32,
     party_id: u8,
     poll_interval: Duration,
 ) -> Result<[u8; 32]> {
-    let private_key = idempotent_keygen(sm, s3, bucket, env, epoch, party_id).await?;
+    let private_key = idempotent_keygen(sm, s3, bucket, env, service_name, epoch, party_id).await?;
 
     let next_id = (party_id + 1) % 3;
     let prev_id = (party_id + 2) % 3;
@@ -238,6 +253,7 @@ pub async fn complete_epoch(
     s3: &S3Client,
     bucket: &str,
     env: &str,
+    service_name: &str,
     epoch: u32,
     party_id: u8,
     poll_interval: Duration,
@@ -252,6 +268,6 @@ pub async fn complete_epoch(
     s3_coordination::poll_epoch_complete_all(s3, bucket, epoch, poll_interval).await?;
     tracing::info!("Epoch {}: all parties completed", epoch);
 
-    delete_private_key_from_sm(sm, env, epoch, party_id).await?;
+    delete_private_key_from_sm(sm, env, service_name, epoch, party_id).await?;
     Ok(())
 }
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index 95afa053ee..7e6fc45f68 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -139,6 +139,7 @@ impl TestEnv {
                 6200 + party_id as u16
             ),
             env: "testing".to_string(),
+            service_name: format!("iris-mpc-{}", party_id),
             s3_bucket: self.bucket.clone(),
             schema_name: format!("{}_{}", self.prefix, party_id),
             chunk_size: CHUNK_SIZE,

From 9e195ca949a0651f3ba6ec27d79a1d0893c68be1 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 13:32:36 +0100
Subject: [PATCH 36/76] correct service account name

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 556d2732af..f01e88a819 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -9,7 +9,7 @@ args:
 
 serviceAccount:
   create: true
-  name: "iris-mpc-continuous-rerand"
+  name: "iris-mpc-continuous-rerandomization"
 
 ports:
   - containerPort: 3000

From f3f5e512445ea9fcce6f73ed8fa3b787369f5f6e Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 13:34:02 +0100
Subject: [PATCH 37/76] correct node selector

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index f01e88a819..777d6c10a1 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -34,7 +34,7 @@ resources:
 
 nodeSelector:
   kubernetes.io/arch: amd64
-  workload: "rerandomization"
+  workload: "continuous_rerandomization"
 
 tolerations:
   - key: "dedicated"

From c958cad5854779d8e8f18ba45a0af79c74a0f90a Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 13:37:32 +0100
Subject: [PATCH 38/76] proper github-secret notation

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml     | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 777d6c10a1..4a37daaf29 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -21,8 +21,7 @@ podSecurityContext:
   runAsGroup: 65534
 
 imagePullSecrets:
-  enabled: true
-  secretName: github-secret
+  - name: github-secret
 
 resources:
   limits:

From 8debfea27f66c6196b7a10e5636646b5a19dd249 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 17:32:48 +0100
Subject: [PATCH 39/76] debug issue with s3

---
 .../bin/iris-mpc-upgrade/rerandomize_db.rs    | 11 +++++++
 iris-mpc-upgrade/src/s3_coordination.rs       | 32 +++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
index da522dfa56..b26d085f00 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
@@ -545,11 +545,22 @@ async fn rerandomize_continuous_main(config: RerandomizeContinuousConfig) -> Res
     background_tasks.check_tasks();
 
     let sdk_config = aws_config::from_env().load().await;
+    tracing::info!(
+        region = ?sdk_config.region(),
+        "AWS SDK config loaded"
+    );
     let s3_config = aws_sdk_s3::config::Builder::from(&sdk_config);
     let sm_config = aws_sdk_secretsmanager::config::Builder::from(&sdk_config);
     let s3_client = S3Client::from_conf(s3_config.build());
     let sm_client = SecretsManagerClient::from_conf(sm_config.build());
 
+    tracing::info!(
+        s3_bucket = %config.s3_bucket,
+        party_id = config.party_id,
+        environment = %config.env,
+        "Continuous rerand starting with config"
+    );
+
     let postgres_client =
         PostgresClient::new(&config.db_url, &config.schema_name, AccessMode::ReadWrite).await?;
     let store = Store::new(&postgres_client).await?;
diff --git a/iris-mpc-upgrade/src/s3_coordination.rs b/iris-mpc-upgrade/src/s3_coordination.rs
index 305193ad1f..ad3e0de930 100644
--- a/iris-mpc-upgrade/src/s3_coordination.rs
+++ b/iris-mpc-upgrade/src/s3_coordination.rs
@@ -1,3 +1,4 @@
+use aws_sdk_s3::error::ProvideErrorMetadata;
 use aws_sdk_s3::Client as S3Client;
 use eyre::{eyre, Result};
 use futures::future::try_join_all;
@@ -49,14 +50,41 @@ pub async fn upload_marker(s3: &S3Client, bucket: &str, key: &str, body: Vec<u8>
 }
 
 pub async fn marker_exists(s3: &S3Client, bucket: &str, key: &str) -> Result<bool> {
+    tracing::debug!(bucket = bucket, key = key, "S3 HeadObject request");
     match s3.head_object().bucket(bucket).key(key).send().await {
-        Ok(_) => Ok(true),
+        Ok(_) => {
+            tracing::debug!(key = key, "S3 HeadObject: exists");
+            Ok(true)
+        }
         Err(e) => {
+            let status = e.raw_response().map(|r| r.status().as_u16());
+            let err_display = format!("{e}");
+            let debug_display = format!("{e:?}");
             let svc_err = e.into_service_error();
+            let code = svc_err.code().unwrap_or("<none>");
+            let message = svc_err.message().unwrap_or("<none>");
+            tracing::warn!(
+                bucket = bucket,
+                key = key,
+                http_status = ?status,
+                is_not_found = svc_err.is_not_found(),
+                error_code = code,
+                error_message = message,
+                error_display = %err_display,
+                error_debug = %debug_display,
+                "S3 HeadObject error details"
+            );
             if svc_err.is_not_found() {
                 Ok(false)
             } else {
-                Err(eyre!("S3 HeadObject failed for key {}: {}", key, svc_err))
+                Err(eyre!(
+                    "S3 HeadObject failed for key {} in bucket {}: status={:?} code={} message={}",
+                    key,
+                    bucket,
+                    status,
+                    code,
+                    message,
+                ))
             }
         }
     }

From 71e7b94726273470510e7f7e5e8ac72162aa93a7 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 17:34:14 +0100
Subject: [PATCH 40/76] fmt

---
 iris-mpc-upgrade/src/epoch.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index 6ed1ea4394..7ca09f3b0a 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -126,12 +126,14 @@ pub async fn idempotent_keygen(
     party_id: u8,
 ) -> Result<tripartite_dh::PrivateKey> {
     if epoch > 0 {
-        if let Err(e) = delete_private_key_from_sm(sm, env, service_name, epoch - 1, party_id).await {
+        if let Err(e) = delete_private_key_from_sm(sm, env, service_name, epoch - 1, party_id).await
+        {
             tracing::debug!("Cleanup of epoch {} key (best-effort): {}", epoch - 1, e);
         }
     }
 
-    if let Some(existing) = load_private_key_from_sm(sm, env, service_name, epoch, party_id).await? {
+    if let Some(existing) = load_private_key_from_sm(sm, env, service_name, epoch, party_id).await?
+    {
         tracing::info!(
             "Epoch {}: private key found in SM, re-uploading public key to S3",
             epoch
@@ -150,7 +152,8 @@ pub async fn idempotent_keygen(
     let mut rng = rand::rngs::OsRng;
     let private_key = tripartite_dh::PrivateKey::random(&mut rng);
 
-    let saved = save_private_key_to_sm(sm, env, service_name, epoch, party_id, &private_key).await?;
+    let saved =
+        save_private_key_to_sm(sm, env, service_name, epoch, party_id, &private_key).await?;
     let private_key = if saved {
         private_key
     } else {

From cb27d6aa39cc808a88ffdbfbca5b079b61f51264 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 17:42:56 +0100
Subject: [PATCH 41/76] new SHA + clippy fix

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 iris-mpc-upgrade/src/epoch.rs                                   | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 4a37daaf29..0e46b1595a 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,4 +1,4 @@
-image: "ghcr.io/worldcoin/rerandomization-protocol:9ada456f64ffff48a2bbb3634859e00c7636faae"
+image: "ghcr.io/worldcoin/rerandomization-protocol:71e7b94726273470510e7f7e5e8ac72162aa93a7"
 replicaCount: 1
 
 environment: stage
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index 7ca09f3b0a..ab1b90a80e 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -181,6 +181,7 @@ pub async fn idempotent_keygen(
 }
 
 /// Derive the shared secret for an epoch: keygen + download peer keys + BLS pairing.
+#[allow(clippy::too_many_arguments)]
 pub async fn derive_shared_secret(
     sm: &SecretsManagerClient,
     s3: &S3Client,
@@ -251,6 +252,7 @@ pub async fn determine_active_epoch(s3: &S3Client, bucket: &str, start_hint: u32
 }
 
 /// Upload completion marker, poll for all three, then delete the epoch key from SM.
+#[allow(clippy::too_many_arguments)]
 pub async fn complete_epoch(
     sm: &SecretsManagerClient,
     s3: &S3Client,

From 01d4f9d91f5c65682d8045c50e1dbb9b210d37ce Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 17:57:30 +0100
Subject: [PATCH 42/76] new sha

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 0e46b1595a..99ab16fde7 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,4 +1,4 @@
-image: "ghcr.io/worldcoin/rerandomization-protocol:71e7b94726273470510e7f7e5e8ac72162aa93a7"
+image: "ghcr.io/worldcoin/rerandomization-protocol:cb27d6aa39cc808a88ffdbfbca5b079b61f51264"
 replicaCount: 1
 
 environment: stage

From 0bbe0c379e32ba7312d357ac56f295abcd96da08 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 18:04:59 +0100
Subject: [PATCH 43/76] sha + correct secret prefix

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml     | 3 +++
 .../values-iris-mpc-continuous-rerandomization.yaml            | 2 +-
 .../values-iris-mpc-continuous-rerandomization.yaml            | 2 +-
 .../values-iris-mpc-continuous-rerandomization.yaml            | 2 +-
 iris-mpc-upgrade/src/epoch.rs                                  | 2 +-
 5 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 99ab16fde7..ea61b6b2f0 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -7,6 +7,9 @@ command: ["/bin/rerandomize-db"]
 args:
   - "rerandomize-continuous"
 
+strategy:
+  type: Recreate
+
 serviceAccount:
   create: true
   name: "iris-mpc-continuous-rerandomization"
diff --git a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
index 257c73f1d9..3c1bce2664 100644
--- a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 env:
-  - name: SERVICE_NAME
+  - name: SMPC__SERVICE__SERVICE_NAME
     value: iris-mpc-continuous-rerandomization-0
   - name: AWS_REGION
     value: eu-north-1
diff --git a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
index 04e06ddb3e..a757d37e28 100644
--- a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 env:
-  - name: SERVICE_NAME
+  - name: SMPC__SERVICE__SERVICE_NAME
     value: iris-mpc-continuous-rerandomization-1
   - name: AWS_REGION
     value: eu-north-1
diff --git a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
index 1b0b497f51..436f33162e 100644
--- a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 env:
-  - name: SERVICE_NAME
+  - name: SMPC__SERVICE__SERVICE_NAME
     value: iris-mpc-continuous-rerandomization-2
   - name: AWS_REGION
     value: eu-north-1
diff --git a/iris-mpc-upgrade/src/epoch.rs b/iris-mpc-upgrade/src/epoch.rs
index ab1b90a80e..6600c4fa1d 100644
--- a/iris-mpc-upgrade/src/epoch.rs
+++ b/iris-mpc-upgrade/src/epoch.rs
@@ -17,7 +17,7 @@ fn service_prefix(service_name: &str) -> &str {
 
 fn secret_id(env: &str, service_name: &str, epoch: u32, party_id: u8) -> String {
     format!(
-        "{}/{}-continuous-rerandomisation/epoch-{}/private-key-party-{}",
+        "{}/{}/epoch-{}/private-key-party-{}",
         env,
         service_prefix(service_name),
         epoch,

From d794dba4f0c7e9e2048bb0b557e16f659b082283 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 18:55:22 +0100
Subject: [PATCH 44/76] sha + params

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml    | 2 +-
 .../values-iris-mpc-continuous-rerandomization.yaml           | 4 ++--
 .../values-iris-mpc-continuous-rerandomization.yaml           | 4 ++--
 .../values-iris-mpc-continuous-rerandomization.yaml           | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index ea61b6b2f0..9550d4e7da 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,4 +1,4 @@
-image: "ghcr.io/worldcoin/rerandomization-protocol:cb27d6aa39cc808a88ffdbfbca5b079b61f51264"
+image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
 replicaCount: 1
 
 environment: stage
diff --git a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
index 3c1bce2664..fb6c453912 100644
--- a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,9 +11,9 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_dev_0
+    value: SMPC_stage_0
   - name: ENVIRONMENT
-    value: dev
+    value: stage
   - name: RERAND_S3_BUCKET
     value: wf-smpcv2-stage-continuous-rerandomization
   - name: CHUNK_SIZE
diff --git a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
index a757d37e28..cc10d62daf 100644
--- a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,9 +11,9 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_dev_1
+    value: SMPC_stage_1
   - name: ENVIRONMENT
-    value: dev
+    value: stage
   - name: RERAND_S3_BUCKET
     value: wf-smpcv2-stage-continuous-rerandomization
   - name: CHUNK_SIZE
diff --git a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
index 436f33162e..facd60e02c 100644
--- a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,9 +11,9 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_dev_2
+    value: SMPC_stage_2
   - name: ENVIRONMENT
-    value: dev
+    value: stage
   - name: RERAND_S3_BUCKET
     value: wf-smpcv2-stage-continuous-rerandomization
   - name: CHUNK_SIZE

From 5b9902baf516f4542d2df6dd632427da93313295 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Wed, 25 Mar 2026 18:55:34 +0100
Subject: [PATCH 45/76] scale down for the night

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 9550d4e7da..746d49f176 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
-replicaCount: 1
+replicaCount: 0
 
 environment: stage
 

From f0489931ba1abce015579f8dc6153a7ab88612e9 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Thu, 26 Mar 2026 09:14:14 +0100
Subject: [PATCH 46/76] scale up

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 746d49f176..9550d4e7da 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
-replicaCount: 0
+replicaCount: 1
 
 environment: stage
 

From 8d3e011932f1d456b1e93a611fb7949c128193da Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Thu, 26 Mar 2026 11:49:20 +0100
Subject: [PATCH 47/76] set health endpoint, enable datadog

---
 ...s-iris-mpc-continuous-rerandomization.yaml | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 9550d4e7da..1c686d80b1 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -14,11 +14,31 @@ serviceAccount:
   create: true
   name: "iris-mpc-continuous-rerandomization"
 
+datadog:
+  enabled: true
+
 ports:
   - containerPort: 3000
     name: health
     protocol: TCP
 
+startupProbe:
+  httpGet:
+    path: /health
+    port: health
+
+livenessProbe:
+  httpGet:
+    path: /health
+    port: health
+
+readinessProbe:
+  periodSeconds: 20
+  failureThreshold: 4
+  httpGet:
+    path: /health
+    port: health
+
 podSecurityContext:
   runAsUser: 65534
   runAsGroup: 65534

From 7b93ea1cd887f1ba6cac2f46e31ca9db70326216 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Thu, 26 Mar 2026 13:13:09 +0100
Subject: [PATCH 48/76] up

---
 iris-mpc-common/src/helpers/smpc_request.rs | 3 +++
 iris-mpc/src/services/processors/batch.rs   | 7 ++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/iris-mpc-common/src/helpers/smpc_request.rs b/iris-mpc-common/src/helpers/smpc_request.rs
index 21fc4e1cd6..794b05262b 100644
--- a/iris-mpc-common/src/helpers/smpc_request.rs
+++ b/iris-mpc-common/src/helpers/smpc_request.rs
@@ -188,6 +188,9 @@ pub enum ReceiveRequestError {
     BatchPollingTimeout(i32),
     #[error("Failed to parse shares: {0}")]
     FailedToProcessIrisShares(Report),
+
+    #[error("Failed to mark request as deleted: {0}")]
+    FailedToMarkRequestAsDeleted(Report),
 }
 
 impl From<SdkError<ReceiveMessageError>> for ReceiveRequestError {
diff --git a/iris-mpc/src/services/processors/batch.rs b/iris-mpc/src/services/processors/batch.rs
index 69ea82af23..b0bf8d4f27 100644
--- a/iris-mpc/src/services/processors/batch.rs
+++ b/iris-mpc/src/services/processors/batch.rs
@@ -1127,10 +1127,15 @@ impl<'a> BatchProcessor<'a> {
         &self,
         sqs_message: &aws_sdk_sqs::types::Message,
     ) -> Result<(), ReceiveRequestError> {
+        let receipt_handle = sqs_message.receipt_handle.as_deref().ok_or_else(|| {
+            ReceiveRequestError::FailedToMarkRequestAsDeleted(eyre::eyre!(
+                "Missing receipt handle"
+            ))
+        })?;
         self.client
             .delete_message()
             .queue_url(&self.config.requests_queue_url)
-            .receipt_handle(sqs_message.receipt_handle.as_ref().unwrap())
+            .receipt_handle(receipt_handle)
             .send()
             .await
             .map_err(ReceiveRequestError::from)?;

From eeabfa6cee5ea61db20bdf98115aab6956e7858e Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Thu, 26 Mar 2026 13:38:31 +0100
Subject: [PATCH 49/76] up

---
 iris-mpc/src/services/processors/batch.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/iris-mpc/src/services/processors/batch.rs b/iris-mpc/src/services/processors/batch.rs
index b0bf8d4f27..bb497cbdec 100644
--- a/iris-mpc/src/services/processors/batch.rs
+++ b/iris-mpc/src/services/processors/batch.rs
@@ -433,7 +433,7 @@ impl<'a> BatchProcessor<'a> {
             }
             REAUTH_MESSAGE_TYPE => self.process_reauth_request(&message, batch_metadata).await,
             RECOVERY_CHECK_MESSAGE_TYPE => {
-                if !self.config.hawk_server_recovery_enabled {
+                if !self.config.enable_recovery {
                     metrics::counter!("request.skipped", "type" => "recovery_check").increment(1);
                     tracing::warn!("Recovery checks are disabled, skipping recovery check request");
                     self.delete_message(&sqs_message).await?;
@@ -449,7 +449,7 @@ impl<'a> BatchProcessor<'a> {
                 .await
             }
             RESET_CHECK_MESSAGE_TYPE => {
-                if !self.config.hawk_server_resets_enabled {
+                if !self.config.enable_reset {
                     metrics::counter!("request.skipped", "type" => "reset_check").increment(1);
                     tracing::warn!("Resets are disabled, skipping reset request");
                     self.delete_message(&sqs_message).await?;

From bbded4e41ee6e3a06edaadb641dfc66b3f51207a Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Thu, 26 Mar 2026 14:13:53 +0100
Subject: [PATCH 50/76] verify bin

---
 Cargo.lock                                    |   1 +
 iris-mpc-bins/Cargo.toml                      |   5 +
 .../bin/iris-mpc-upgrade/verify_shares.rs     | 127 ++++++++++++++++++
 iris-mpc/src/services/processors/batch.rs     |   4 +-
 4 files changed, 134 insertions(+), 3 deletions(-)
 create mode 100644 iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs

diff --git a/Cargo.lock b/Cargo.lock
index 98fcdd165f..1a7ab7117e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2920,6 +2920,7 @@ dependencies = [
  "axum 0.7.7",
  "base64",
  "blake3",
+ "bytemuck",
  "chrono",
  "clap",
  "clap_builder",
diff --git a/iris-mpc-bins/Cargo.toml b/iris-mpc-bins/Cargo.toml
index bdf398f471..76d3005ffe 100644
--- a/iris-mpc-bins/Cargo.toml
+++ b/iris-mpc-bins/Cargo.toml
@@ -78,6 +78,7 @@ iris-mpc-store = { path = "../iris-mpc-store" }
 iris-mpc-upgrade-hawk = { path = "../iris-mpc-upgrade-hawk" }
 iris-mpc-utils = { path = "../iris-mpc-utils" }
 blake3 = "1.8.2"
+bytemuck.workspace = true
 aws-smithy-types = "1.2.9"
 clap_builder = "4.5.51"
 
@@ -207,6 +208,10 @@ path = "bin/iris-mpc-upgrade/reshare-client.rs"
 name = "rerandomize-db"
 path = "bin/iris-mpc-upgrade/rerandomize_db.rs"
 
+[[bin]]
+name = "verify-shares"
+path = "bin/iris-mpc-upgrade/verify_shares.rs"
+
 
 # ---------------------
 # binaries for iris-mpc-upgrade-hawk
diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
new file mode 100644
index 0000000000..10cdf602ba
--- /dev/null
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
@@ -0,0 +1,127 @@
+// WARNING: This tool reconstructs plaintext iris codes from secret shares.
+// It is intended strictly for local development and staging environments with synthetic test data.
+
+use std::io::Write;
+use std::path::PathBuf;
+
+use clap::Parser;
+use eyre::{ensure, Result};
+use iris_mpc_common::postgres::{AccessMode, PostgresClient};
+use iris_mpc_store::Store;
+use iris_mpc_upgrade::rerandomization::reconstruct_shares;
+
+#[derive(Parser)]
+#[command(
+    name = "verify-shares",
+    about = "Connect to all 3 party databases, reconstruct every iris entry from \
+             all party-pair combinations, and produce per-row + overall hashes.\n\n\
+             WARNING: This tool reconstructs plaintext iris codes from secret shares. \
+             It is intended strictly for local development and staging environments \
+             with synthetic test data."
+)]
+struct Args {
+    #[arg(long, env = "PARTY0_DB_URL")]
+    party0_db_url: String,
+
+    #[arg(long, env = "PARTY1_DB_URL")]
+    party1_db_url: String,
+
+    #[arg(long, env = "PARTY2_DB_URL")]
+    party2_db_url: String,
+
+    #[arg(long, env = "SCHEMA")]
+    schema: String,
+
+    /// Output file for the per-row hash list (one hex hash per line).
+    #[arg(long, default_value = "iris_hashes.txt")]
+    output: PathBuf,
+}
+
+async fn connect(url: &str, schema: &str) -> Result<Store> {
+    let client = PostgresClient::new(url, schema, AccessMode::ReadOnly).await?;
+    Store::new(&client).await
+}
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    tracing_subscriber::fmt::init();
+    let args = Args::parse();
+
+    tracing::warn!("*** This tool reconstructs plaintext iris codes from secret shares.   ***");
+    tracing::warn!("*** Only use with local/staging environments and synthetic test data. ***");
+
+    tracing::info!("Connecting to party databases…");
+    let stores = tokio::try_join!(
+        connect(&args.party0_db_url, &args.schema),
+        connect(&args.party1_db_url, &args.schema),
+        connect(&args.party2_db_url, &args.schema),
+    )?;
+    let stores = [stores.0, stores.1, stores.2];
+
+    let counts: [usize; 3] = [
+        stores[0].count_irises().await?,
+        stores[1].count_irises().await?,
+        stores[2].count_irises().await?,
+    ];
+    tracing::info!(
+        "Row counts: party0={}, party1={}, party2={}",
+        counts[0],
+        counts[1],
+        counts[2]
+    );
+    ensure!(
+        counts[0] == counts[1] && counts[1] == counts[2],
+        "Row counts differ across parties: {:?}",
+        counts
+    );
+    let total = counts[0];
+    if total == 0 {
+        tracing::warn!("Databases are empty, nothing to verify");
+        return Ok(());
+    }
+
+    let mut overall_hasher = blake3::Hasher::new();
+    let mut out = std::io::BufWriter::new(std::fs::File::create(&args.output)?);
+
+    let mut verified = 0u64;
+    let log_interval = (total / 100).max(1);
+
+    for id in 1..=(total as i64) {
+        let rows = tokio::try_join!(
+            stores[0].get_iris_data_by_id(id),
+            stores[1].get_iris_data_by_id(id),
+            stores[2].get_iris_data_by_id(id),
+        )?;
+        let (r0, r1, r2) = rows;
+
+        let left_code = reconstruct_shares(r0.left_code(), r1.left_code(), r2.left_code());
+        let left_mask = reconstruct_shares(r0.left_mask(), r1.left_mask(), r2.left_mask());
+        let right_code = reconstruct_shares(r0.right_code(), r1.right_code(), r2.right_code());
+        let right_mask = reconstruct_shares(r0.right_mask(), r1.right_mask(), r2.right_mask());
+
+        let mut row_hasher = blake3::Hasher::new();
+        row_hasher.update(bytemuck::cast_slice::<u16, u8>(&left_code));
+        row_hasher.update(bytemuck::cast_slice::<u16, u8>(&left_mask));
+        row_hasher.update(bytemuck::cast_slice::<u16, u8>(&right_code));
+        row_hasher.update(bytemuck::cast_slice::<u16, u8>(&right_mask));
+        let row_hash = row_hasher.finalize();
+
+        writeln!(out, "{}:{}", id, row_hash.to_hex())?;
+        overall_hasher.update(row_hash.as_bytes());
+
+        verified += 1;
+        if verified as usize % log_interval == 0 {
+            tracing::info!("Verified {}/{} entries", verified, total);
+        }
+    }
+
+    out.flush()?;
+    let overall_hash = overall_hasher.finalize();
+
+    tracing::info!("Verified all {} entries", total);
+    tracing::info!("Overall hash: {}", overall_hash.to_hex());
+    tracing::info!("Per-row hashes written to {}", args.output.display());
+
+    println!("{}", overall_hash.to_hex());
+    Ok(())
+}
diff --git a/iris-mpc/src/services/processors/batch.rs b/iris-mpc/src/services/processors/batch.rs
index bb497cbdec..5199f7a357 100644
--- a/iris-mpc/src/services/processors/batch.rs
+++ b/iris-mpc/src/services/processors/batch.rs
@@ -1128,9 +1128,7 @@ impl<'a> BatchProcessor<'a> {
         sqs_message: &aws_sdk_sqs::types::Message,
     ) -> Result<(), ReceiveRequestError> {
         let receipt_handle = sqs_message.receipt_handle.as_deref().ok_or_else(|| {
-            ReceiveRequestError::FailedToMarkRequestAsDeleted(eyre::eyre!(
-                "Missing receipt handle"
-            ))
+            ReceiveRequestError::FailedToMarkRequestAsDeleted(eyre::eyre!("Missing receipt handle"))
         })?;
         self.client
             .delete_message()

From c24da43d5fef1737c7dd4989e4ff11c951cdfd35 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Thu, 26 Mar 2026 14:32:57 +0100
Subject: [PATCH 51/76] user last public localstack version

---
 iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml | 2 +-
 iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml b/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml
index d87dde3dc3..6d95a79e0c 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml
@@ -42,7 +42,7 @@ services:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   localstack:
-    image: localstack/localstack
+    image: public.ecr.aws/localstack/localstack:4.14
     ports:
       - "127.0.0.1:4566:4566"
       - "127.0.0.1:4571:4571"
diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml b/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml
index a4418a5340..be74e98d39 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml
@@ -49,7 +49,7 @@ services:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   localstack:
-    image: localstack/localstack
+    image: public.ecr.aws/localstack/localstack:4.14
     ports:
       - "127.0.0.1:4566:4566"
       - "127.0.0.1:4571:4571"

From ef509e12c96d5950c8a7ca54c4d3074a9a0dedab Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Thu, 26 Mar 2026 14:45:33 +0100
Subject: [PATCH 52/76] build iris-mpc image from current branch

---
 .github/workflows/temp-branch-build-and-push.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/temp-branch-build-and-push.yaml b/.github/workflows/temp-branch-build-and-push.yaml
index 2714d2f15b..94dc40afa5 100644
--- a/.github/workflows/temp-branch-build-and-push.yaml
+++ b/.github/workflows/temp-branch-build-and-push.yaml
@@ -5,6 +5,7 @@ on:
     branches:
       - "dev"
       - "pop-3544-gpu-shutdown-guardrail"
+      - "ps/cont-rerand"
 
 concurrency:
   group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"

From ca458665bc60a3a5be90b8d37444bfe6ba7ec24c Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Thu, 26 Mar 2026 15:05:51 +0100
Subject: [PATCH 53/76] deploy iris-mpc with new migrations

---
 deploy/stage/common-values-iris-mpc.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index caa368801a..189dca3761 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,4 +1,4 @@
-image: "ghcr.io/worldcoin/iris-mpc:v0.31.5@sha256:af92dd27cabe80eb3a01fcec21960cb79d12a44e01f17459b83cc923c339f4d4"
+image: "ghcr.io/worldcoin/iris-mpc:ef509e12c96d5950c8a7ca54c4d3074a9a0dedab"
 
 environment: stage
 replicaCount: 1

From 94460c67af2ebe535d42c627bb33c498bbaec13a Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Thu, 26 Mar 2026 15:58:28 +0100
Subject: [PATCH 54/76] support different schemas

---
 .../bin/iris-mpc-upgrade/verify_shares.rs     | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
index 10cdf602ba..a99f8ade75 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
@@ -29,9 +29,20 @@ struct Args {
     #[arg(long, env = "PARTY2_DB_URL")]
     party2_db_url: String,
 
+    /// Schema name shared by all parties. Overridden per-party by
+    /// --party{0,1,2}-schema if provided.
     #[arg(long, env = "SCHEMA")]
     schema: String,
 
+    #[arg(long, env = "PARTY0_SCHEMA")]
+    party0_schema: Option<String>,
+
+    #[arg(long, env = "PARTY1_SCHEMA")]
+    party1_schema: Option<String>,
+
+    #[arg(long, env = "PARTY2_SCHEMA")]
+    party2_schema: Option<String>,
+
     /// Output file for the per-row hash list (one hex hash per line).
     #[arg(long, default_value = "iris_hashes.txt")]
     output: PathBuf,
@@ -51,10 +62,14 @@ async fn main() -> Result<()> {
     tracing::warn!("*** Only use with local/staging environments and synthetic test data. ***");
 
     tracing::info!("Connecting to party databases…");
+    let s0 = args.party0_schema.as_deref().unwrap_or(&args.schema);
+    let s1 = args.party1_schema.as_deref().unwrap_or(&args.schema);
+    let s2 = args.party2_schema.as_deref().unwrap_or(&args.schema);
+
     let stores = tokio::try_join!(
-        connect(&args.party0_db_url, &args.schema),
-        connect(&args.party1_db_url, &args.schema),
-        connect(&args.party2_db_url, &args.schema),
+        connect(&args.party0_db_url, s0),
+        connect(&args.party1_db_url, s1),
+        connect(&args.party2_db_url, s2),
     )?;
     let stores = [stores.0, stores.1, stores.2];
 

From 98648896bdef9c21f06efcc1fdeebe2adaccad49 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Thu, 26 Mar 2026 18:47:39 +0100
Subject: [PATCH 55/76] scale down cont rereand

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 1c686d80b1..edea1fe306 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
-replicaCount: 1
+replicaCount: 0
 
 environment: stage
 

From 7b07ce0333ccd4dece0bdffda1bebff72a0c5797 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 27 Mar 2026 14:08:32 +0100
Subject: [PATCH 56/76] do not use s3 loader in stage

---
 deploy/stage/smpcv2-0-stage/values-iris-mpc.yaml | 2 +-
 deploy/stage/smpcv2-1-stage/values-iris-mpc.yaml | 2 +-
 deploy/stage/smpcv2-2-stage/values-iris-mpc.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/deploy/stage/smpcv2-0-stage/values-iris-mpc.yaml b/deploy/stage/smpcv2-0-stage/values-iris-mpc.yaml
index c489b76623..41504b70f4 100644
--- a/deploy/stage/smpcv2-0-stage/values-iris-mpc.yaml
+++ b/deploy/stage/smpcv2-0-stage/values-iris-mpc.yaml
@@ -96,7 +96,7 @@ env:
     value: "wf-smpcv2-stage-sns-requests"
 
   - name: SMPC__ENABLE_S3_IMPORTER
-    value: "true"
+    value: "false"
 
   - name: SMPC__DB_CHUNKS_BUCKET_NAME
     value: "iris-mpc-db-exporter-store-node-0-stage--eun1-az3--x-s3"
diff --git a/deploy/stage/smpcv2-1-stage/values-iris-mpc.yaml b/deploy/stage/smpcv2-1-stage/values-iris-mpc.yaml
index ada9e48614..b50da0249c 100644
--- a/deploy/stage/smpcv2-1-stage/values-iris-mpc.yaml
+++ b/deploy/stage/smpcv2-1-stage/values-iris-mpc.yaml
@@ -96,7 +96,7 @@ env:
     value: "wf-smpcv2-stage-sns-requests"
 
   - name: SMPC__ENABLE_S3_IMPORTER
-    value: "true"
+    value: "false"
 
   - name: SMPC__DB_CHUNKS_BUCKET_NAME
     value: "iris-mpc-db-exporter-store-node-1-stage--eun1-az3--x-s3"
diff --git a/deploy/stage/smpcv2-2-stage/values-iris-mpc.yaml b/deploy/stage/smpcv2-2-stage/values-iris-mpc.yaml
index 4811a3baaf..153f9b65d8 100644
--- a/deploy/stage/smpcv2-2-stage/values-iris-mpc.yaml
+++ b/deploy/stage/smpcv2-2-stage/values-iris-mpc.yaml
@@ -96,7 +96,7 @@ env:
     value: "wf-smpcv2-stage-sns-requests"
 
   - name: SMPC__ENABLE_S3_IMPORTER
-    value: "true"
+    value: "false"
 
   - name: SMPC__DB_CHUNKS_BUCKET_NAME
     value: "iris-mpc-db-exporter-store-node-2-stage--eun1-az3--x-s3"

From 4b98f95fcb9ee3e29ea056540bb5bac4f80f1ed4 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 27 Mar 2026 14:31:54 +0100
Subject: [PATCH 57/76] verification binary

---
 .../bin/iris-mpc-upgrade/verify_shares.rs     | 123 +++++++++++++++---
 iris-mpc-upgrade/src/rerandomization.rs       |  38 +++++-
 2 files changed, 140 insertions(+), 21 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
index a99f8ade75..d464a2f5f5 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
@@ -8,7 +8,7 @@ use clap::Parser;
 use eyre::{ensure, Result};
 use iris_mpc_common::postgres::{AccessMode, PostgresClient};
 use iris_mpc_store::Store;
-use iris_mpc_upgrade::rerandomization::reconstruct_shares;
+use iris_mpc_upgrade::rerandomization::{try_reconstruct_shares, ReconstructionMismatch};
 
 #[derive(Parser)]
 #[command(
@@ -46,6 +46,10 @@ struct Args {
     /// Output file for the per-row hash list (one hex hash per line).
     #[arg(long, default_value = "iris_hashes.txt")]
     output: PathBuf,
+
+    /// Output file for detailed verification failures.
+    #[arg(long, default_value = "verification-output.txt")]
+    failures_output: PathBuf,
 }
 
 async fn connect(url: &str, schema: &str) -> Result<Store> {
@@ -53,6 +57,37 @@ async fn connect(url: &str, schema: &str) -> Result<Store> {
     Store::new(&client).await
 }
 
+fn log_mismatch(
+    out: &mut impl Write,
+    id: i64,
+    component: &str,
+    mismatch: &ReconstructionMismatch,
+    v0: i16,
+    v1: i16,
+    v2: i16,
+) -> std::io::Result<()> {
+    // recon(0,1) vs recon(1,2) vs recon(0,2).
+    // If two pair-reconstructions agree, the party NOT in both agreeing pairs
+    // is the one with the bad share.
+    let divergent_party = match (mismatch.pairs_01_vs_12, mismatch.pairs_01_vs_02) {
+        // recon(0,1) != recon(1,2), but recon(0,1) == recon(0,2)
+        // agreeing pairs share parties 0; party 2 is the outlier
+        (true, false) => "party2 (recon(0,1)==recon(0,2), recon(1,2) differs)",
+        // recon(0,1) == recon(1,2), but recon(0,1) != recon(0,2)
+        // agreeing pairs share party 1; party 0 is the outlier
+        (false, true) => "party0 (recon(0,1)==recon(1,2), recon(0,2) differs)",
+        // all three disagree — cannot isolate a single bad party
+        (true, true) => "unknown (all three pair reconstructions differ)",
+        (false, false) => unreachable!(),
+    };
+
+    let msg = format!(
+        "id={id} component={component} version_ids=[{v0},{v1},{v2}] suspect={divergent_party}"
+    );
+    tracing::error!("{}", msg);
+    writeln!(out, "{}", msg)
+}
+
 #[tokio::main]
 async fn main() -> Result<()> {
     tracing_subscriber::fmt::init();
@@ -97,8 +132,11 @@ async fn main() -> Result<()> {
 
     let mut overall_hasher = blake3::Hasher::new();
     let mut out = std::io::BufWriter::new(std::fs::File::create(&args.output)?);
+    let mut failures_out =
+        std::io::BufWriter::new(std::fs::File::create(&args.failures_output)?);
 
     let mut verified = 0u64;
+    let mut failed = 0u64;
     let log_interval = (total / 100).max(1);
 
     for id in 1..=(total as i64) {
@@ -109,30 +147,85 @@ async fn main() -> Result<()> {
         )?;
         let (r0, r1, r2) = rows;
 
-        let left_code = reconstruct_shares(r0.left_code(), r1.left_code(), r2.left_code());
-        let left_mask = reconstruct_shares(r0.left_mask(), r1.left_mask(), r2.left_mask());
-        let right_code = reconstruct_shares(r0.right_code(), r1.right_code(), r2.right_code());
-        let right_mask = reconstruct_shares(r0.right_mask(), r1.right_mask(), r2.right_mask());
-
-        let mut row_hasher = blake3::Hasher::new();
-        row_hasher.update(bytemuck::cast_slice::<u16, u8>(&left_code));
-        row_hasher.update(bytemuck::cast_slice::<u16, u8>(&left_mask));
-        row_hasher.update(bytemuck::cast_slice::<u16, u8>(&right_code));
-        row_hasher.update(bytemuck::cast_slice::<u16, u8>(&right_mask));
-        let row_hash = row_hasher.finalize();
+        let components: [(&str, &[u16], &[u16], &[u16]); 4] = [
+            ("left_code", r0.left_code(), r1.left_code(), r2.left_code()),
+            ("left_mask", r0.left_mask(), r1.left_mask(), r2.left_mask()),
+            (
+                "right_code",
+                r0.right_code(),
+                r1.right_code(),
+                r2.right_code(),
+            ),
+            (
+                "right_mask",
+                r0.right_mask(),
+                r1.right_mask(),
+                r2.right_mask(),
+            ),
+        ];
+
+        let mut row_ok = true;
+        let mut reconstructed: Vec<Vec<u16>> = Vec::with_capacity(4);
+
+        for (name, s0, s1, s2) in &components {
+            match try_reconstruct_shares(s0, s1, s2) {
+                Ok(plain) => reconstructed.push(plain),
+                Err(mismatch) => {
+                    row_ok = false;
+                    log_mismatch(
+                        &mut failures_out,
+                        id,
+                        name,
+                        &mismatch,
+                        r0.version_id(),
+                        r1.version_id(),
+                        r2.version_id(),
+                    )?;
+                }
+            }
+        }
 
-        writeln!(out, "{}:{}", id, row_hash.to_hex())?;
-        overall_hasher.update(row_hash.as_bytes());
+        if row_ok {
+            let mut row_hasher = blake3::Hasher::new();
+            for plain in &reconstructed {
+                row_hasher.update(bytemuck::cast_slice::<u16, u8>(plain));
+            }
+            let row_hash = row_hasher.finalize();
+            writeln!(out, "{}:{}", id, row_hash.to_hex())?;
+            overall_hasher.update(row_hash.as_bytes());
+        } else {
+            failed += 1;
+        }
 
         verified += 1;
         if verified as usize % log_interval == 0 {
-            tracing::info!("Verified {}/{} entries", verified, total);
+            tracing::info!(
+                "Progress {}/{} ({} failures so far)",
+                verified,
+                total,
+                failed
+            );
         }
     }
 
     out.flush()?;
+    failures_out.flush()?;
     let overall_hash = overall_hasher.finalize();
 
+    if failed > 0 {
+        tracing::error!(
+            "Verification completed with {} inconsistent rows out of {} (details in {})",
+            failed,
+            total,
+            args.failures_output.display()
+        );
+        eyre::bail!(
+            "{} rows have inconsistent shares across parties. See {}",
+            failed,
+            args.failures_output.display()
+        );
+    }
+
     tracing::info!("Verified all {} entries", total);
     tracing::info!("Overall hash: {}", overall_hash.to_hex());
     tracing::info!("Per-row hashes written to {}", args.output.display());
diff --git a/iris-mpc-upgrade/src/rerandomization.rs b/iris-mpc-upgrade/src/rerandomization.rs
index 4038309361..bc8533ff26 100644
--- a/iris-mpc-upgrade/src/rerandomization.rs
+++ b/iris-mpc-upgrade/src/rerandomization.rs
@@ -89,10 +89,22 @@ fn randomize_galois_ring_coefs(coefs: &mut [u16], xof: &mut blake3::OutputReader
     }
 }
 
+/// Which pair(s) of parties disagree during reconstruction.
+#[derive(Debug)]
+pub struct ReconstructionMismatch {
+    pub pairs_01_vs_12: bool,
+    pub pairs_01_vs_02: bool,
+}
+
 /// Reconstruct the plaintext from 3 Shamir shares using Lagrange interpolation.
-/// Verifies consistency by reconstructing from all 3 pairs (0-1, 1-2, 0-2) and
-/// asserting they agree.
-pub fn reconstruct_shares(share0: &[u16], share1: &[u16], share2: &[u16]) -> Vec<u16> {
+///
+/// Returns `Ok(plaintext)` when all 3 pair-wise reconstructions agree, or
+/// `Err(mismatch)` indicating which pairs diverge.
+pub fn try_reconstruct_shares(
+    share0: &[u16],
+    share1: &[u16],
+    share2: &[u16],
+) -> Result<Vec<u16>, ReconstructionMismatch> {
     let lag_01 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID1);
     let lag_10 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID1, PartyID::ID0);
     let lag_02 = ShamirGaloisRingShare::deg_1_lagrange_polys_at_zero(PartyID::ID0, PartyID::ID2);
@@ -133,9 +145,23 @@ pub fn reconstruct_shares(share0: &[u16], share1: &[u16], share2: &[u16]) -> Vec
         })
         .collect_vec();
 
-    assert_eq!(recon01, recon12);
-    assert_eq!(recon01, recon02);
-    recon01
+    let mismatch_01_12 = recon01 != recon12;
+    let mismatch_01_02 = recon01 != recon02;
+    if mismatch_01_12 || mismatch_01_02 {
+        return Err(ReconstructionMismatch {
+            pairs_01_vs_12: mismatch_01_12,
+            pairs_01_vs_02: mismatch_01_02,
+        });
+    }
+    Ok(recon01)
+}
+
+/// Reconstruct the plaintext from 3 Shamir shares using Lagrange interpolation.
+/// Verifies consistency by reconstructing from all 3 pairs (0-1, 1-2, 0-2) and
+/// asserting they agree.
+pub fn reconstruct_shares(share0: &[u16], share1: &[u16], share2: &[u16]) -> Vec<u16> {
+    try_reconstruct_shares(share0, share1, share2)
+        .expect("Reconstruction mismatch: shares are inconsistent across party pairs")
 }
 
 #[cfg(test)]

From b28c5f449cd52bcc07d423672bd5861523e5399f Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Mon, 30 Mar 2026 14:38:58 +0200
Subject: [PATCH 58/76] stopping iris-mpc

---
 deploy/stage/common-values-iris-mpc.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index 189dca3761..76214af4e6 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,7 +1,7 @@
 image: "ghcr.io/worldcoin/iris-mpc:ef509e12c96d5950c8a7ca54c4d3074a9a0dedab"
 
 environment: stage
-replicaCount: 1
+replicaCount: 0
 
 strategy:
   type: Recreate

From d412c96c62ba3cf4a3b1bd22e0864eaf9710d5e9 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Mon, 30 Mar 2026 14:46:10 +0200
Subject: [PATCH 59/76] scale up rerand

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index edea1fe306..1c686d80b1 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
-replicaCount: 0
+replicaCount: 1
 
 environment: stage
 

From 949fe6fb466ffe8d43bc78da64059f0a6ce9a102 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Mon, 30 Mar 2026 14:48:30 +0200
Subject: [PATCH 60/76] scale down rerand

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 1c686d80b1..edea1fe306 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
-replicaCount: 1
+replicaCount: 0
 
 environment: stage
 

From 8bfcde95faa91a112bc17c46017e10b31d2ce4c0 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Mon, 30 Mar 2026 15:31:33 +0200
Subject: [PATCH 61/76] scale up iris-mpc

---
 deploy/stage/common-values-iris-mpc.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index 76214af4e6..189dca3761 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,7 +1,7 @@
 image: "ghcr.io/worldcoin/iris-mpc:ef509e12c96d5950c8a7ca54c4d3074a9a0dedab"
 
 environment: stage
-replicaCount: 0
+replicaCount: 1
 
 strategy:
   type: Recreate

From 05da344f394af4fa2335f550d4dad6951e38e6a4 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Mon, 30 Mar 2026 17:48:06 +0200
Subject: [PATCH 62/76] scale up rerand

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index edea1fe306..1c686d80b1 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
-replicaCount: 0
+replicaCount: 1
 
 environment: stage
 

From ea07805b02a6dcf4e6bee4ccaa0a2c35d6164443 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Mon, 30 Mar 2026 18:01:28 +0200
Subject: [PATCH 63/76] iris, rerand down

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 deploy/stage/common-values-iris-mpc.yaml                        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index 1c686d80b1..edea1fe306 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
-replicaCount: 1
+replicaCount: 0
 
 environment: stage
 
diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index 189dca3761..76214af4e6 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,7 +1,7 @@
 image: "ghcr.io/worldcoin/iris-mpc:ef509e12c96d5950c8a7ca54c4d3074a9a0dedab"
 
 environment: stage
-replicaCount: 1
+replicaCount: 0
 
 strategy:
   type: Recreate

From 6f409ad7ad62ca28ae14855c7a436a74d88e328a Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Mon, 30 Mar 2026 19:06:36 +0200
Subject: [PATCH 64/76] handle sigterm

---
 .../bin/iris-mpc-upgrade/rerandomize_db.rs    | 33 ++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
index b26d085f00..d509391d36 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
@@ -544,6 +544,29 @@ async fn rerandomize_continuous_main(config: RerandomizeContinuousConfig) -> Res
         background_tasks.spawn(async move { spawn_healthcheck_server(healthcheck_port).await });
     background_tasks.check_tasks();
 
+    let cancel = tokio_util::sync::CancellationToken::new();
+    let cancel_for_signal = cancel.clone();
+    tokio::spawn(async move {
+        #[cfg(unix)]
+        {
+            use tokio::signal::unix::{signal, SignalKind};
+            let mut sigterm =
+                signal(SignalKind::terminate()).expect("failed to install SIGTERM handler");
+            tokio::select! {
+                _ = tokio::signal::ctrl_c() => {}
+                _ = sigterm.recv() => {}
+            }
+        }
+        #[cfg(not(unix))]
+        {
+            tokio::signal::ctrl_c()
+                .await
+                .expect("failed to install CTRL+C handler");
+        }
+        tracing::info!("Received shutdown signal, requesting graceful rerand shutdown…");
+        cancel_for_signal.cancel();
+    });
+
     let sdk_config = aws_config::from_env().load().await;
     tracing::info!(
         region = ?sdk_config.region(),
@@ -565,8 +588,16 @@ async fn rerandomize_continuous_main(config: RerandomizeContinuousConfig) -> Res
         PostgresClient::new(&config.db_url, &config.schema_name, AccessMode::ReadWrite).await?;
     let store = Store::new(&postgres_client).await?;
 
-    continuous_rerand::run_continuous_rerand(&config, &s3_client, &sm_client, &store, None).await?;
+    continuous_rerand::run_continuous_rerand(
+        &config,
+        &s3_client,
+        &sm_client,
+        &store,
+        Some(&cancel),
+    )
+    .await?;
 
+    tracing::info!("Continuous rerand shut down gracefully");
     background_tasks.abort_and_wait_for_finish().await;
     Ok(())
 }

From 4161e46ca376d391c52139bd1b1d56ca420c1072 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Tue, 31 Mar 2026 11:03:55 +0200
Subject: [PATCH 65/76] clippy + fmt

---
 iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
index d464a2f5f5..384b6d8186 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/verify_shares.rs
@@ -10,6 +10,8 @@ use iris_mpc_common::postgres::{AccessMode, PostgresClient};
 use iris_mpc_store::Store;
 use iris_mpc_upgrade::rerandomization::{try_reconstruct_shares, ReconstructionMismatch};
 
+type ShareComponent<'a> = (&'a str, &'a [u16], &'a [u16], &'a [u16]);
+
 #[derive(Parser)]
 #[command(
     name = "verify-shares",
@@ -132,8 +134,7 @@ async fn main() -> Result<()> {
 
     let mut overall_hasher = blake3::Hasher::new();
     let mut out = std::io::BufWriter::new(std::fs::File::create(&args.output)?);
-    let mut failures_out =
-        std::io::BufWriter::new(std::fs::File::create(&args.failures_output)?);
+    let mut failures_out = std::io::BufWriter::new(std::fs::File::create(&args.failures_output)?);
 
     let mut verified = 0u64;
     let mut failed = 0u64;
@@ -147,7 +148,7 @@ async fn main() -> Result<()> {
         )?;
         let (r0, r1, r2) = rows;
 
-        let components: [(&str, &[u16], &[u16], &[u16]); 4] = [
+        let components: [ShareComponent; 4] = [
             ("left_code", r0.left_code(), r1.left_code(), r2.left_code()),
             ("left_mask", r0.left_mask(), r1.left_mask(), r2.left_mask()),
             (

From 95a1a297faf0704c58f664b8661520b9cf48e3c0 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 14:26:59 +0200
Subject: [PATCH 66/76] prevent limits on rand test

---
 .../bin/iris-mpc-upgrade/docker-compose.rand.yaml    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml b/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml
index 6d95a79e0c..6cfce451fa 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.rand.yaml
@@ -1,41 +1,41 @@
 services:
   new-db-1:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6200:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-2:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6201:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-3:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6202:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-4:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6203:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-5:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6204:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-6:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6205:5432"
     environment:

From ef4137a87b36ffb7e334d63b86b6cc934389cdc3 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 14:27:54 +0200
Subject: [PATCH 67/76] public ecr on images

---
 .../bin/iris-mpc-upgrade/docker-compose.yaml     | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml b/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml
index be74e98d39..d5f0020c33 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/docker-compose.yaml
@@ -1,48 +1,48 @@
 services:
   old-db-shares-1:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6100:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   old-db-shares-2:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6101:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   old-db-masks-1:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6111:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-1:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6200:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-2:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6201:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-3:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6202:5432"
     environment:
       POSTGRES_USER: "postgres"
       POSTGRES_PASSWORD: "postgres"
   new-db-4:
-    image: postgres:16
+    image: public.ecr.aws/docker/library/postgres:16
     ports:
       - "6203:5432"
     environment:
@@ -64,7 +64,7 @@ services:
     ports:
       - "7000:7000"
   nginx:
-    image: nginx:1.27.1
+    image: public.ecr.aws/nginx/nginx:1.27.1
     depends_on:
       - reshare-server-2
     ports:

From 73959b6b7fd860659419a27a0553d3c426c9c506 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 14:29:30 +0200
Subject: [PATCH 68/76] prepare a test with sigterm handle

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml    | 4 ++--
 deploy/stage/common-values-iris-mpc.yaml                      | 4 ++--
 .../values-iris-mpc-continuous-rerandomization.yaml           | 2 +-
 .../values-iris-mpc-continuous-rerandomization.yaml           | 2 +-
 .../values-iris-mpc-continuous-rerandomization.yaml           | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index edea1fe306..ad94707a23 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
-image: "ghcr.io/worldcoin/rerandomization-protocol:0bbe0c379e32ba7312d357ac56f295abcd96da08"
-replicaCount: 0
+image: "ghcr.io/worldcoin/rerandomization-protocol:4161e46ca376d391c52139bd1b1d56ca420c1072"
+replicaCount: 1
 
 environment: stage
 
diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index 76214af4e6..3c661cbf99 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,7 +1,7 @@
-image: "ghcr.io/worldcoin/iris-mpc:ef509e12c96d5950c8a7ca54c4d3074a9a0dedab"
+image: "ghcr.io/worldcoin/iris-mpc:v0.31.5@sha256:af92dd27cabe80eb3a01fcec21960cb79d12a44e01f17459b83cc923c339f4d4"
+replicaCount: 1
 
 environment: stage
-replicaCount: 0
 
 strategy:
   type: Recreate
diff --git a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
index fb6c453912..5f1e1b65ea 100644
--- a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_stage_0
+    value: SMPC_rerand_test_0
   - name: ENVIRONMENT
     value: stage
   - name: RERAND_S3_BUCKET
diff --git a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
index cc10d62daf..5712dbe757 100644
--- a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_stage_1
+    value: SMPC_rerand_test_1
   - name: ENVIRONMENT
     value: stage
   - name: RERAND_S3_BUCKET
diff --git a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
index facd60e02c..869d6163e4 100644
--- a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_stage_2
+    value: SMPC_rerand_test_2
   - name: ENVIRONMENT
     value: stage
   - name: RERAND_S3_BUCKET

From 1a4f04deae08128d6adc491f54a8ab7ed58263ef Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 16:36:43 +0200
Subject: [PATCH 69/76] gracefuly close rerand

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index ad94707a23..aba66dea81 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:4161e46ca376d391c52139bd1b1d56ca420c1072"
-replicaCount: 1
+replicaCount: 0
 
 environment: stage
 

From 2c6f99a85a96c3f9aa9e8cbfddf2cc55f1237c70 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 17:01:32 +0200
Subject: [PATCH 70/76] deploy iris + rerand

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 deploy/stage/common-values-iris-mpc.yaml                        | 2 +-
 .../values-iris-mpc-continuous-rerandomization.yaml             | 2 +-
 .../values-iris-mpc-continuous-rerandomization.yaml             | 2 +-
 .../values-iris-mpc-continuous-rerandomization.yaml             | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index aba66dea81..ad94707a23 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:4161e46ca376d391c52139bd1b1d56ca420c1072"
-replicaCount: 0
+replicaCount: 1
 
 environment: stage
 
diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index 3c661cbf99..e3c1bc85ef 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,4 +1,4 @@
-image: "ghcr.io/worldcoin/iris-mpc:v0.31.5@sha256:af92dd27cabe80eb3a01fcec21960cb79d12a44e01f17459b83cc923c339f4d4"
+image: "ghcr.io/worldcoin/iris-mpc:4161e46ca376d391c52139bd1b1d56ca420c1072"
 replicaCount: 1
 
 environment: stage
diff --git a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
index 5f1e1b65ea..fb6c453912 100644
--- a/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-0-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_rerand_test_0
+    value: SMPC_stage_0
   - name: ENVIRONMENT
     value: stage
   - name: RERAND_S3_BUCKET
diff --git a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
index 5712dbe757..cc10d62daf 100644
--- a/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-1-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_rerand_test_1
+    value: SMPC_stage_1
   - name: ENVIRONMENT
     value: stage
   - name: RERAND_S3_BUCKET
diff --git a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
index 869d6163e4..facd60e02c 100644
--- a/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/smpcv2-2-stage/values-iris-mpc-continuous-rerandomization.yaml
@@ -11,7 +11,7 @@ env:
         key: DATABASE_AURORA_URL
         name: application
   - name: SCHEMA_NAME
-    value: SMPC_rerand_test_2
+    value: SMPC_stage_2
   - name: ENVIRONMENT
     value: stage
   - name: RERAND_S3_BUCKET

From 6f8f2915a7de1298b7cf93576f143f321f1fa332 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 17:23:41 +0200
Subject: [PATCH 71/76] service down

---
 .../common-values-iris-mpc-continuous-rerandomization.yaml      | 2 +-
 deploy/stage/common-values-iris-mpc.yaml                        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
index ad94707a23..aba66dea81 100644
--- a/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
+++ b/deploy/stage/common-values-iris-mpc-continuous-rerandomization.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/rerandomization-protocol:4161e46ca376d391c52139bd1b1d56ca420c1072"
-replicaCount: 1
+replicaCount: 0
 
 environment: stage
 
diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index e3c1bc85ef..3ded5894cc 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/iris-mpc:4161e46ca376d391c52139bd1b1d56ca420c1072"
-replicaCount: 1
+replicaCount: 0
 
 environment: stage
 

From a036712cae9a0b60c5972490837eb8d03e10cbc6 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 17:41:19 +0200
Subject: [PATCH 72/76] restore main iris-mpc version

---
 deploy/stage/common-values-iris-mpc.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index 3ded5894cc..3c661cbf99 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,5 +1,5 @@
-image: "ghcr.io/worldcoin/iris-mpc:4161e46ca376d391c52139bd1b1d56ca420c1072"
-replicaCount: 0
+image: "ghcr.io/worldcoin/iris-mpc:v0.31.5@sha256:af92dd27cabe80eb3a01fcec21960cb79d12a44e01f17459b83cc923c339f4d4"
+replicaCount: 1
 
 environment: stage
 

From 9b2a8c664495c58c679af6b30e7abed1fc5724e4 Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 17:52:08 +0200
Subject: [PATCH 73/76] stop iris

---
 deploy/stage/common-values-iris-mpc.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index 3c661cbf99..e4e42e097c 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/iris-mpc:v0.31.5@sha256:af92dd27cabe80eb3a01fcec21960cb79d12a44e01f17459b83cc923c339f4d4"
-replicaCount: 1
+replicaCount: 0
 
 environment: stage
 

From eb4c57dd66927f3b75553e8a4284d730c3a08a0d Mon Sep 17 00:00:00 2001
From: Wojciech Sromek <wojciech.sromek@toolsforhumanity.com>
Date: Fri, 3 Apr 2026 17:59:41 +0200
Subject: [PATCH 74/76] run iris

---
 deploy/stage/common-values-iris-mpc.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/stage/common-values-iris-mpc.yaml b/deploy/stage/common-values-iris-mpc.yaml
index e4e42e097c..3c661cbf99 100644
--- a/deploy/stage/common-values-iris-mpc.yaml
+++ b/deploy/stage/common-values-iris-mpc.yaml
@@ -1,5 +1,5 @@
 image: "ghcr.io/worldcoin/iris-mpc:v0.31.5@sha256:af92dd27cabe80eb3a01fcec21960cb79d12a44e01f17459b83cc923c339f4d4"
-replicaCount: 0
+replicaCount: 1
 
 environment: stage
 

From 359bc0d7d20d709a753c30ad99ee142011cb69a2 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Thu, 23 Apr 2026 17:24:54 +0200
Subject: [PATCH 75/76] pr feedback

---
 iris-mpc-bins/bin/iris-mpc/server.rs |   14 +-
 iris-mpc-store/src/rerand.rs         |   94 +-
 iris-mpc/src/server/mod.rs           |   14 +-
 spec/iris_mpc_server.qnt             | 1235 ++++++++++++++++++++++++++
 4 files changed, 1347 insertions(+), 10 deletions(-)
 create mode 100644 spec/iris_mpc_server.qnt

diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 47d4c395e2..79258b1f5a 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -516,8 +516,18 @@ async fn server_main(config: Config) -> Result<()> {
     }
     .await;
 
-    // Always release freeze, even on error.
-    rerand_store::release_rerand_freeze(&freeze_pool).await?;
+    // Always attempt freeze release, but never let its failure undo a
+    // successful startup. `release_rerand_freeze` already retries internally,
+    // and a subsequent startup will re-issue a freeze with a new generation
+    // that the worker re-acknowledges (see `check_and_handle_freeze` generation
+    // change handling).
+    if let Err(e) = rerand_store::release_rerand_freeze(&freeze_pool).await {
+        tracing::error!(
+            "Failed to release rerand freeze after startup: {:?}. \
+             Worker will re-acknowledge on next startup freeze.",
+            e
+        );
+    }
 
     let (mut handle, store) = match frozen_result? {
         None => return Ok(()),
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 94455aee09..44f69a4d81 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -320,12 +320,21 @@ pub async fn delete_staging_for_old_epochs(
     Ok(result.rows_affected())
 }
 
-/// Delete rerand progress rows for epochs older than `current_epoch`.
+/// Delete rerand progress rows for epochs strictly older than the one
+/// immediately preceding `current_epoch`.
+///
+/// We intentionally keep the rows from the immediately prior epoch so that
+/// `get_applied_watermark_from_pool` does not transiently return `None`
+/// between the end of epoch `E` and the first applied chunk of epoch `E+1`.
+/// A transient `None` here used to cause the cross-party startup watermark
+/// check (`freeze_and_verify_inner`) to spuriously classify this party as
+/// behind any peer still reporting `Some((E, last_chunk))`, producing a
+/// release / re-freeze oscillation during rolling deploys at epoch bumps.
 pub async fn delete_rerand_progress_for_old_epochs(
     pool: &PgPool,
     current_epoch: i32,
 ) -> Result<u64> {
-    let result = sqlx::query("DELETE FROM rerand_progress WHERE epoch < $1")
+    let result = sqlx::query("DELETE FROM rerand_progress WHERE epoch < $1 - 1")
         .bind(current_epoch)
         .execute(pool)
         .await?;
@@ -390,6 +399,25 @@ fn rerand_control_exists(err: &sqlx::Error) -> bool {
     !is_undefined_table_sqlx(err)
 }
 
+/// Strict-less-than comparator for applied watermarks that documents the
+/// intended semantics at the call site and avoids accidentally relying on
+/// `Option<T>`'s derived ordering (which treats `None < Some(_)`).
+///
+/// Semantics:
+/// - `None` means "never applied anything" — this is the legitimate day-0
+///   state before any epoch has completed its first chunk on any party.
+/// - When combined with
+///   [`delete_rerand_progress_for_old_epochs`]'s retain-prior-epoch policy,
+///   `None` should only arise on genuinely fresh deployments, not as a
+///   transient epoch-boundary artifact.
+fn watermark_lt(a: Option<(i32, i32)>, b: Option<(i32, i32)>) -> bool {
+    match (a, b) {
+        (None, Some(_)) => true,
+        (Some(x), Some(y)) => x < y,
+        _ => false,
+    }
+}
+
 /// Request the rerand worker to freeze. Writes a unique `freeze_generation`
 /// to `rerand_control`. Returns the generation token.
 pub async fn request_rerand_freeze(pool: &PgPool) -> Result<Option<String>> {
@@ -668,11 +696,36 @@ async fn freeze_and_verify_inner(pool: &PgPool, peers: &[(&str, usize)]) -> Resu
             Some(g) => g,
             None => return Ok(()), // pre-migration, no rerand tables
         };
-        wait_for_rerand_frozen(pool, &gen).await?;
+
+        // Bound the ack wait by whatever remains of the outer deadline. The
+        // helper's own `FREEZE_TIMEOUT` resets per call, so without this the
+        // total elapsed time across repeated catchup iterations can exceed the
+        // advertised `FREEZE_TIMEOUT` by a large factor.
+        let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
+        if remaining.is_zero() {
+            let _ = release_rerand_freeze(pool).await;
+            eyre::bail!(
+                "Rerand freeze+convergence timeout after {:?}. \
+                 Ensure all rerand workers and main servers are healthy.",
+                FREEZE_TIMEOUT,
+            );
+        }
+        match tokio::time::timeout(remaining, wait_for_rerand_frozen(pool, &gen)).await {
+            Ok(r) => r?,
+            Err(_) => {
+                let _ = release_rerand_freeze(pool).await;
+                eyre::bail!(
+                    "Rerand freeze+convergence timeout after {:?} (ack wait; generation={}). \
+                     Ensure all rerand workers and main servers are healthy.",
+                    FREEZE_TIMEOUT,
+                    gen,
+                );
+            }
+        }
 
         loop {
             if tokio::time::Instant::now() >= deadline {
-                release_rerand_freeze(pool).await?;
+                let _ = release_rerand_freeze(pool).await;
                 eyre::bail!(
                     "Rerand watermark convergence timeout after {:?}. \
                      Ensure all rerand workers and main servers are healthy.",
@@ -689,7 +742,7 @@ async fn freeze_and_verify_inner(pool: &PgPool, peers: &[(&str, usize)]) -> Resu
                 if peer != local {
                     all_equal = false;
                 }
-                if peer > max_wm {
+                if watermark_lt(max_wm, peer) {
                     max_wm = peer;
                 }
             }
@@ -702,7 +755,7 @@ async fn freeze_and_verify_inner(pool: &PgPool, peers: &[(&str, usize)]) -> Resu
                 return Ok(());
             }
 
-            if local < max_wm {
+            if watermark_lt(local, max_wm) {
                 tracing::info!(
                     "Local watermark {:?} behind max {:?}, releasing freeze to catch up",
                     local,
@@ -740,4 +793,33 @@ mod tests {
     fn test_validate_identifier_rejects_injection() {
         assert!(validate_identifier("public; DROP TABLE irises").is_err());
     }
+
+    #[test]
+    fn test_watermark_lt_both_none_is_false() {
+        assert!(!watermark_lt(None, None));
+    }
+
+    #[test]
+    fn test_watermark_lt_none_lt_some() {
+        assert!(watermark_lt(None, Some((0, 0))));
+    }
+
+    #[test]
+    fn test_watermark_lt_some_not_lt_none() {
+        // Regression guard for the epoch-boundary bug: a local `Some((E, k))`
+        // must not be considered behind a peer reporting `None`, since peers
+        // reporting `None` are strictly at earlier progress than any applied
+        // chunk.
+        assert!(!watermark_lt(Some((0, 0)), None));
+        assert!(!watermark_lt(Some((3, 42)), None));
+    }
+
+    #[test]
+    fn test_watermark_lt_some_some_uses_lexicographic() {
+        assert!(watermark_lt(Some((0, 4)), Some((1, 0))));
+        assert!(watermark_lt(Some((1, 0)), Some((1, 1))));
+        assert!(!watermark_lt(Some((1, 1)), Some((1, 1))));
+        assert!(!watermark_lt(Some((1, 1)), Some((1, 0))));
+        assert!(!watermark_lt(Some((1, 0)), Some((0, 99))));
+    }
 }
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index 56b5dc2ebc..7c71a3eeb8 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -238,8 +238,18 @@ pub async fn server_main(config: Config) -> Result<()> {
     }
     .await;
 
-    // Always release freeze, even on error.
-    rerand_store::release_rerand_freeze(&iris_store.pool).await?;
+    // Always attempt freeze release, but never let its failure undo a
+    // successful startup. `release_rerand_freeze` already retries internally,
+    // and a subsequent startup will re-issue a freeze with a new generation
+    // that the worker re-acknowledges (see `check_and_handle_freeze` generation
+    // change handling).
+    if let Err(e) = rerand_store::release_rerand_freeze(&iris_store.pool).await {
+        tracing::error!(
+            "Failed to release rerand freeze after startup: {:?}. \
+             Worker will re-acknowledge on next startup freeze.",
+            e
+        );
+    }
 
     let hawk_actor = match frozen_result? {
         None => return Ok(()),
diff --git a/spec/iris_mpc_server.qnt b/spec/iris_mpc_server.qnt
new file mode 100644
index 0000000000..efee8cee29
--- /dev/null
+++ b/spec/iris_mpc_server.qnt
@@ -0,0 +1,1235 @@
+// Formal specification of the iris-mpc server synchronization protocol.
+//
+// Models 3 MPC server nodes coordinating to process iris biometric queries.
+// Each node has its own database of iris shares and a modification log.
+// The spec covers:
+//   - Server lifecycle (startup, sync, processing, shutdown)
+//   - Startup sync protocol (compare_modifications algorithm)
+//   - Batch processing with cross-node batch sync and valid_entries filtering
+//   - Crash/restart recovery and graceful shutdown
+//   - Deletion as tombstone (overwrite, not removal)
+//   - Reauth success/failure (persisted only on success)
+//
+// MPC computations are abstracted as a nondeterministic oracle.
+//
+// Reference implementation:
+//   - sync.rs: compare_modifications (lines 218-321)
+//   - server/mod.rs: server_main lifecycle (lines 60-211)
+//   - modifications_sync.rs: sync_modifications (lines 23-149)
+//   - batch.rs: receive_batch, sync_batch_entries, valid_entries AND logic
+//   - job.rs: process_job_result, deletion tombstones, reauth success/fail
+
+module iris_mpc_server {
+
+  // ---------------------------------------------------------------------------
+  // Constants
+  // ---------------------------------------------------------------------------
+
+  const NODES: Set[int]
+  const MAX_DB_SIZE: int
+  const MAX_BATCH_SIZE: int
+  const LOOKBACK: int
+
+  // ---------------------------------------------------------------------------
+  // Types
+  // ---------------------------------------------------------------------------
+
+  type RequestType =
+    | Uniqueness
+    | Reauth
+    | Deletion
+    | Update
+
+  type ModStatus =
+    | InProgress
+    | Completed
+
+  type Modification = {
+    id: int,
+    serial_id: int,              // 0 = unassigned (for uniqueness before MPC)
+    request_type: RequestType,
+    status: ModStatus,
+    persisted: bool
+  }
+
+  type BatchRequest = {
+    seq_num: int,
+    request_id: int,
+    request_type: RequestType,
+    target_serial_id: int        // 0 for uniqueness, >0 for others
+  }
+
+  type SyncStateRec = {
+    db_len: int,
+    mods: Set[Modification],
+    next_seq_num: int            // 0 = empty queue
+  }
+
+  type NodePhase =
+    | Down
+    | WaitingPeersSync
+    | SyncingMods
+    | SyncingQueue
+    | LoadingDB
+    | WaitingPeersReady
+    | Ready
+    | Processing
+    | PersistingResults
+    | ShuttingDown               // graceful shutdown: finish pending, reject new
+
+  type RequestResult = {
+    is_match: bool,              // for uniqueness: matched existing iris?
+    assigned_serial_id: int,     // for non-match uniqueness: newly assigned serial_id
+    success: bool                // for reauth: did it succeed?
+  }
+
+  /// Iris entry status: live data or tombstone (deletion overwrites with dummy).
+  type IrisStatus =
+    | Live
+    | Tombstoned
+
+  // ---------------------------------------------------------------------------
+  // State variables
+  // ---------------------------------------------------------------------------
+
+  var phase: int -> NodePhase
+  /// Iris database per node: serial_id -> status (Live or Tombstoned).
+  /// Deletions overwrite with dummy data, keeping the slot occupied.
+  var db: int -> (int -> IrisStatus)
+  var db_len: int -> int
+  var mods: int -> Set[Modification]
+  var queue_cursor: int -> int
+  var queue: List[BatchRequest]
+  var next_seq_num: int
+  var published_sync: int -> SyncStateRec
+  var node_ready: int -> bool
+  var current_batch: int -> List[BatchRequest]
+  /// Per-node valid_entries: which batch entries passed decryption.
+  /// After AND across nodes, invalid entries are filtered out.
+  var valid_entries: int -> List[bool]
+  var batch_results: int -> List[RequestResult]
+  var global_mod_id: int
+
+  // ---------------------------------------------------------------------------
+  // Helpers
+  // ---------------------------------------------------------------------------
+
+  pure def max_of(s: Set[int], default: int): int =
+    s.fold(default, (acc, x) => if (x > acc) x else acc)
+
+  pure def min_of(s: Set[int], default: int): int =
+    s.fold(default, (acc, x) => if (x < acc) x else acc)
+
+  /// All occupied serial IDs (both Live and Tombstoned).
+  pure def occupied_ids(iris_db: int -> IrisStatus): Set[int] =
+    iris_db.keys()
+
+  val empty_sync: SyncStateRec = { db_len: 0, mods: Set(), next_seq_num: 0 }
+
+  // ---------------------------------------------------------------------------
+  // Initialization
+  // ---------------------------------------------------------------------------
+
+  action init = all {
+    phase'          = NODES.mapBy(_ => Down),
+    db'             = NODES.mapBy(_ => Map()),
+    db_len'         = NODES.mapBy(_ => 0),
+    mods'           = NODES.mapBy(_ => Set()),
+    queue_cursor'   = NODES.mapBy(_ => 0),
+    queue'          = List(),
+    next_seq_num'   = 1,
+    published_sync' = NODES.mapBy(_ => empty_sync),
+    node_ready'     = NODES.mapBy(_ => false),
+    current_batch'  = NODES.mapBy(_ => List()),
+    valid_entries'  = NODES.mapBy(_ => List()),
+    batch_results'  = NODES.mapBy(_ => List()),
+    global_mod_id'  = 1,
+  }
+
+  // ---------------------------------------------------------------------------
+  // Environment: external requests arrive in the queue
+  // ---------------------------------------------------------------------------
+
+  action enqueue_uniqueness = all {
+    queue.length() < MAX_DB_SIZE,
+    queue' = queue.append({
+      seq_num: next_seq_num,
+      request_id: next_seq_num,
+      request_type: Uniqueness,
+      target_serial_id: 0
+    }),
+    next_seq_num' = next_seq_num + 1,
+    phase' = phase, db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, published_sync' = published_sync,
+    node_ready' = node_ready, current_batch' = current_batch,
+    valid_entries' = valid_entries,
+    batch_results' = batch_results, global_mod_id' = global_mod_id,
+  }
+
+  action enqueue_reauth(target: int): bool = all {
+    target >= 1,
+    queue.length() < MAX_DB_SIZE,
+    queue' = queue.append({
+      seq_num: next_seq_num,
+      request_id: next_seq_num,
+      request_type: Reauth,
+      target_serial_id: target
+    }),
+    next_seq_num' = next_seq_num + 1,
+    phase' = phase, db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, published_sync' = published_sync,
+    node_ready' = node_ready, current_batch' = current_batch,
+    valid_entries' = valid_entries,
+    batch_results' = batch_results, global_mod_id' = global_mod_id,
+  }
+
+  action enqueue_deletion(target: int): bool = all {
+    target >= 1,
+    queue.length() < MAX_DB_SIZE,
+    queue' = queue.append({
+      seq_num: next_seq_num,
+      request_id: next_seq_num,
+      request_type: Deletion,
+      target_serial_id: target
+    }),
+    next_seq_num' = next_seq_num + 1,
+    phase' = phase, db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, published_sync' = published_sync,
+    node_ready' = node_ready, current_batch' = current_batch,
+    valid_entries' = valid_entries,
+    batch_results' = batch_results, global_mod_id' = global_mod_id,
+  }
+
+  // ---------------------------------------------------------------------------
+  // Startup: node starts up, builds sync state, publishes it
+  // ---------------------------------------------------------------------------
+
+  action node_start(node: int): bool =
+    val my_mods = mods.get(node)
+    val max_mod_id = max_of(my_mods.map(m => m.id), 0)
+    val lookback_mods = my_mods.filter(m => m.id > max_mod_id - LOOKBACK)
+    // In the real system, get_next_sns_seq_num peeks at the shared SQS queue.
+    val queue_next = if (queue.length() > 0) queue[0].seq_num else 0
+    val sync_state: SyncStateRec = {
+      db_len: db_len.get(node),
+      mods: lookback_mods,
+      next_seq_num: queue_next
+    }
+    all {
+      phase.get(node) == Down,
+      phase' = phase.set(node, WaitingPeersSync),
+      published_sync' = published_sync.set(node, sync_state),
+      node_ready' = node_ready.set(node, false),
+      current_batch' = current_batch.set(node, List()),
+      valid_entries' = valid_entries.set(node, List()),
+      batch_results' = batch_results.set(node, List()),
+      db' = db, db_len' = db_len, mods' = mods,
+      queue_cursor' = queue_cursor, queue' = queue,
+      next_seq_num' = next_seq_num, global_mod_id' = global_mod_id,
+    }
+
+  // ---------------------------------------------------------------------------
+  // Wait for peers, then begin sync
+  // ---------------------------------------------------------------------------
+
+  action begin_sync(node: int): bool = all {
+    phase.get(node) == WaitingPeersSync,
+    NODES.filter(n => phase.get(n) != Down).forall(n => phase.get(n) != Down),
+    phase' = phase.set(node, SyncingMods),
+    db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, queue' = queue,
+    next_seq_num' = next_seq_num, published_sync' = published_sync,
+    node_ready' = node_ready, current_batch' = current_batch,
+    valid_entries' = valid_entries,
+    batch_results' = batch_results, global_mod_id' = global_mod_id,
+  }
+
+  // ---------------------------------------------------------------------------
+  // Sync modifications: the core compare_modifications algorithm
+  // Mirrors sync.rs lines 218-321 and modifications_sync.rs lines 23-149
+  // ---------------------------------------------------------------------------
+
+  action sync_modifications(node: int): bool =
+    val running = NODES.filter(n => phase.get(n) != Down)
+    val all_published_mods: Set[Modification] = running.fold(Set(), (acc, n) =>
+      acc.union(published_sync.get(n).mods)
+    )
+    val all_mod_ids: Set[int] = all_published_mods.map(m => m.id)
+    // Safety check: max completed mod_id diff <= LOOKBACK (sync.rs 227-249)
+    val completed_max_per_node: Set[int] = running.map(n =>
+      max_of(
+        published_sync.get(n).mods.filter(m => m.status == Completed).map(m => m.id),
+        0
+      )
+    ).filter(x => x > 0)
+    val safe = if (completed_max_per_node.size() <= 1) true
+               else max_of(completed_max_per_node, 0) - min_of(completed_max_per_node, 0) <= LOOKBACK
+    val my_mods = mods.get(node)
+    val local_mod_ids = my_mods.map(m => m.id)
+    // to_delete: mod IDs where ALL nodes have InProgress
+    val to_delete_ids: Set[int] = all_mod_ids.filter(mid =>
+      running.forall(n =>
+        published_sync.get(n).mods.forall(m => m.id != mid or m.status == InProgress)
+      )
+    )
+    // to_update: mod IDs where ANY node has Completed
+    val to_update_ids: Set[int] = all_mod_ids.filter(mid =>
+      running.exists(n =>
+        published_sync.get(n).mods.exists(m => m.id == mid and m.status == Completed)
+      )
+    )
+    // For each to_update mod, find a completed copy
+    val completed_copies: Set[Modification] = to_update_ids.map(mid =>
+      all_published_mods.filter(m => m.id == mid and m.status == Completed).fold(
+        { id: mid, serial_id: 0, request_type: Uniqueness, status: Completed, persisted: false },
+        (_, m) => m
+      )
+    )
+    val kept_mods = my_mods.filter(m =>
+      not(to_delete_ids.contains(m.id)) and not(to_update_ids.contains(m.id))
+    )
+    val updated_mods = completed_copies.filter(m => local_mod_ids.contains(m.id))
+    val new_mods = kept_mods.union(updated_mods)
+    // Apply iris changes for newly-persisted modifications
+    val new_uniqueness_inserts: Set[int] = updated_mods
+      .filter(m => m.persisted and m.request_type == Uniqueness and m.serial_id > 0)
+      .map(m => m.serial_id)
+    // Apply deletion tombstones
+    val new_deletion_targets: Set[int] = updated_mods
+      .filter(m => m.persisted and m.request_type == Deletion and m.serial_id > 0)
+      .map(m => m.serial_id)
+    val my_db = db.get(node)
+    // Add new uniqueness inserts as Live
+    val db_with_inserts = new_uniqueness_inserts.fold(my_db, (acc, sid) =>
+      acc.put(sid, Live)
+    )
+    // Mark deletions as Tombstoned
+    val db_with_deletions = new_deletion_targets.fold(db_with_inserts, (acc, sid) =>
+      if (acc.keys().contains(sid)) acc.set(sid, Tombstoned) else acc
+    )
+    val new_db_len = max_of(db_with_deletions.keys(), 0)
+    all {
+      phase.get(node) == SyncingMods,
+      safe,
+      mods' = mods.set(node, new_mods),
+      db' = db.set(node, db_with_deletions),
+      db_len' = db_len.set(node, new_db_len),
+      phase' = phase.set(node, SyncingQueue),
+      queue_cursor' = queue_cursor, queue' = queue,
+      next_seq_num' = next_seq_num, published_sync' = published_sync,
+      node_ready' = node_ready, current_batch' = current_batch,
+      valid_entries' = valid_entries,
+      batch_results' = batch_results, global_mod_id' = global_mod_id,
+    }
+
+  // ---------------------------------------------------------------------------
+  // Sync SQS queue: advance cursor to max across all nodes
+  // ---------------------------------------------------------------------------
+
+  action sync_queue(node: int): bool =
+    val running = NODES.filter(n => phase.get(n) != Down)
+    val seq_nums: Set[int] = running.map(n => published_sync.get(n).next_seq_num)
+    val any_empty = seq_nums.contains(0)
+    val any_nonempty = seq_nums.exists(s => s > 0)
+    val max_seq = max_of(seq_nums, 0)
+    val cleaned_queue = queue.select(r => r.seq_num >= max_seq)
+    all {
+      phase.get(node) == SyncingQueue,
+      not(any_empty and any_nonempty),
+      queue_cursor' = queue_cursor.set(node, max_seq),
+      queue' = cleaned_queue,
+      phase' = phase.set(node, LoadingDB),
+      db' = db, db_len' = db_len, mods' = mods,
+      next_seq_num' = next_seq_num,
+      published_sync' = published_sync, node_ready' = node_ready,
+      current_batch' = current_batch, valid_entries' = valid_entries,
+      batch_results' = batch_results, global_mod_id' = global_mod_id,
+    }
+
+  // ---------------------------------------------------------------------------
+  // Load database and signal ready
+  // ---------------------------------------------------------------------------
+
+  action finish_loading(node: int): bool = all {
+    phase.get(node) == LoadingDB,
+    phase' = phase.set(node, WaitingPeersReady),
+    node_ready' = node_ready.set(node, true),
+    db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, queue' = queue,
+    next_seq_num' = next_seq_num, published_sync' = published_sync,
+    current_batch' = current_batch, valid_entries' = valid_entries,
+    batch_results' = batch_results, global_mod_id' = global_mod_id,
+  }
+
+  action all_nodes_ready(node: int): bool = all {
+    phase.get(node) == WaitingPeersReady,
+    NODES.filter(n => phase.get(n) != Down).forall(n => node_ready.get(n)),
+    phase' = phase.set(node, Ready),
+    db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, queue' = queue,
+    next_seq_num' = next_seq_num, published_sync' = published_sync,
+    node_ready' = node_ready, current_batch' = current_batch,
+    valid_entries' = valid_entries,
+    batch_results' = batch_results, global_mod_id' = global_mod_id,
+  }
+
+  // ---------------------------------------------------------------------------
+  // Batch processing
+  // ---------------------------------------------------------------------------
+
+  /// All ready nodes receive the next batch. Each node nondeterministically
+  /// determines which entries it can decrypt (valid_entries). The valid_entries
+  /// are AND-ed across all nodes (batch.rs sync_batch_entries, job.rs 241).
+  action receive_batch =
+    val ready_nodes = NODES.filter(n => phase.get(n) == Ready)
+    val min_cursor = min_of(ready_nodes.map(n => queue_cursor.get(n)), 0)
+    val available = queue.select(r => r.seq_num >= min_cursor)
+    val batch_size = if (available.length() > MAX_BATCH_SIZE) MAX_BATCH_SIZE
+                     else available.length()
+    val batch = if (batch_size > 0) available.slice(0, batch_size) else List()
+    val new_cursor = if (batch_size > 0) batch[batch_size - 1].seq_num + 1 else min_cursor
+    // Create IN_PROGRESS modifications for each batch entry
+    val new_modifications: Set[Modification] = range(0, batch_size).foldl(Set(), (acc, i) =>
+      acc.union(Set({
+        id: global_mod_id + i,
+        serial_id: batch[i].target_serial_id,
+        request_type: batch[i].request_type,
+        status: InProgress,
+        persisted: false
+      }))
+    )
+    all {
+      ready_nodes == NODES,
+      batch_size > 0,
+      // Nondeterministically choose valid_entries per node, then AND them.
+      // In the real system, decryption failures cause entries to be invalid.
+      nondet invalid_set = range(0, batch_size).foldl(Set(), (acc, i) =>
+        acc.union(Set(i))
+      ).powerset().oneOf()
+      // AND logic: entry valid only if ALL nodes consider it valid
+      val and_valid: List[bool] = range(0, batch_size).foldl(List(), (acc, i) =>
+        acc.append(not(invalid_set.contains(i)))
+      )
+      all {
+        current_batch' = ready_nodes.fold(current_batch, (acc, n) =>
+          acc.set(n, batch)
+        ),
+        valid_entries' = ready_nodes.fold(valid_entries, (acc, n) =>
+          acc.set(n, and_valid)
+        ),
+        queue_cursor' = ready_nodes.fold(queue_cursor, (acc, n) =>
+          acc.set(n, new_cursor)
+        ),
+        phase' = ready_nodes.fold(phase, (acc, n) =>
+          acc.set(n, Processing)
+        ),
+        mods' = ready_nodes.fold(mods, (acc, n) =>
+          acc.set(n, acc.get(n).union(new_modifications))
+        ),
+        global_mod_id' = global_mod_id + batch_size,
+        db' = db, db_len' = db_len,
+        queue' = queue, next_seq_num' = next_seq_num,
+        published_sync' = published_sync, node_ready' = node_ready,
+        batch_results' = batch_results,
+      }
+    }
+
+  /// MPC processing: nondeterministic oracle decides results.
+  /// For uniqueness: nondeterministically choose match/no-match.
+  /// For reauth: nondeterministically choose success/failure.
+  /// Invalid entries (valid_entries[i] == false) get match=true to skip them.
+  action process_batch =
+    val processing_nodes = NODES.filter(n => phase.get(n) == Processing)
+    val ref_node = min_of(processing_nodes, 0)
+    val batch = current_batch.get(ref_node)
+    val ve = valid_entries.get(ref_node)
+    val batch_size = batch.length()
+    val uniqueness_indices: Set[int] = range(0, batch_size).foldl(Set(), (acc, i) =>
+      if (batch[i].request_type == Uniqueness and ve[i]) acc.union(Set(i)) else acc
+    )
+    val reauth_indices: Set[int] = range(0, batch_size).foldl(Set(), (acc, i) =>
+      if (batch[i].request_type == Reauth and ve[i]) acc.union(Set(i)) else acc
+    )
+    all {
+      processing_nodes == NODES,
+      batch_size > 0,
+      nondet match_set = uniqueness_indices.powerset().oneOf()
+      nondet reauth_fail_set = reauth_indices.powerset().oneOf()
+      val base_serial = max_of(processing_nodes.map(n => db_len.get(n)), 0)
+      val results: List[RequestResult] = range(0, batch_size).foldl(
+        { res: List(), next_id: base_serial + 1 },
+        (state, i) =>
+          if (not(ve[i])) {
+            // Invalid entry: skip (treated as match / no-op)
+            { res: state.res.append({ is_match: true, assigned_serial_id: 0, success: false }),
+              next_id: state.next_id }
+          } else if (batch[i].request_type == Uniqueness) {
+            if (match_set.contains(i)) {
+              { res: state.res.append({ is_match: true, assigned_serial_id: 0, success: true }),
+                next_id: state.next_id }
+            } else {
+              { res: state.res.append({ is_match: false, assigned_serial_id: state.next_id, success: true }),
+                next_id: state.next_id + 1 }
+            }
+          } else if (batch[i].request_type == Reauth) {
+            // Reauth can fail (e.g., no matching iris found)
+            val succeeded = not(reauth_fail_set.contains(i))
+            { res: state.res.append({ is_match: false, assigned_serial_id: 0, success: succeeded }),
+              next_id: state.next_id }
+          } else {
+            // Deletion, Update: always succeed
+            { res: state.res.append({ is_match: false, assigned_serial_id: 0, success: true }),
+              next_id: state.next_id }
+          }
+      ).res
+      all {
+        batch_results' = processing_nodes.fold(batch_results, (acc, n) =>
+          acc.set(n, results)
+        ),
+        phase' = processing_nodes.fold(phase, (acc, n) =>
+          acc.set(n, PersistingResults)
+        ),
+        db' = db, db_len' = db_len, mods' = mods,
+        queue_cursor' = queue_cursor, queue' = queue,
+        next_seq_num' = next_seq_num, published_sync' = published_sync,
+        node_ready' = node_ready, current_batch' = current_batch,
+        valid_entries' = valid_entries,
+        global_mod_id' = global_mod_id,
+      }
+    }
+
+  /// Persist results to database (atomic transaction per node).
+  /// - Non-matching uniqueness: insert new iris as Live
+  /// - Successful reauth: update existing iris (persisted=true)
+  /// - Failed reauth: no DB write (persisted=false)
+  /// - Deletion: overwrite iris with dummy data (Tombstoned), persisted=true
+  /// - Invalid entries: persisted=false
+  action persist_results(node: int): bool =
+    val batch = current_batch.get(node)
+    val results = batch_results.get(node)
+    val ve = valid_entries.get(node)
+    val batch_size = batch.length()
+    val batch_mod_base = global_mod_id - batch_size
+    // New iris IDs from non-matching uniqueness (insert as Live)
+    val new_iris_ids: Set[int] = range(0, batch_size).foldl(Set(), (acc, i) =>
+      if (ve[i] and batch[i].request_type == Uniqueness and not(results[i].is_match)
+          and results[i].assigned_serial_id > 0) {
+        acc.union(Set(results[i].assigned_serial_id))
+      } else {
+        acc
+      }
+    )
+    // Deletion targets: overwrite with tombstone
+    val deletion_targets: Set[int] = range(0, batch_size).foldl(Set(), (acc, i) =>
+      if (ve[i] and batch[i].request_type == Deletion and batch[i].target_serial_id > 0) {
+        acc.union(Set(batch[i].target_serial_id))
+      } else {
+        acc
+      }
+    )
+    // Update modifications to COMPLETED
+    val my_mods = mods.get(node)
+    val updated_mods: Set[Modification] = my_mods.map(m =>
+      if (m.id >= batch_mod_base and m.id < global_mod_id and m.status == InProgress) {
+        val batch_idx = m.id - batch_mod_base
+        val res = results[batch_idx]
+        val entry_valid = ve[batch_idx]
+        val was_persisted = if (not(entry_valid)) false
+          else if (batch[batch_idx].request_type == Uniqueness) not(res.is_match)
+          else if (batch[batch_idx].request_type == Reauth) res.success
+          else true  // Deletion, Update: always persisted
+        val new_serial = if (batch[batch_idx].request_type == Uniqueness and not(res.is_match) and entry_valid)
+                           res.assigned_serial_id
+                         else m.serial_id
+        { ...m, status: Completed, persisted: was_persisted, serial_id: new_serial }
+      } else {
+        m
+      }
+    )
+    // Apply DB changes
+    val my_db = db.get(node)
+    val db_with_inserts = new_iris_ids.fold(my_db, (acc, sid) => acc.put(sid, Live))
+    val db_with_tombstones = deletion_targets.fold(db_with_inserts, (acc, sid) =>
+      if (acc.keys().contains(sid)) acc.set(sid, Tombstoned) else acc
+    )
+    val new_db_len = max_of(db_with_tombstones.keys(), 0)
+    all {
+      phase.get(node) == PersistingResults,
+      batch_size > 0,
+      db' = db.set(node, db_with_tombstones),
+      db_len' = db_len.set(node, new_db_len),
+      mods' = mods.set(node, updated_mods),
+      phase' = phase.set(node, Ready),
+      current_batch' = current_batch.set(node, List()),
+      valid_entries' = valid_entries.set(node, List()),
+      batch_results' = batch_results.set(node, List()),
+      queue_cursor' = queue_cursor, queue' = queue,
+      next_seq_num' = next_seq_num, published_sync' = published_sync,
+      node_ready' = node_ready, global_mod_id' = global_mod_id,
+    }
+
+  // ---------------------------------------------------------------------------
+  // Crash, shutdown, restart
+  // ---------------------------------------------------------------------------
+
+  /// A node crashes. In-memory state is lost but DB persists.
+  action crash(node: int): bool = all {
+    phase.get(node) != Down,
+    phase' = phase.set(node, Down),
+    node_ready' = node_ready.set(node, false),
+    current_batch' = current_batch.set(node, List()),
+    valid_entries' = valid_entries.set(node, List()),
+    batch_results' = batch_results.set(node, List()),
+    db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, queue' = queue,
+    next_seq_num' = next_seq_num, published_sync' = published_sync,
+    global_mod_id' = global_mod_id,
+  }
+
+  /// Graceful shutdown: node finishes pending batch then stops.
+  /// If in PersistingResults, persist first then go Down.
+  /// Otherwise, go Down immediately (main loop won't start new batches).
+  action graceful_shutdown(node: int): bool = all {
+    phase.get(node) == Ready or phase.get(node) == WaitingPeersReady,
+    phase' = phase.set(node, Down),
+    node_ready' = node_ready.set(node, false),
+    current_batch' = current_batch.set(node, List()),
+    valid_entries' = valid_entries.set(node, List()),
+    batch_results' = batch_results.set(node, List()),
+    db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, queue' = queue,
+    next_seq_num' = next_seq_num, published_sync' = published_sync,
+    global_mod_id' = global_mod_id,
+  }
+
+  // ---------------------------------------------------------------------------
+  // Stuttering
+  // ---------------------------------------------------------------------------
+
+  action stuttering = all {
+    phase' = phase, db' = db, db_len' = db_len, mods' = mods,
+    queue_cursor' = queue_cursor, queue' = queue,
+    next_seq_num' = next_seq_num, published_sync' = published_sync,
+    node_ready' = node_ready, current_batch' = current_batch,
+    valid_entries' = valid_entries,
+    batch_results' = batch_results, global_mod_id' = global_mod_id,
+  }
+
+  // ---------------------------------------------------------------------------
+  // Step
+  // ---------------------------------------------------------------------------
+
+  action step = any {
+    enqueue_uniqueness,
+    nondet target = 1.to(MAX_DB_SIZE).oneOf()
+    enqueue_reauth(target),
+    nondet target = 1.to(MAX_DB_SIZE).oneOf()
+    enqueue_deletion(target),
+
+    nondet node = NODES.oneOf()
+    any {
+      node_start(node),
+      begin_sync(node),
+      sync_modifications(node),
+      sync_queue(node),
+      finish_loading(node),
+      all_nodes_ready(node),
+      persist_results(node),
+      crash(node),
+      graceful_shutdown(node),
+    },
+
+    receive_batch,
+    process_batch,
+    stuttering,
+  }
+
+  // ---------------------------------------------------------------------------
+  // Invariants
+  // ---------------------------------------------------------------------------
+
+  /// Serial ID contiguity: occupied IDs form range 1..db_len.
+  /// Tombstoned entries still occupy their slot (deletion does not compact).
+  val serial_id_contiguity: bool =
+    NODES.forall(n => db.get(n).keys() == 1.to(db_len.get(n)))
+
+  /// DB length equals count of occupied slots (including tombstones).
+  val db_len_consistent: bool =
+    NODES.forall(n => db_len.get(n) == db.get(n).keys().size())
+
+  /// DB size bounded.
+  val db_size_bounded: bool =
+    NODES.forall(n => db_len.get(n) <= MAX_DB_SIZE)
+
+  /// Modification ID safety window across running nodes.
+  val mod_id_safety_window: bool = {
+    val running = NODES.filter(n => phase.get(n) != Down)
+    val completed_maxes: Set[int] = running.map(n =>
+      max_of(mods.get(n).filter(m => m.status == Completed).map(m => m.id), 0)
+    ).filter(x => x > 0)
+    if (completed_maxes.size() <= 1) true
+    else max_of(completed_maxes, 0) - min_of(completed_maxes, 0) <= LOOKBACK
+  }
+
+  /// Batch consistency: all batch-phase nodes have same batch.
+  val batch_consistency: bool = {
+    val batch_nodes = NODES.filter(n =>
+      phase.get(n) == Processing or phase.get(n) == PersistingResults
+    )
+    batch_nodes.forall(n1 => batch_nodes.forall(n2 =>
+      current_batch.get(n1) == current_batch.get(n2)
+    ))
+  }
+
+  /// When all nodes are Ready, their completed mods agree.
+  val ready_state_consistency: bool = {
+    val ready_nodes = NODES.filter(n => phase.get(n) == Ready)
+    if (ready_nodes.size() < 2) true
+    else ready_nodes.forall(n1 => ready_nodes.forall(n2 =>
+      mods.get(n1).filter(m => m.status == Completed).map(m => m.id)
+      ==
+      mods.get(n2).filter(m => m.status == Completed).map(m => m.id)
+    ))
+  }
+
+  /// When all nodes are Ready, their databases are identical.
+  val ready_db_consistency: bool = {
+    val ready_nodes = NODES.filter(n => phase.get(n) == Ready)
+    if (ready_nodes.size() < 2) true
+    else ready_nodes.forall(n1 => ready_nodes.forall(n2 =>
+      db.get(n1) == db.get(n2) and db_len.get(n1) == db_len.get(n2)
+    ))
+  }
+
+  /// Completed persisted uniqueness mods must have their iris in db.
+  val persisted_mods_have_iris: bool =
+    NODES.forall(n =>
+      mods.get(n).forall(m =>
+        not(m.status == Completed and m.persisted and m.request_type == Uniqueness and m.serial_id > 0)
+        or
+        db.get(n).keys().contains(m.serial_id)
+      )
+    )
+
+  /// If two nodes both completed the same modification (by ID),
+  /// they must agree on serial_id and persisted flag.
+  val completed_mods_agree: bool =
+    NODES.forall(n1 => NODES.forall(n2 =>
+      mods.get(n1).forall(m1 =>
+        mods.get(n2).forall(m2 =>
+          not(m1.id == m2.id and m1.status == Completed and m2.status == Completed)
+          or
+          (m1.serial_id == m2.serial_id and m1.persisted == m2.persisted)
+        )
+      )
+    ))
+
+  /// Same mod ID across nodes must have same request_type.
+  val mod_ids_consistent: bool =
+    NODES.forall(n1 => NODES.forall(n2 =>
+      mods.get(n1).forall(m1 =>
+        mods.get(n2).forall(m2 =>
+          not(m1.id == m2.id)
+          or
+          m1.request_type == m2.request_type
+        )
+      )
+    ))
+
+  /// No two persisted uniqueness mods on same node have same serial_id.
+  val no_duplicate_serial_ids: bool =
+    NODES.forall(n =>
+      mods.get(n).forall(m1 =>
+        mods.get(n).forall(m2 =>
+          not(m1.id != m2.id
+              and m1.request_type == Uniqueness and m2.request_type == Uniqueness
+              and m1.persisted and m2.persisted
+              and m1.serial_id > 0 and m2.serial_id > 0)
+          or
+          m1.serial_id != m2.serial_id
+        )
+      )
+    )
+
+  /// Deleted iris entries must be Tombstoned, not Live.
+  /// If a completed persisted deletion mod references a serial_id,
+  /// that entry should be Tombstoned in the node's DB.
+  val deletions_are_tombstoned: bool =
+    NODES.forall(n =>
+      mods.get(n).forall(m =>
+        not(m.status == Completed and m.persisted and m.request_type == Deletion
+            and m.serial_id > 0 and db.get(n).keys().contains(m.serial_id))
+        or
+        db.get(n).get(m.serial_id) == Tombstoned
+      )
+    )
+
+  /// Failed reauths must NOT be persisted.
+  val failed_reauth_not_persisted: bool =
+    NODES.forall(n =>
+      mods.get(n).forall(m =>
+        // If it's a completed reauth with persisted=false, that's fine (failed).
+        // What we check: if persisted=true, there must be a matching iris.
+        not(m.status == Completed and m.request_type == Reauth
+            and m.persisted and m.serial_id > 0)
+        or
+        db.get(n).keys().contains(m.serial_id)
+      )
+    )
+
+  /// Valid entries consistency: during batch processing, all nodes have
+  /// the same valid_entries vector (AND logic ensures consensus).
+  val valid_entries_consistency: bool = {
+    val batch_nodes = NODES.filter(n =>
+      phase.get(n) == Processing or phase.get(n) == PersistingResults
+    )
+    batch_nodes.forall(n1 => batch_nodes.forall(n2 =>
+      valid_entries.get(n1) == valid_entries.get(n2)
+    ))
+  }
+
+  val all_invariants: bool = and {
+    serial_id_contiguity,
+    db_len_consistent,
+    db_size_bounded,
+    mod_id_safety_window,
+    batch_consistency,
+    ready_state_consistency,
+    ready_db_consistency,
+    persisted_mods_have_iris,
+    completed_mods_agree,
+    mod_ids_consistent,
+    no_duplicate_serial_ids,
+    deletions_are_tombstoned,
+    failed_reauth_not_persisted,
+    valid_entries_consistency,
+  }
+
+  // ---------------------------------------------------------------------------
+  // Test runs
+  // ---------------------------------------------------------------------------
+
+  /// Happy path: 3 nodes start, sync (empty state), process one batch.
+  run happy_path_test = {
+    init
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(persist_results(2))
+      .then(all {
+        assert(all_invariants),
+        stuttering,
+      })
+  }
+
+  /// Two sequential batches, second includes a deletion.
+  run two_batches_with_deletion_test = {
+    init
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      // First batch: insert 2 irises
+      .then(enqueue_uniqueness)
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(persist_results(2))
+      // Second batch: delete iris 1
+      .then(enqueue_deletion(1))
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(persist_results(2))
+      .then(all {
+        assert(all_invariants),
+        // Serial ID 1 should still exist but be Tombstoned
+        assert(deletions_are_tombstoned),
+        stuttering,
+      })
+  }
+
+  /// Node 2 crashes after MPC but before persist. Recovery via sync.
+  run crash_after_processing_test = {
+    init
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(crash(2))
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(crash(0))
+      .then(crash(1))
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(all {
+        assert(ready_db_consistency),
+        assert(ready_state_consistency),
+        assert(persisted_mods_have_iris),
+        stuttering,
+      })
+  }
+
+  /// Crash recovery: node 2 crashes after persist_results of 0,1.
+  run crash_recovery_test = {
+    init
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(crash(2))
+      .then(crash(0))
+      .then(crash(1))
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(all {
+        assert(ready_db_consistency),
+        assert(ready_state_consistency),
+        stuttering,
+      })
+  }
+
+  /// Multiple batches with a crash between them.
+  run multi_batch_crash_test = {
+    init
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      // Batch 1: all succeed
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(persist_results(2))
+      // Batch 2: node 2 crashes after persist of 0,1
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(crash(2))
+      // Full restart and sync
+      .then(crash(0))
+      .then(crash(1))
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(all {
+        assert(all_invariants),
+        stuttering,
+      })
+  }
+
+  /// Graceful shutdown test: node shuts down cleanly.
+  run graceful_shutdown_test = {
+    init
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(persist_results(2))
+      // Graceful shutdown of node 0
+      .then(graceful_shutdown(0))
+      // Restart all
+      .then(crash(1))
+      .then(crash(2))
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(all {
+        assert(all_invariants),
+        stuttering,
+      })
+  }
+  /// Mixed batch: uniqueness + reauth + deletion in sequence.
+  run mixed_operations_test = {
+    init
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      // Insert 3 irises
+      .then(enqueue_uniqueness)
+      .then(enqueue_uniqueness)
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(persist_results(2))
+      // Reauth iris 1, delete iris 2
+      .then(enqueue_reauth(1))
+      .then(enqueue_deletion(2))
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(persist_results(2))
+      .then(all {
+        assert(all_invariants),
+        stuttering,
+      })
+  }
+
+  /// Crash after mixed batch with deletion, verify tombstone survives sync.
+  run deletion_crash_recovery_test = {
+    init
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      // Insert iris
+      .then(enqueue_uniqueness)
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(persist_results(2))
+      // Delete iris 1, node 2 crashes before persist
+      .then(enqueue_deletion(1))
+      .then(receive_batch)
+      .then(process_batch)
+      .then(persist_results(0))
+      .then(persist_results(1))
+      .then(crash(2))
+      // Restart and sync
+      .then(crash(0))
+      .then(crash(1))
+      .then(node_start(0))
+      .then(node_start(1))
+      .then(node_start(2))
+      .then(begin_sync(0))
+      .then(begin_sync(1))
+      .then(begin_sync(2))
+      .then(sync_modifications(0))
+      .then(sync_modifications(1))
+      .then(sync_modifications(2))
+      .then(sync_queue(0))
+      .then(sync_queue(1))
+      .then(sync_queue(2))
+      .then(finish_loading(0))
+      .then(finish_loading(1))
+      .then(finish_loading(2))
+      .then(all_nodes_ready(0))
+      .then(all_nodes_ready(1))
+      .then(all_nodes_ready(2))
+      .then(all {
+        assert(ready_db_consistency),
+        assert(ready_state_consistency),
+        assert(deletions_are_tombstoned),
+        stuttering,
+      })
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Concrete instance for model checking
+// ---------------------------------------------------------------------------
+
+module iris_mpc_server_3 {
+  import iris_mpc_server(
+    NODES = Set(0, 1, 2),
+    MAX_DB_SIZE = 8,
+    MAX_BATCH_SIZE = 3,
+    LOOKBACK = 6
+  ).*
+}

From d66d314463c70b2e854f333bb4322261c4bd3d77 Mon Sep 17 00:00:00 2001
From: Philipp Sippl <philsippl@gmail.com>
Date: Mon, 27 Apr 2026 14:35:09 +0200
Subject: [PATCH 76/76] rerand_enabled config and check

---
 .../bin/iris-mpc-upgrade/rerandomize_db.rs    | 12 +++
 iris-mpc-bins/bin/iris-mpc/server.rs          | 16 +++-
 iris-mpc-common/src/config/mod.rs             |  6 ++
 iris-mpc-store/src/rerand.rs                  | 84 +++++++++++++++++++
 iris-mpc-upgrade/src/config.rs                |  6 ++
 iris-mpc-upgrade/src/continuous_rerand.rs     |  7 ++
 iris-mpc-upgrade/tests/test_utils.rs          |  1 +
 iris-mpc/src/server/mod.rs                    | 16 +++-
 ...260226000005_add_worker_heartbeat.down.sql |  1 +
 ...20260226000005_add_worker_heartbeat.up.sql |  2 +
 10 files changed, 149 insertions(+), 2 deletions(-)
 create mode 100644 migrations/20260226000005_add_worker_heartbeat.down.sql
 create mode 100644 migrations/20260226000005_add_worker_heartbeat.up.sql

diff --git a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
index d509391d36..27c650b07a 100644
--- a/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
+++ b/iris-mpc-bins/bin/iris-mpc-upgrade/rerandomize_db.rs
@@ -588,6 +588,18 @@ async fn rerandomize_continuous_main(config: RerandomizeContinuousConfig) -> Res
         PostgresClient::new(&config.db_url, &config.schema_name, AccessMode::ReadWrite).await?;
     let store = Store::new(&postgres_client).await?;
 
+    // Publish a DB-side heartbeat so the main server can detect that a rerand
+    // worker is actively running. The main server refuses to start with
+    // `rerand_enabled=false` while this heartbeat is fresh, catching the
+    // dangerous "worker deployed but server says disabled" misconfig.
+    let heartbeat_pool = store.pool.clone();
+    let heartbeat_cancel = cancel.clone();
+    let _heartbeat_abort = background_tasks.spawn(async move {
+        iris_mpc_store::rerand::run_worker_heartbeat_loop(&heartbeat_pool, heartbeat_cancel).await;
+        Ok(())
+    });
+    background_tasks.check_tasks();
+
     continuous_rerand::run_continuous_rerand(
         &config,
         &s3_client,
diff --git a/iris-mpc-bins/bin/iris-mpc/server.rs b/iris-mpc-bins/bin/iris-mpc/server.rs
index 79258b1f5a..cacf7e997c 100644
--- a/iris-mpc-bins/bin/iris-mpc/server.rs
+++ b/iris-mpc-bins/bin/iris-mpc/server.rs
@@ -390,7 +390,7 @@ async fn server_main(config: Config) -> Result<()> {
     }
 
     // --- Coordinated rerand freeze with watermark convergence ---
-    {
+    if config.rerand_enabled {
         eyre::ensure!(
             server_coord_config.node_hostnames.len() == server_coord_config.healthcheck_ports.len(),
             "node_hostnames ({}) and healthcheck_ports ({}) must have the same length",
@@ -406,6 +406,20 @@ async fn server_main(config: Config) -> Result<()> {
             .map(|(_, (h, p))| -> eyre::Result<_> { Ok((h.as_str(), p.parse::<usize>()?)) })
             .collect::<eyre::Result<Vec<_>>>()?;
         rerand_store::freeze_and_verify_watermarks(&store.pool, &peer_addrs).await?;
+    } else if rerand_store::is_worker_alive(&store.pool).await? {
+        // Worker heartbeat is fresh but this server is configured with rerand
+        // off. Starting up now would skip freeze/watermark coordination and
+        // risk loading a cross-party-inconsistent DB snapshot. Fail closed.
+        eyre::bail!(
+            "rerand_enabled=false in config but the rerand worker is alive \
+             (heartbeat within the last {:?}). Either set SMPC__RERAND_ENABLED=true, \
+             or stop the rerand worker on all parties before restarting this server.",
+            rerand_store::WORKER_HEARTBEAT_STALE_AFTER,
+        );
+    } else {
+        tracing::info!(
+            "rerand_enabled=false and no fresh worker heartbeat — skipping rerand coordination"
+        );
     }
     // Worker is now frozen with verified equal watermarks.
     // Everything from here until freeze release must be wrapped so that
diff --git a/iris-mpc-common/src/config/mod.rs b/iris-mpc-common/src/config/mod.rs
index 63c4816cdb..48b17685c9 100644
--- a/iris-mpc-common/src/config/mod.rs
+++ b/iris-mpc-common/src/config/mod.rs
@@ -255,6 +255,9 @@ pub struct Config {
     #[serde(default)]
     pub enable_modifications_replay: bool,
 
+    #[serde(default)]
+    pub rerand_enabled: bool,
+
     #[serde(default = "default_pprof_s3_bucket")]
     pub pprof_s3_bucket: String,
 
@@ -664,6 +667,7 @@ pub struct CommonConfig {
     max_modifications_lookback: usize,
     enable_modifications_sync: bool,
     enable_modifications_replay: bool,
+    rerand_enabled: bool,
     sqs_sync_long_poll_seconds: i32,
     schema_name: String,
     hnsw_schema_name_suffix: String,
@@ -750,6 +754,7 @@ impl From<Config> for CommonConfig {
             max_modifications_lookback,
             enable_modifications_sync,
             enable_modifications_replay,
+            rerand_enabled,
             sqs_sync_long_poll_seconds,
             schema_name,
             hnsw_schema_name_suffix,
@@ -819,6 +824,7 @@ impl From<Config> for CommonConfig {
             max_modifications_lookback,
             enable_modifications_sync,
             enable_modifications_replay,
+            rerand_enabled,
             sqs_sync_long_poll_seconds,
             schema_name,
             hnsw_schema_name_suffix,
diff --git a/iris-mpc-store/src/rerand.rs b/iris-mpc-store/src/rerand.rs
index 44f69a4d81..00dcc77f83 100644
--- a/iris-mpc-store/src/rerand.rs
+++ b/iris-mpc-store/src/rerand.rs
@@ -388,6 +388,90 @@ fn is_undefined_table_sqlx(err: &sqlx::Error) -> bool {
     false
 }
 
+// ---------------------------------------------------------------------------
+// Worker heartbeat: lets the main server detect that a rerand worker is
+// actively running, independently of any config flag. Paired with the
+// server's `rerand_enabled` config flag to catch misconfigs at startup.
+// ---------------------------------------------------------------------------
+
+/// How often the worker writes its heartbeat while alive.
+pub const WORKER_HEARTBEAT_WRITE_INTERVAL: Duration = Duration::from_secs(10);
+
+/// How long after the last heartbeat we still consider the worker alive.
+/// Must be comfortably larger than `WORKER_HEARTBEAT_WRITE_INTERVAL` to avoid
+/// false "dead" verdicts during transient DB lag or worker restarts.
+pub const WORKER_HEARTBEAT_STALE_AFTER: Duration = Duration::from_secs(60);
+
+fn is_pre_heartbeat_schema(err: &sqlx::Error) -> bool {
+    // 42P01 = undefined_table (rerand_control doesn't exist yet).
+    // 42703 = undefined_column (heartbeat column not yet migrated).
+    if !rerand_control_exists(err) {
+        return true;
+    }
+    if let sqlx::Error::Database(pg) = err {
+        return pg.code().as_deref() == Some("42703");
+    }
+    false
+}
+
+/// Write `NOW()` into `rerand_control.worker_last_heartbeat`.
+///
+/// Silently succeeds when the table or column does not exist yet
+/// (pre-migration); the worker can still run before the server deploys the
+/// heartbeat migration.
+pub async fn write_worker_heartbeat(pool: &PgPool) -> Result<()> {
+    match sqlx::query("UPDATE rerand_control SET worker_last_heartbeat = NOW() WHERE id = 1")
+        .execute(pool)
+        .await
+    {
+        Ok(_) => Ok(()),
+        Err(e) if is_pre_heartbeat_schema(&e) => Ok(()),
+        Err(e) => Err(e.into()),
+    }
+}
+
+/// Drive the worker heartbeat in a loop until cancelled.
+///
+/// Writes an immediate heartbeat on entry, then writes one every
+/// `WORKER_HEARTBEAT_WRITE_INTERVAL` until `cancel` is triggered. Errors are
+/// logged but do not terminate the loop — a transient DB outage should not
+/// take the worker down.
+pub async fn run_worker_heartbeat_loop(pool: &PgPool, cancel: tokio_util::sync::CancellationToken) {
+    loop {
+        if let Err(e) = write_worker_heartbeat(pool).await {
+            tracing::warn!("Failed to write rerand worker heartbeat: {:?}", e);
+        }
+        tokio::select! {
+            _ = cancel.cancelled() => return,
+            _ = tokio::time::sleep(WORKER_HEARTBEAT_WRITE_INTERVAL) => {}
+        }
+    }
+}
+
+/// Returns `true` iff `rerand_control.worker_last_heartbeat` is set and
+/// younger than `WORKER_HEARTBEAT_STALE_AFTER`.
+///
+/// Returns `false` when the table or column does not exist (pre-migration),
+/// when no heartbeat has ever been written, or when the last heartbeat is
+/// older than the staleness threshold.
+pub async fn is_worker_alive(pool: &PgPool) -> Result<bool> {
+    let stale_secs = WORKER_HEARTBEAT_STALE_AFTER.as_secs() as i64;
+    let row = sqlx::query_as::<_, (Option<bool>,)>(
+        "SELECT worker_last_heartbeat > NOW() - make_interval(secs => $1) \
+         FROM rerand_control WHERE id = 1",
+    )
+    .bind(stale_secs)
+    .fetch_optional(pool)
+    .await;
+
+    match row {
+        Ok(Some((Some(true),))) => Ok(true),
+        Ok(_) => Ok(false),
+        Err(e) if is_pre_heartbeat_schema(&e) => Ok(false),
+        Err(e) => Err(e.into()),
+    }
+}
+
 // ---------------------------------------------------------------------------
 // Freeze protocol: coordinated pause of the rerand worker during startup
 // ---------------------------------------------------------------------------
diff --git a/iris-mpc-upgrade/src/config.rs b/iris-mpc-upgrade/src/config.rs
index 91e7320d25..49568a2aff 100644
--- a/iris-mpc-upgrade/src/config.rs
+++ b/iris-mpc-upgrade/src/config.rs
@@ -377,4 +377,10 @@ pub struct RerandomizeContinuousConfig {
 
     #[clap(long, default_value = "3000", env = "HEALTHCHECK_PORT")]
     pub healthcheck_port: usize,
+
+    /// Must match the server's `SMPC__RERAND_ENABLED` setting. The worker
+    /// refuses to start when this is `false` so that an accidentally-deployed
+    /// worker can't run against a server that doesn't expect it.
+    #[clap(long, default_value = "false", env = "RERAND_ENABLED")]
+    pub rerand_enabled: bool,
 }
diff --git a/iris-mpc-upgrade/src/continuous_rerand.rs b/iris-mpc-upgrade/src/continuous_rerand.rs
index c857904b58..5e1e381df5 100644
--- a/iris-mpc-upgrade/src/continuous_rerand.rs
+++ b/iris-mpc-upgrade/src/continuous_rerand.rs
@@ -50,6 +50,13 @@ pub async fn run_continuous_rerand(
         eyre::bail!("s3_poll_interval_ms must be > 0");
     }
 
+    if !config.rerand_enabled {
+        eyre::bail!(
+            "RERAND_ENABLED is false — continuous rerand worker exiting. \
+             Set RERAND_ENABLED=true on the worker (and SMPC__RERAND_ENABLED=true on the server) to enable."
+        );
+    }
+
     let pool = &store.pool;
     let staging_schema = staging_schema_name(&store.schema_name);
     let poll_interval = Duration::from_millis(config.s3_poll_interval_ms);
diff --git a/iris-mpc-upgrade/tests/test_utils.rs b/iris-mpc-upgrade/tests/test_utils.rs
index 7e6fc45f68..dbb94b7ad3 100644
--- a/iris-mpc-upgrade/tests/test_utils.rs
+++ b/iris-mpc-upgrade/tests/test_utils.rs
@@ -147,6 +147,7 @@ impl TestEnv {
             safety_buffer_ids: 0,
             s3_poll_interval_ms: 200,
             healthcheck_port: 3020 + party_id as usize,
+            rerand_enabled: true,
         }
     }
 
diff --git a/iris-mpc/src/server/mod.rs b/iris-mpc/src/server/mod.rs
index 7c71a3eeb8..9ac9fac5c6 100644
--- a/iris-mpc/src/server/mod.rs
+++ b/iris-mpc/src/server/mod.rs
@@ -174,7 +174,7 @@ pub async fn server_main(config: Config) -> Result<()> {
     sync_sqs_queues(&config, &sync_result, &aws_clients).await?;
 
     // --- Coordinated rerand freeze with watermark convergence ---
-    {
+    if config.rerand_enabled {
         let sc = config.server_coordination.as_ref().unwrap();
         eyre::ensure!(
             sc.node_hostnames.len() == sc.healthcheck_ports.len(),
@@ -191,6 +191,20 @@ pub async fn server_main(config: Config) -> Result<()> {
             .map(|(_, (h, p))| -> eyre::Result<_> { Ok((h.as_str(), p.parse::<usize>()?)) })
             .collect::<eyre::Result<Vec<_>>>()?;
         rerand_store::freeze_and_verify_watermarks(&iris_store.pool, &peer_addrs).await?;
+    } else if rerand_store::is_worker_alive(&iris_store.pool).await? {
+        // Worker heartbeat is fresh but this server is configured with rerand
+        // off. Starting up now would skip freeze/watermark coordination and
+        // risk loading a cross-party-inconsistent DB snapshot. Fail closed.
+        eyre::bail!(
+            "rerand_enabled=false in config but the rerand worker is alive \
+             (heartbeat within the last {:?}). Either set SMPC__RERAND_ENABLED=true, \
+             or stop the rerand worker on all parties before restarting this server.",
+            rerand_store::WORKER_HEARTBEAT_STALE_AFTER,
+        );
+    } else {
+        tracing::info!(
+            "rerand_enabled=false and no fresh worker heartbeat — skipping rerand coordination"
+        );
     }
     // Worker is now frozen with verified equal watermarks.
     // Everything from here until freeze release must be wrapped so that
diff --git a/migrations/20260226000005_add_worker_heartbeat.down.sql b/migrations/20260226000005_add_worker_heartbeat.down.sql
new file mode 100644
index 0000000000..f8b72d943c
--- /dev/null
+++ b/migrations/20260226000005_add_worker_heartbeat.down.sql
@@ -0,0 +1 @@
+ALTER TABLE rerand_control DROP COLUMN IF EXISTS worker_last_heartbeat;
diff --git a/migrations/20260226000005_add_worker_heartbeat.up.sql b/migrations/20260226000005_add_worker_heartbeat.up.sql
new file mode 100644
index 0000000000..73c4c14e1a
--- /dev/null
+++ b/migrations/20260226000005_add_worker_heartbeat.up.sql
@@ -0,0 +1,2 @@
+ALTER TABLE rerand_control
+    ADD COLUMN IF NOT EXISTS worker_last_heartbeat TIMESTAMPTZ;