From a379befdbf85ebf1536c292e2bd96a2cb0b96001 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 25 Jun 2026 07:57:39 -0500 Subject: [PATCH 1/9] fix(node): NFC-normalize visibility matcher to close NFC/NFD under-withholding (#101) A deny rule authored in NFC byte-compared unequal to a path committed in NFD (or vice versa), so blob_paths/withheld_blob_oids failed to classify the blob and it shipped in cleartext on the replication/pack path. Normalize both the glob prefix and the candidate path to NFC at the single matcher seam (glob_matches + specificity); NFC not NFKC so compatibility forms are not folded into distinct paths. Live serve is unaffected (git resolves byte-exact; the gate only becomes more restrictive). Adds a matcher unit test and a replication-path integration test with the variant byte inside the rule-covered directory name. --- Cargo.lock | 1 + crates/gitlawb-node/Cargo.toml | 3 + .../gitlawb-node/src/git/visibility_pack.rs | 70 +++++++++++++++++++ crates/gitlawb-node/src/visibility.rs | 55 ++++++++++++++- 4 files changed, 127 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2866a23..849e081 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3403,6 +3403,7 @@ dependencies = [ "tower-http", "tracing", "tracing-subscriber", + "unicode-normalization", "uuid", "zstd", ] diff --git a/crates/gitlawb-node/Cargo.toml b/crates/gitlawb-node/Cargo.toml index 881d4bd..2c4a329 100644 --- a/crates/gitlawb-node/Cargo.toml +++ b/crates/gitlawb-node/Cargo.toml @@ -39,6 +39,9 @@ hmac = { workspace = true } http-body-util = "0.1" tokio-util = { version = "0.7", features = ["io"] } dirs-next = "2" +# NFC-normalize visibility globs/paths at the matcher seam so a deny rule +# withholds a canonically-equivalent path regardless of NFC/NFD form (#101). +unicode-normalization = "0.1" reqwest = { workspace = true } libp2p-core = "0.43" libp2p-gossipsub = "0.49.4" diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index 9c4d804..7428b87 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -836,6 +836,76 @@ mod tests { ); } + #[test] + fn withholds_secret_blob_across_nfc_nfd_normalization_skew() { + // #101: the secret lives under a directory whose name is committed in NFD + // ("se" + combining acute U+0301), while the deny rule is authored in NFC + // ("é" = U+00E9). The variant byte sits INSIDE the rule-covered directory + // name, so a byte-exact matcher under-withholds and leaks the blob on the + // replication path. NFC normalization at the matcher seam closes it. (The + // sibling café.txt test does not exercise this: there the rule prefix + // "/secret" is pure ASCII and byte-identical regardless of how é is encoded + // in the filename, so it passes for the wrong reason.) + let nfd_dir = "se\u{0301}cret"; // decomposed + let nfc_rule = "/s\u{00e9}cret/**"; // composed + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(work.join(nfd_dir)).unwrap(); + std::fs::write(work.join("public.txt"), b"public\n").unwrap(); + std::fs::write(work.join(nfd_dir).join("key.pem"), b"TOP SECRET\n").unwrap(); + let run = |args: &[&str], dir: &Path| { + assert!( + Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["config", "core.precomposeunicode", "false"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let secret_oid = oid(&format!("{nfd_dir}/key.pem")); + let public_oid = oid("public.txt"); + // Guard against a vacuous pass: the NFD-named blob must actually exist. + assert_eq!(secret_oid.len(), 40, "secret blob was not stored under the NFD path"); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let rules = [rule(nfc_rule, &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "NFC-authored deny rule must withhold the secret blob under the NFD-named directory" + ); + assert!( + !withheld.contains(&public_oid), + "public blob must NOT be withheld" + ); + } + // TAB/newline are legal filename bytes on unix but rejected by the Windows // filesystem, so building the fixture only makes sense (and only compiles the // OsStr handling) under cfg(unix), matching fails_closed_on_non_utf8_path. diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index afe6a7c..9410e13 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -5,6 +5,7 @@ //! `is_public` flag. It performs no I/O so it is exhaustively unit tested. use crate::db::VisibilityRule; +use unicode_normalization::UnicodeNormalization; #[derive(Debug, PartialEq, Eq)] pub enum Decision { @@ -12,6 +13,17 @@ pub enum Decision { Deny, } +/// NFC-normalize a glob prefix or path so the matcher compares canonically +/// equivalent strings byte-for-byte. Without this, a deny rule authored NFC +/// (`é` = U+00E9) byte-compares unequal to a path committed NFD (`e` + U+0301) +/// and the blob slips past the rule on the replication path (#101). NFC, not +/// NFKC: compatibility folding (ligatures, full-width forms) would merge paths +/// the filesystem treats as distinct and over-withhold. Both sides of every +/// comparison must pass through this, or the skew just moves. +fn nfc(s: &str) -> String { + s.nfc().collect() +} + /// True if `caller` is the repo owner (matches full did:key or its short form), /// mirroring the owner-match idiom in `api/protect.rs`. fn is_owner(owner_did: &str, caller: &str) -> bool { @@ -36,12 +48,17 @@ fn glob_matches(glob: &str, path: &str) -> bool { if prefix == "/" { return true; } + // Compare in NFC so an NFC rule matches a canonically-equivalent NFD path + // (and vice versa). Both sides normalized here — the single matcher seam. + let prefix = nfc(prefix); + let path = nfc(path); path == prefix || path.starts_with(&format!("{prefix}/")) } -/// Specificity = length of the match prefix; longer is more specific. +/// Specificity = length of the (normalized) match prefix; longer is more +/// specific. Normalized so ranking stays consistent with `glob_matches`. fn specificity(glob: &str) -> usize { - glob_prefix(glob).len() + nfc(glob_prefix(glob)).len() } /// Decide whether `caller` (None = anonymous) may read `path` in a repo. @@ -371,4 +388,38 @@ mod tests { true )); } + + // #101: a deny rule must withhold a path that denotes the same characters in + // a different Unicode normalization form. Without NFC normalization at the + // matcher seam, an NFC-authored rule byte-compares unequal to an NFD-stored + // path and the blob leaks on the replication path. + #[test] + fn matcher_withholds_across_nfc_nfd_normalization_skew() { + // "é" composed (NFC, U+00E9) in the rule; decomposed (NFD, e + U+0301) + // in the committed path. + let nfc_rule = "/s\u{00e9}cret/**"; + let nfd_path = "/se\u{0301}cret/key.pem"; + let rules = [rule(nfc_rule, VisibilityMode::B, &["did:key:z6MkFriend"])]; + assert_eq!( + visibility_check(&rules, true, OWNER, None, nfd_path), + Decision::Deny, + "NFC-authored deny rule must withhold the canonically-equivalent NFD path" + ); + + // Mirror: rule authored NFD, path committed NFC. + let nfd_rule = "/se\u{0301}cret/**"; + let nfc_path = "/s\u{00e9}cret/key.pem"; + let rules2 = [rule(nfd_rule, VisibilityMode::B, &["did:key:z6MkFriend"])]; + assert_eq!( + visibility_check(&rules2, true, OWNER, None, nfc_path), + Decision::Deny, + "NFD-authored deny rule must withhold the canonically-equivalent NFC path" + ); + + // A genuinely different path is still allowed (no over-withholding). + assert_eq!( + visibility_check(&rules, true, OWNER, None, "/public/x.txt"), + Decision::Allow + ); + } } From f99599cf81ad5d04a27a2e3f2c6c6101229166a0 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:12:22 -0500 Subject: [PATCH 2/9] fix(node): fail closed on the full-scan pin path to stop dangling-blob leak (#99) The withheld set is computed over reachable objects (rev-list --all + HEAD), but the full-scan pin fallback (cat-file --batch-all-objects) includes dangling objects the reachable walk never classified. Subtracting the reachable-only withheld set therefore could not catch a private blob orphaned by a force-push/amend, which pinned to IPFS/Pinata in cleartext. Invert the default for blobs on the full-scan path: a candidate replicates only if it is a non-blob (commits/trees are structural) or a reachable, visibility-allowed blob; anything else (a dangling blob, or a withheld one) is dropped. The delta path keeps the cheaper reachable-only subtraction since delta candidates are always reachable, so no extra walk on the hot path. Adds list_all_objects_with_type/all_blob_oids (typed enumeration), replicable_blob_set + replicable_objects_fail_closed, and threads a full_scan flag through resolve_candidates_for_push. Pin functions now take a pre-filtered object list. Regression tests: dangling blob excluded end-to-end, fail-closed unit semantics, and the all-blob universe includes dangling + excludes non-blobs. --- crates/gitlawb-node/src/api/repos.rs | 86 +++++++--- crates/gitlawb-node/src/git/push_delta.rs | 134 ++++++++++++--- .../gitlawb-node/src/git/visibility_pack.rs | 153 +++++++++++++++++- crates/gitlawb-node/src/ipfs_pin.rs | 14 +- crates/gitlawb-node/src/pinata.rs | 19 +-- 5 files changed, 343 insertions(+), 63 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 6afe68f..4686864 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -887,7 +887,13 @@ pub async fn git_receive_pack( // introduced); on any non-commit tip or git error it fails closed to a full // repo scan. Only needed when something will actually replicate. All // degraded paths log inside the helper rather than failing silently. - let candidates: Vec = if withheld.is_some() { + // Resolve the pin candidate set, then filter to what may actually + // replicate. Delta path: the reachable-only `withheld` set suffices (delta + // objects are reachable). Full-scan path: the candidate set can include + // dangling blobs the withheld set never classified, so fail closed — + // replicate a blob only if it is reachable AND visibility-allowed (#99). On + // any error in the fail-closed walks, pin nothing this push. + let object_list: Vec = if let Some(withheld_set) = withheld.clone() { let new_tips: Vec = ref_updates .iter() .map(|u| u.new_sha.clone()) @@ -898,16 +904,55 @@ pub async fn git_receive_pack( .map(|u| u.old_sha.clone()) .filter(|s| s != ZERO_SHA) .collect(); - crate::git::push_delta::resolve_candidates_for_push(disk_path.clone(), new_tips, old_tips) + let pin_set = crate::git::push_delta::resolve_candidates_for_push( + disk_path.clone(), + new_tips, + old_tips, + ) + .await; + if pin_set.full_scan { + let p = disk_path.clone(); + let rules_for_blobs = rules_opt.clone().unwrap_or_default(); + let owner = record.owner_did.clone(); + let is_pub = record.is_public; + let candidates = pin_set.candidates; + tokio::task::spawn_blocking(move || -> anyhow::Result> { + let allowed = crate::git::visibility_pack::replicable_blob_set( + &p, + &rules_for_blobs, + is_pub, + &owner, + )?; + let all_blobs = crate::git::push_delta::all_blob_oids(&p)?; + Ok(crate::git::visibility_pack::replicable_objects_fail_closed( + candidates, + &allowed, + &all_blobs, + )) + }) .await + .map_err(|e| { + tracing::warn!(err = %e, "fail-closed blob walk task panicked; pinning nothing this push") + }) + .ok() + .and_then(|r| { + r.map_err(|e| { + tracing::warn!(err = %e, "fail-closed blob walk failed; pinning nothing this push") + }) + .ok() + }) + .unwrap_or_default() + } else { + crate::git::visibility_pack::replicable_objects(pin_set.candidates, &withheld_set) + } } else { Vec::new() }; // Pin new git objects to the local IPFS node (no-op if ipfs_api is empty). // Skipped entirely when the public cannot read the repo (withheld == None). - if let Some(withheld_ipfs) = withheld.clone() { - let candidates_ipfs = candidates.clone(); + if withheld.is_some() { + let object_list_ipfs = object_list.clone(); let ipfs_api = state.config.ipfs_api.clone(); let repo_path_clone = disk_path.clone(); let db_clone = state.db.clone(); @@ -924,9 +969,8 @@ pub async fn git_receive_pack( let pinned = crate::ipfs_pin::pin_new_objects( &ipfs_api, &repo_path_clone, - candidates_ipfs, + object_list_ipfs, &db_clone, - &withheld_ipfs, ) .await; if !pinned.is_empty() { @@ -1030,23 +1074,21 @@ pub async fn git_receive_pack( let owner_did_for_arweave = record.owner_did.clone(); let self_public_url = state.config.public_url.clone(); let node_keypair = Arc::clone(&state.node_keypair); - let withheld_pinata = withheld; - let candidates_pinata = candidates; + let object_list_pinata = object_list; + let do_pinata_replication = withheld.is_some(); tokio::spawn(async move { - let pinned = match &withheld_pinata { - Some(withheld) => { - crate::pinata::pin_new_objects( - &http_client, - &pinata_upload_url, - &pinata_jwt, - &repo_path_clone, - candidates_pinata, - &db_clone, - withheld, - ) - .await - } - None => Vec::new(), + let pinned = if do_pinata_replication { + crate::pinata::pin_new_objects( + &http_client, + &pinata_upload_url, + &pinata_jwt, + &repo_path_clone, + object_list_pinata, + &db_clone, + ) + .await + } else { + Vec::new() }; if !pinned.is_empty() { diff --git a/crates/gitlawb-node/src/git/push_delta.rs b/crates/gitlawb-node/src/git/push_delta.rs index 21965db..7ab0081 100644 --- a/crates/gitlawb-node/src/git/push_delta.rs +++ b/crates/gitlawb-node/src/git/push_delta.rs @@ -10,16 +10,24 @@ //! ## Correctness framing (do not confuse with the #84 withholding direction) //! //! The pin enumeration is part of the *exposure* set, not the withheld filter. -//! Narrowing it per push only *shrinks* what is pinned, so it cannot create an -//! under-withholding leak — the withheld filter (`visibility_pack`) still -//! subtracts from whatever, smaller, set we feed it. The only risk here is -//! *under-pinning* (a durability gap), which the reconciliation sweep backstops. +//! On the *delta* path, narrowing per push only *shrinks* what is pinned, so it +//! cannot create an under-withholding leak — the reachable-only withheld filter +//! (`visibility_pack`) still subtracts from the smaller set we feed it. The only +//! risk there is *under-pinning* (a durability gap) the sweep backstops. +//! +//! The *full-scan* path is different: `list_all_objects` includes dangling +//! objects the reachable withheld set never classified, so subtracting that set +//! is not enough — a dangling private blob would slip through (#99). The caller +//! signals a full scan via [`PinCandidateSet::full_scan`] and must then apply +//! the fail-closed blob filter (`visibility_pack::replicable_objects_fail_closed`) +//! instead of the plain reachable-only subtraction. //! //! Because the pin candidate set needs only the OID *set* (never the per-path //! information the withheld classifier needs), `git rev-list --objects //! --no-object-names` is safe here. The "rev-list reports one path per object" //! trap from #84 applies only to `visibility_pack`'s per-path `ls-tree` walk. +use std::collections::HashSet; use std::path::{Path, PathBuf}; use std::process::Command; @@ -191,6 +199,61 @@ pub fn list_all_objects(repo_path: &Path) -> Result> { .collect()) } +/// Like [`list_all_objects`] but pairs each OID with its object type, via +/// `--batch-check='%(objectname) %(objecttype)'`. The pin path's fail-closed +/// filter needs to tell blobs (content, withholdable) from commits/trees +/// (structural, never withheld) without typing the candidate list itself. +pub fn list_all_objects_with_type(repo_path: &Path) -> Result> { + let output = Command::new("git") + .args([ + "cat-file", + "--batch-all-objects", + "--batch-check=%(objectname) %(objecttype)", + ]) + .current_dir(repo_path) + .output() + .context("failed to run git cat-file")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + bail!("git cat-file failed: {stderr}"); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + Ok(stdout + .lines() + .filter_map(|l| { + let mut it = l.split_whitespace(); + let oid = it.next()?; + let ty = it.next()?; + Some((oid.to_string(), ty.to_string())) + }) + .collect()) +} + +/// Every blob OID in the repo, including unreachable/dangling ones. The +/// fail-closed pin filter drops any candidate blob absent from the reachable, +/// visibility-allowed set; a dangling private blob is in this set but not the +/// allowed set, so it never replicates (#99). +pub fn all_blob_oids(repo_path: &Path) -> Result> { + Ok(list_all_objects_with_type(repo_path)? + .into_iter() + .filter(|(_, ty)| ty == "blob") + .map(|(oid, _)| oid) + .collect()) +} + +/// The pin candidate OIDs for a push plus whether they came from a full-repo +/// scan. `full_scan` is true when the delta could not be used and the whole +/// object DB (including dangling objects) was enumerated — the caller must then +/// apply the fail-closed blob filter, because the reachable-only withheld set +/// cannot classify dangling blobs (#99). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PinCandidateSet { + pub candidates: Vec, + pub full_scan: bool, +} + /// Resolve the pin candidate OID set for a push, off the async worker. /// /// Runs [`resolve_push_delta`] in `spawn_blocking` (git subprocess) and applies @@ -202,28 +265,29 @@ pub fn list_all_objects(repo_path: &Path) -> Result> { /// failed full scan, and a panicked blocking task each emit a warning. On a /// failed full scan or a task panic the candidate set is empty (pin nothing /// this push); that is a durability gap the reconciliation sweep backstops, and -/// it can never leak because the withheld filter still runs on whatever set is -/// returned. +/// it can never leak because the withheld/fail-closed filter still runs on +/// whatever set is returned. `full_scan` rides on the returned set so the caller +/// knows when the dangling-inclusive filter is required. pub async fn resolve_candidates_for_push( repo_path: PathBuf, new_tips: Vec, old_tips: Vec, -) -> Vec { +) -> PinCandidateSet { tokio::task::spawn_blocking(move || { let new_refs: Vec<&str> = new_tips.iter().map(String::as_str).collect(); let old_refs: Vec<&str> = old_tips.iter().map(String::as_str).collect(); match resolve_push_delta(&repo_path, &new_refs, &old_refs) { PinCandidates::Delta(objs) => { tracing::info!(delta = objs.len(), repo = %repo_path.display(), "pin candidate set from push delta"); - objs + PinCandidateSet { candidates: objs, full_scan: false } } PinCandidates::FullScanRequired => { tracing::warn!(repo = %repo_path.display(), "pin delta unavailable (non-commit tip, git error, or force-full-scan) — full-scan fallback"); match list_all_objects(&repo_path) { - Ok(objs) => objs, + Ok(objs) => PinCandidateSet { candidates: objs, full_scan: true }, Err(e) => { tracing::warn!(repo = %repo_path.display(), err = %e, "full-scan fallback failed; pinning nothing this push (reconciliation sweep backstops)"); - Vec::new() + PinCandidateSet { candidates: Vec::new(), full_scan: false } } } } @@ -232,7 +296,7 @@ pub async fn resolve_candidates_for_push( .await .unwrap_or_else(|e| { tracing::warn!(err = %e, "pin candidate computation task panicked; pinning nothing this push (reconciliation sweep backstops)"); - Vec::new() + PinCandidateSet { candidates: Vec::new(), full_scan: false } }) } @@ -452,11 +516,11 @@ mod tests { let repo = Repo::new(); let c1 = repo.commit_file("a.txt", "one\n"); let c2 = repo.commit_file("b.txt", "two\n"); - let got: HashSet = + let set = resolve_candidates_for_push(repo.path.clone(), vec![c2.clone()], vec![c1.clone()]) - .await - .into_iter() - .collect(); + .await; + assert!(!set.full_scan, "happy-path delta is not a full scan"); + let got: HashSet = set.candidates.into_iter().collect(); let new_blob = repo.rev("HEAD:b.txt"); assert!( got.contains(&c2) && got.contains(&new_blob), @@ -473,11 +537,9 @@ mod tests { repo.commit_file("a.txt", "one\n"); let blob = repo.rev("HEAD:a.txt"); let all: HashSet = list_all_objects(&repo.path).unwrap().into_iter().collect(); - let got: HashSet = - resolve_candidates_for_push(repo.path.clone(), vec![blob], vec![]) - .await - .into_iter() - .collect(); + let set = resolve_candidates_for_push(repo.path.clone(), vec![blob], vec![]).await; + assert!(set.full_scan, "non-commit tip is signalled as a full scan"); + let got: HashSet = set.candidates.into_iter().collect(); assert_eq!(got, all, "non-commit tip falls back to full repo scan"); } @@ -566,4 +628,36 @@ mod tests { // 2 commits + 2 trees + 2 blobs = 6 objects. assert_eq!(all.len(), 6, "got: {all:?}"); } + + #[test] + fn all_blob_oids_includes_dangling_and_excludes_nonblobs() { + // The all-blob universe must include a dangling blob (the #99 hazard) and + // exclude commits/trees, so the fail-closed filter can tell a blob from a + // structural object without typing the candidate list. + let repo = Repo::new(); + let c1 = repo.commit_file("a.txt", "one\n"); + let reachable_blob = repo.rev("HEAD:a.txt"); + let tree = repo.rev("HEAD^{tree}"); + // Write a blob referenced by no tree/commit — dangling. + std::fs::write(repo.path.join("orphan.bin"), b"dangling\n").unwrap(); + let dangling = repo.git(&["hash-object", "-w", "orphan.bin"]); + + let blobs = all_blob_oids(&repo.path).unwrap(); + assert!(blobs.contains(&reachable_blob), "reachable blob present"); + assert!( + blobs.contains(&dangling), + "dangling blob present in the all-blob universe" + ); + assert!(!blobs.contains(&c1), "commit is not a blob"); + assert!(!blobs.contains(&tree), "tree is not a blob"); + + // The typed lister tags each object; spot-check the dangling blob's type. + let typed = list_all_objects_with_type(&repo.path).unwrap(); + assert!( + typed + .iter() + .any(|(oid, ty)| oid == &dangling && ty == "blob"), + "dangling object is typed as a blob" + ); + } } diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index 7428b87..a3e0492 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -298,6 +298,47 @@ pub fn replicable_objects(all: Vec, withheld: &HashSet) -> Vec Result> { + let pairs = blob_paths(repo_path)?; + let mut allowed = HashSet::new(); + for (oid, path) in &pairs { + if visibility_check(rules, is_public, owner_did, None, path) == Decision::Allow { + allowed.insert(oid.clone()); + } + } + Ok(allowed) +} + +/// Objects safe to replicate, failing closed on blobs (#99). A candidate +/// replicates iff it is NOT a blob (`all_blob_oids` — commits and trees are +/// structural, never content-withheld) OR it is in `allowed_blobs` (reachable +/// and visibility-allowed). This drops both withheld reachable blobs and +/// dangling/unreachable blobs the reachable walk never classified, without +/// tagging the candidate list with per-object types. Used on the full-scan pin +/// path, where the candidate set can contain dangling objects the reachable-only +/// withheld set cannot cover; the delta path keeps `replicable_objects`. +pub fn replicable_objects_fail_closed( + candidates: Vec, + allowed_blobs: &HashSet, + all_blob_oids: &HashSet, +) -> Vec { + candidates + .into_iter() + .filter(|oid| !all_blob_oids.contains(oid) || allowed_blobs.contains(oid)) + .collect() +} + /// For every blob withheld from anonymous, the DIDs allowed to read it: the /// owner plus any reader DID that `visibility_check` Allows at some path the /// blob appears at. Least-privilege: a reader of one private subtree is not a @@ -596,6 +637,112 @@ mod tests { assert_eq!(got, all); } + #[test] + fn fail_closed_keeps_nonblobs_and_allowed_blobs_only() { + // Non-blob objects (commit/tree) always pass; a blob passes only if it + // is in the allowed set. A withheld blob and a dangling blob (both in + // all_blob_oids, neither in allowed) are dropped. + let allowed: HashSet = ["b_pub".to_string()].into_iter().collect(); + let all_blobs: HashSet = ["b_pub", "b_secret", "b_dangling"] + .into_iter() + .map(String::from) + .collect(); + let candidates = vec![ + "commit1".to_string(), + "tree1".to_string(), + "b_pub".to_string(), + "b_secret".to_string(), + "b_dangling".to_string(), + ]; + let got = replicable_objects_fail_closed(candidates, &allowed, &all_blobs); + assert_eq!( + got, + vec![ + "commit1".to_string(), + "tree1".to_string(), + "b_pub".to_string() + ] + ); + } + + #[test] + fn fail_closed_drops_dangling_private_blob() { + // #99: a private blob orphaned by a force-push/amend is unreachable but + // still present in the object DB. The full-scan candidate set includes + // it; the reachable-only allowed walk never classifies it. The + // fail-closed filter must drop it — it is a blob not in the allowed set. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"public bytes\n").unwrap(); + let run = |args: &[&str]| { + assert!( + Command::new("git") + .args(args) + .current_dir(&work) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"]); + run(&["config", "user.email", "t@t"]); + run(&["config", "user.name", "t"]); + run(&["add", "."]); + run(&["commit", "-qm", "init"]); + let oid_of = |rev: &str| { + let out = Command::new("git") + .args(["rev-parse", rev]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let public_oid = oid_of("HEAD:public/a.txt"); + + // Write a blob straight into the object DB, referenced by no tree or + // commit — exactly the dangling state #99 is about. + std::fs::write(work.join("orphan.bin"), b"DANGLING SECRET\n").unwrap(); + let dangling_oid = { + let out = Command::new("git") + .args(["hash-object", "-w", "orphan.bin"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + + let all_blobs = crate::git::push_delta::all_blob_oids(&work).unwrap(); + assert!( + all_blobs.contains(&dangling_oid), + "precondition: the dangling blob is in the all-objects universe" + ); + + let rules: Vec = vec![]; + let allowed = replicable_blob_set(&work, &rules, true, OWNER).unwrap(); + assert!( + !allowed.contains(&dangling_oid), + "dangling blob is unreachable, so never in the allowed set" + ); + assert!( + allowed.contains(&public_oid), + "reachable public blob is in the allowed set" + ); + + // Full-scan candidate set includes the dangling blob; fail-closed drops it. + let candidates = vec![dangling_oid.clone(), public_oid.clone()]; + let replicable = replicable_objects_fail_closed(candidates, &allowed, &all_blobs); + assert!( + !replicable.contains(&dangling_oid), + "#99: a dangling private blob must not replicate" + ); + assert!( + replicable.contains(&public_oid), + "the public blob still replicates" + ); + } + #[test] fn recipients_are_owner_plus_allowed_readers_only() { let (_td, repo, secret_oid, public_oid) = fixture(); @@ -882,7 +1029,11 @@ mod tests { let secret_oid = oid(&format!("{nfd_dir}/key.pem")); let public_oid = oid("public.txt"); // Guard against a vacuous pass: the NFD-named blob must actually exist. - assert_eq!(secret_oid.len(), 40, "secret blob was not stored under the NFD path"); + assert_eq!( + secret_oid.len(), + 40, + "secret blob was not stored under the NFD path" + ); run( &[ "clone", diff --git a/crates/gitlawb-node/src/ipfs_pin.rs b/crates/gitlawb-node/src/ipfs_pin.rs index ac1fde0..3b34619 100644 --- a/crates/gitlawb-node/src/ipfs_pin.rs +++ b/crates/gitlawb-node/src/ipfs_pin.rs @@ -7,8 +7,6 @@ //! If `ipfs_api` is empty the functions are no-ops, so the node works fine //! without a local IPFS daemon. -use std::collections::HashSet; - use anyhow::Result; use gitlawb_core::cid::Cid; @@ -88,9 +86,10 @@ pub async fn cat(ipfs_api: &str, cid: &str) -> Result> { /// Pin any of the given candidate git objects that are not yet recorded in /// `pinned_cids`. /// -/// `candidates` is the OID set to consider — the per-push delta on the push -/// path, or the whole-repo list on the reconciliation sweep / full-scan -/// fallback (see `git::push_delta`). `repo_path` is still needed to read each +/// `object_list` is the already-withheld-filtered OID set to pin: the caller +/// applies `visibility_pack::replicable_objects` on the delta path or the +/// `..._fail_closed` filter on the full-scan path before calling, so this +/// function never sees a withheld blob. `repo_path` is still needed to read each /// object's bytes. The twin in `pinata.rs` mirrors this shape — change both in /// lockstep. /// @@ -98,16 +97,13 @@ pub async fn cat(ipfs_api: &str, cid: &str) -> Result> { pub async fn pin_new_objects( ipfs_api: &str, repo_path: &std::path::Path, - candidates: Vec, + object_list: Vec, db: &crate::db::Db, - withheld: &HashSet, ) -> Vec<(String, String)> { if ipfs_api.is_empty() { return vec![]; } - let object_list = crate::git::visibility_pack::replicable_objects(candidates, withheld); - let mut pinned = Vec::new(); for sha in object_list { diff --git a/crates/gitlawb-node/src/pinata.rs b/crates/gitlawb-node/src/pinata.rs index cbd8885..6c9c0bf 100644 --- a/crates/gitlawb-node/src/pinata.rs +++ b/crates/gitlawb-node/src/pinata.rs @@ -7,7 +7,6 @@ //! no-op, so nodes without Pinata backing work fine. use anyhow::Result; -use std::collections::HashSet; /// Pin a single git object's raw bytes on Pinata (v3 API). /// @@ -70,27 +69,25 @@ pub async fn pin_object( /// Pin any of the given candidate git objects that haven't yet been sent to /// Pinata. /// -/// `candidates` is the OID set to consider — the per-push delta on the push -/// path, or the whole-repo list on the reconciliation sweep / full-scan -/// fallback (see `git::push_delta`). `repo_path` is still needed to read each -/// object's bytes. The twin in `ipfs_pin.rs` mirrors this shape — change both -/// in lockstep. Objects already recorded with a `pinata_cid` are skipped. -/// Returns `(sha_hex, cid)` pairs for each newly pinned object. +/// `object_list` is the already-withheld-filtered OID set to pin: the caller +/// applies `visibility_pack::replicable_objects` on the delta path or the +/// `..._fail_closed` filter on the full-scan path before calling. `repo_path` is +/// still needed to read each object's bytes. The twin in `ipfs_pin.rs` mirrors +/// this shape — change both in lockstep. Objects already recorded with a +/// `pinata_cid` are skipped. Returns `(sha_hex, cid)` pairs for each newly +/// pinned object. pub async fn pin_new_objects( client: &reqwest::Client, upload_url: &str, jwt: &str, repo_path: &std::path::Path, - candidates: Vec, + object_list: Vec, db: &crate::db::Db, - withheld: &HashSet, ) -> Vec<(String, String)> { if jwt.is_empty() { return vec![]; } - let object_list = crate::git::visibility_pack::replicable_objects(candidates, withheld); - let mut pinned = Vec::new(); for sha in object_list { From e12a030f91b7a0c78a91c6d9a8186bb35d5e716e Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:22:24 -0500 Subject: [PATCH 3/9] fix(node): gate repo-listing surfaces on visibility to stop private-repo enumeration (#97) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit list_repos, list_federated_repos, and the GraphQL repos query returned every repo row with no visibility check, so an unauthenticated caller could enumerate private repos' name, owner, description, and clone URL — and X-Total-Count leaked the private-repo count even when rows were filtered. Each surface now keeps a row only when visibility_check at "/" allows the caller, the same decision the per-repo content endpoints make — not a bare is_public filter, which would diverge for is_public=false-with-root-reader and is_public=true-with-root-deny. The "/" decision depends on owner short/full-DID matching and JSON reader-DID membership, so it is applied in Rust over an over-fetched set rather than pushed into SQL; the paged path derives X-Total-Count from the filtered set (dropping the SQL window count) and paginates after filtering, so neither the page nor the count leaks. Adds a batch root-rule query (list_visibility_rules_for_repos) to avoid N+1, and converts list_all_repos_paged into list_all_repos_deduped (mirror dedup preserved, pagination moved to Rust). Tests: anonymous hides private repo + count, owner sees own private repo, authorized root-reader sees an is_public=false repo (proving the visibility_check path, not is_public), and federated listing hides private from anonymous. --- crates/gitlawb-node/src/api/repos.rs | 85 ++++++---- crates/gitlawb-node/src/db/mod.rs | 84 ++++++---- crates/gitlawb-node/src/graphql/query.rs | 18 ++ crates/gitlawb-node/src/test_support.rs | 200 +++++++++++++++++++++++ crates/gitlawb-node/src/visibility.rs | 14 ++ 5 files changed, 339 insertions(+), 62 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 4686864..7199c4f 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -207,50 +207,61 @@ pub struct ListReposQuery { /// present, returns one page and the `X-Total-Count` response header carries the /// total matching row count. Without `limit`, falls back to returning every row /// (kept for backwards compat with peer sync and existing CLI tooling). +/// +/// Every returned row passes the per-caller `"/"` visibility gate +/// (`crate::visibility::listable_at_root`), the same decision the per-repo +/// content endpoints make, so neither the page nor `X-Total-Count` leaks a repo +/// (or its mere count) the caller may not read (#97). pub async fn list_repos( State(state): State, Query(query): Query, + auth: Option>, ) -> Result { use axum::http::HeaderValue; use axum::response::IntoResponse; - if let Some(raw_limit) = query.limit { - let limit = raw_limit.clamp(1, 200); - let offset = query.offset.unwrap_or(0).max(0); - let (rows, total) = state - .db - .list_all_repos_paged(query.owner.as_deref(), limit, offset) - .await?; - let body: Vec = rows - .into_iter() - .map(|(r, stars)| to_response(&r, &state, stars)) - .collect(); - let mut response = Json(body).into_response(); - response.headers_mut().insert( - "X-Total-Count", - HeaderValue::from_str(&total.to_string()).unwrap_or(HeaderValue::from_static("0")), - ); - return Ok(response); - } + let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + + // Over-fetch the deduped set (did:key-aware DEDUP_CTE collapses mirror rows), + // then apply the per-repo "/" visibility gate in Rust BEFORE pagination so + // neither the page nor X-Total-Count leaks a repo the caller may not read — + // including its mere count. The "/" decision depends on owner short/full-DID + // matching and JSON reader-DID membership, so it cannot be a clean SQL + // predicate without drifting from visibility_check; the count is derived from + // the visible set (#97). + let owner_filtered = state + .db + .list_all_repos_deduped_with_stars(query.owner.as_deref()) + .await?; - let repos = state.db.list_all_repos_with_stars().await?; - let filtered: Vec<(RepoRecord, i64)> = repos + let ids: Vec = owner_filtered.iter().map(|(r, _)| r.id.clone()).collect(); + let rules_by_repo = state.db.list_visibility_rules_for_repos(&ids).await?; + let visible: Vec<(crate::db::RepoRecord, i64)> = owner_filtered .into_iter() .filter(|(r, _)| { - if let Some(owner) = &query.owner { - crate::api::did_matches(owner.as_str(), &r.owner_did) - } else { - true - } + let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); + crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, caller) }) .collect(); - let deduped = dedupe_canonical_repos(filtered); - let total = deduped.len() as i64; - let resp: Vec<_> = deduped + + let total = visible.len() as i64; + + // Paginate in Rust when a limit is set: SQL LIMIT/OFFSET cannot run before + // the visibility filter without returning short pages and a leaked count. + let page: Vec<(crate::db::RepoRecord, i64)> = match query.limit { + Some(raw_limit) => { + let limit = raw_limit.clamp(1, 200) as usize; + let offset = query.offset.unwrap_or(0).max(0) as usize; + visible.into_iter().skip(offset).take(limit).collect() + } + None => visible, + }; + + let body: Vec = page .into_iter() .map(|(r, stars)| to_response(&r, &state, stars)) .collect(); - let mut response = Json(resp).into_response(); + let mut response = Json(body).into_response(); response.headers_mut().insert( "X-Total-Count", HeaderValue::from_str(&total.to_string()).unwrap_or(HeaderValue::from_static("0")), @@ -1245,8 +1256,24 @@ pub async fn list_refs( /// which node hosts it. Results from unreachable peers are silently omitted. pub async fn list_federated_repos( State(state): State, + auth: Option>, ) -> Result> { + let caller = auth.as_ref().map(|e| e.0 .0.as_str()); let local_repos = dedupe_canonical_repos(state.db.list_all_repos_with_stars().await?); + + // Hide local repos the caller may not read at "/" before federating them, so + // the federated surface does not enumerate private repos (#97). Peer repos + // arrive already filtered by each peer's own /api/v1/repos (anonymous view). + let ids: Vec = local_repos.iter().map(|(r, _)| r.id.clone()).collect(); + let rules_by_repo = state.db.list_visibility_rules_for_repos(&ids).await?; + let local_repos: Vec<(crate::db::RepoRecord, i64)> = local_repos + .into_iter() + .filter(|(r, _)| { + let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); + crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, caller) + }) + .collect(); + let local_node_url = state .config .public_url diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 36e44ff..2e9c91f 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -919,7 +919,7 @@ impl Db { /// stay byte-identical or Postgres stops using the index. /// The canonical row wins (mirror rows carry a slash-form `id` written only by /// `upsert_mirror_repo`), ties broken by earliest `created_at` then `id` so a - /// full tie still picks a deterministic survivor. `list_all_repos_paged`, + /// full tie still picks a deterministic survivor. `list_all_repos_deduped_with_stars`, /// `list_all_repos_deduped`, and the marker logic in /// `crate::api::repos::dedupe_canonical_repos` must stay in sync. const DEDUP_CTE: &'static str = "WITH deduped AS ( @@ -944,62 +944,41 @@ impl Db { created_at ASC, id ASC )"; - pub async fn list_all_repos_paged( + /// All repos with star counts, mirror-deduplicated via `DEDUP_CTE` and + /// ordered newest-first, optionally filtered to one owner. Returns the full + /// set (no SQL pagination): the listing surface filters by per-caller `"/"` + /// visibility in Rust and paginates after, so neither the page nor the count + /// leaks a repo the caller may not read (#97). + pub async fn list_all_repos_deduped_with_stars( &self, owner_filter: Option<&str>, - limit: i64, - offset: i64, - ) -> Result<(Vec<(RepoRecord, i64)>, i64)> { + ) -> Result> { let sql = format!( "{} SELECT d.id, d.name, d.owner_did, d.description, d.is_public, d.default_branch, d.created_at, d.updated_at, d.disk_path, d.forked_from, d.machine_id, - COALESCE(s.cnt, 0) AS star_count, - COUNT(*) OVER () AS total_count + COALESCE(s.cnt, 0) AS star_count FROM deduped d LEFT JOIN ( SELECT repo_id, COUNT(*) AS cnt FROM repo_stars GROUP BY repo_id ) s ON s.repo_id = d.id - ORDER BY d.updated_at DESC - LIMIT $2 OFFSET $3", + ORDER BY d.updated_at DESC", Self::DEDUP_CTE ); let rows = sqlx::query(&sql) .bind(owner_filter) - .bind(limit) - .bind(offset) .fetch_all(&self.pool) .await?; - let total = rows - .first() - .map(|r| r.get::("total_count")) - .unwrap_or(0); - let out: Vec<(RepoRecord, i64)> = rows + Ok(rows .into_iter() .map(|r| { let stars: i64 = r.get("star_count"); (row_to_repo(r), stars) }) - .collect(); - - let total = if out.is_empty() { - let row = sqlx::query( - "SELECT COUNT(DISTINCT (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name)) AS cnt - FROM repos - WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1)", - ) - .bind(owner_filter) - .fetch_one(&self.pool) - .await?; - row.get::("cnt") - } else { - total - }; - - Ok((out, total)) + .collect()) } /// Deduped repo list (no stars, no paging) for the unfiltered read surfaces @@ -2468,6 +2447,45 @@ impl Db { }) .collect()) } + + /// All visibility rules for a set of repos, grouped by `repo_id`, in one + /// query. The listing surfaces use this to apply the same `"/"` visibility + /// decision the per-repo endpoints make without an N+1 per-repo rule fetch + /// (#97). Repos with no rules are simply absent from the map. + pub async fn list_visibility_rules_for_repos( + &self, + repo_ids: &[String], + ) -> Result>> { + use std::collections::HashMap; + if repo_ids.is_empty() { + return Ok(HashMap::new()); + } + let rows = sqlx::query( + "SELECT id, repo_id, path_glob, mode, reader_dids, created_by, created_at + FROM visibility_rules WHERE repo_id = ANY($1) ORDER BY path_glob", + ) + .bind(repo_ids) + .fetch_all(&self.pool) + .await?; + let mut out: HashMap> = HashMap::new(); + for r in rows { + let readers: String = r.get("reader_dids"); + let created_at: String = r.get("created_at"); + let rule = VisibilityRule { + id: r.get("id"), + repo_id: r.get("repo_id"), + path_glob: r.get("path_glob"), + mode: VisibilityMode::from_db(&r.get::("mode")), + reader_dids: serde_json::from_str(&readers).unwrap_or_default(), + created_by: r.get("created_by"), + created_at: created_at + .parse::>() + .unwrap_or_else(|_| Utc::now()), + }; + out.entry(rule.repo_id.clone()).or_default().push(rule); + } + Ok(out) + } } // ── Repo Stars ──────────────────────────────────────────────────────────────── diff --git a/crates/gitlawb-node/src/graphql/query.rs b/crates/gitlawb-node/src/graphql/query.rs index c7673f6..2caeefb 100644 --- a/crates/gitlawb-node/src/graphql/query.rs +++ b/crates/gitlawb-node/src/graphql/query.rs @@ -15,8 +15,26 @@ impl QueryRoot { .list_all_repos_deduped() .await .map_err(|e| async_graphql::Error::new(e.to_string()))?; + + // Apply the same "/" visibility gate the REST/per-repo endpoints use so + // this surface does not enumerate private repos (#97). The caller DID is + // threaded onto the context by optional_signature; absent = anonymous. + let caller = ctx + .data::() + .ok() + .map(|d| d.0.as_str()); + let ids: Vec = repos.iter().map(|r| r.id.clone()).collect(); + let rules_by_repo = db + .list_visibility_rules_for_repos(&ids) + .await + .map_err(|e| async_graphql::Error::new(e.to_string()))?; + Ok(repos .into_iter() + .filter(|r| { + let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); + crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, caller) + }) .map(|r| RepoType { name: r.name, owner_did: r.owner_did, diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index 154db14..874f819 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -607,4 +607,204 @@ mod tests { "stats must count logical repos (mirror+canonical collapsed)" ); } + + // ── #97: repo-listing surfaces are visibility-gated ────────────────────── + + fn seed_private_repo(owner_did: &str, name: &str) -> RepoRecord { + RepoRecord { + is_public: false, + ..seed_repo(owner_did, name) + } + } + + fn anon_get(uri: &str) -> Request { + Request::builder() + .method(Method::GET) + .uri(uri) + .body(Body::empty()) + .expect("request builder") + } + + async fn json_body(resp: axum::response::Response) -> serde_json::Value { + let bytes = axum::body::to_bytes(resp.into_body(), usize::MAX) + .await + .expect("body bytes"); + serde_json::from_slice(&bytes).expect("json body") + } + + fn names_in(v: &serde_json::Value) -> Vec { + v.as_array() + .expect("array body") + .iter() + .filter_map(|r| r["name"].as_str().map(str::to_string)) + .collect() + } + + fn list_repos_router(state: AppState) -> Router { + Router::new() + .route( + "/api/v1/repos", + axum::routing::get(crate::api::repos::list_repos), + ) + .with_state(state) + } + + #[sqlx::test] + async fn list_repos_hides_private_repo_and_count_from_anonymous(pool: PgPool) { + let owner = "did:key:zLISTOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"pub-repo".to_string()), + "public repo listed" + ); + assert!( + !names.contains(&"priv-repo".to_string()), + "private repo must not be enumerable anonymously (#97)" + ); + assert_eq!( + total.as_deref(), + Some("1"), + "X-Total-Count must not leak the private repo's existence" + ); + } + + #[sqlx::test] + async fn list_repos_shows_owner_their_private_repo(pool: PgPool) { + let owner = "did:key:zLISTOWNER2BBBBBBBBBBBBBBBBBBBBBBBBBBBBB"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(signed_request_as( + owner, + Method::GET, + "/api/v1/repos", + Body::empty(), + )) + .await + .unwrap(); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"priv-repo".to_string()) && names.contains(&"pub-repo".to_string()), + "owner sees their own private repo, got {names:?}" + ); + } + + #[sqlx::test] + async fn list_repos_shows_private_repo_to_authorized_root_reader(pool: PgPool) { + // Proves the gate is visibility_check, not a bare is_public filter: an + // is_public=false repo with a root rule granting a reader DID is listable + // to that reader (and not to a stranger). + let owner = "did:key:zLISTOWNER3CCCCCCCCCCCCCCCCCCCCCCCCCCCCC"; + let reader = "did:key:zLISTREADERDDDDDDDDDDDDDDDDDDDDDDDDDDDDD"; + let stranger = "did:key:zLISTSTRANGEREEEEEEEEEEEEEEEEEEEEEEEEEE"; + let state = test_state(pool).await; + let rec = seed_private_repo(owner, "priv-repo"); + state.db.create_repo(&rec).await.expect("seed private"); + state + .db + .set_visibility_rule( + &rec.id, + "/", + crate::db::VisibilityMode::A, + &[reader.to_string()], + owner, + ) + .await + .expect("grant root reader"); + + let names_for = |did: &'static str, st: AppState| async move { + let resp = list_repos_router(st) + .oneshot(signed_request_as( + did, + Method::GET, + "/api/v1/repos", + Body::empty(), + )) + .await + .unwrap(); + names_in(&json_body(resp).await) + }; + + assert!( + names_for(reader, state.clone()) + .await + .contains(&"priv-repo".to_string()), + "authorized root reader must see the private repo" + ); + assert!( + !names_for(stranger, state) + .await + .contains(&"priv-repo".to_string()), + "an unlisted stranger must not see it" + ); + } + + #[sqlx::test] + async fn list_federated_repos_hides_private_from_anonymous(pool: PgPool) { + let owner = "did:key:zFEDOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let router = Router::new() + .route( + "/api/v1/repos/federated", + axum::routing::get(crate::api::repos::list_federated_repos), + ) + .with_state(state); + let resp = router + .oneshot(anon_get("/api/v1/repos/federated")) + .await + .unwrap(); + let body = json_body(resp).await; + let names = names_in(&body["repos"]); + assert!( + names.contains(&"pub-repo".to_string()), + "public repo federated" + ); + assert!( + !names.contains(&"priv-repo".to_string()), + "private repo must not be federated to anonymous callers (#97)" + ); + } } diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index 9410e13..335d261 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -113,6 +113,20 @@ pub fn visibility_check( } } +/// Whether `caller` (None = anonymous) may see a repo in a listing — the `"/"` +/// visibility decision, shared by every repo-listing surface (REST list, +/// federated list, GraphQL `repos`) so they enforce one rule, not three drifting +/// copies. Not a bare `is_public` test: a repo can be `is_public=false` with a +/// root rule granting readers, or `is_public=true` with a root deny (#97). +pub fn listable_at_root( + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> bool { + visibility_check(rules, is_public, owner_did, caller, "/") == Decision::Allow +} + /// The subtree path globs that `caller` (None = anonymous) may NOT read, given /// the repo's rules. Whole-repo ("/") rules are excluded: a denied whole-repo /// read is handled by the 404 gate before a clone ever starts. Each remaining From ba15ba49c9c3a515a9cb1b0c863e59a5849927ab Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:55:25 -0500 Subject: [PATCH 4/9] fix(review): share the listing visibility seam, extract fail-closed pin helper, add listing tests Code-review follow-up on the VIS-subtree fixes (no behavior change to the three shipped fixes; all confirmed sound by review): - DRY the '/' listing gate into one shared visibility::listable_at_root used by list_repos, list_federated_repos, and the GraphQL repos query, so the three surfaces enforce one rule instead of three drifting copies (enforce-shared-rule-on-every-reader-path). - Extract the full-scan fail-closed pin block into fail_closed_full_scan_objects, mirroring replication_withheld_set's degraded-path shape; drop the duplicated comment and the per-site no_rules sentinel (use &[]). - Add the three listing tests the review flagged as missing: anonymous GraphQL repos hides a private repo, the paged X-Total-Count path excludes the private count (the KTD2 exploit shape), and an is_public=true repo under a root deny is not listed (proves the gate is visibility_check, not a bare is_public filter). --- crates/gitlawb-node/src/api/repos.rs | 92 ++++++++++--------- crates/gitlawb-node/src/test_support.rs | 115 ++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 41 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 7199c4f..d8a09a8 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -93,6 +93,43 @@ async fn replication_withheld_set( } } +/// The replicable object set for a full-scan pin fallback, failing closed (#99). +/// The full-scan candidate set includes dangling objects the reachable-only +/// withheld set never classified, so compute the reachable visibility-allowed +/// blob set and the all-blob universe off the async worker and keep only +/// non-blobs plus allowed blobs. Any error in either walk — or a task panic — +/// pins nothing this push, mirroring the degraded-path shape of +/// `replication_withheld_set`. +async fn fail_closed_full_scan_objects( + disk_path: std::path::PathBuf, + rules: Vec, + is_public: bool, + owner_did: String, + candidates: Vec, +) -> Vec { + tokio::task::spawn_blocking(move || -> anyhow::Result> { + let allowed = crate::git::visibility_pack::replicable_blob_set( + &disk_path, &rules, is_public, &owner_did, + )?; + let all_blobs = crate::git::push_delta::all_blob_oids(&disk_path)?; + Ok(crate::git::visibility_pack::replicable_objects_fail_closed( + candidates, &allowed, &all_blobs, + )) + }) + .await + .map_err(|e| { + tracing::warn!(err = %e, "fail-closed blob walk task panicked; pinning nothing this push") + }) + .ok() + .and_then(|r| { + r.map_err(|e| { + tracing::warn!(err = %e, "fail-closed blob walk failed; pinning nothing this push") + }) + .ok() + }) + .unwrap_or_default() +} + // ── Request / Response types ─────────────────────────────────────────────── #[derive(Debug, Deserialize)] @@ -893,17 +930,13 @@ pub async fn git_receive_pack( ) .await; - // Compute the per-push pin candidate set once, off the async worker (git - // subprocess). On the happy path this is the push delta (objects this push - // introduced); on any non-commit tip or git error it fails closed to a full - // repo scan. Only needed when something will actually replicate. All - // degraded paths log inside the helper rather than failing silently. - // Resolve the pin candidate set, then filter to what may actually - // replicate. Delta path: the reachable-only `withheld` set suffices (delta - // objects are reachable). Full-scan path: the candidate set can include - // dangling blobs the withheld set never classified, so fail closed — - // replicate a blob only if it is reachable AND visibility-allowed (#99). On - // any error in the fail-closed walks, pin nothing this push. + // Resolve the per-push pin candidate set once, off the async worker, then + // filter to what may actually replicate. Delta path: the reachable-only + // `withheld` set suffices (delta objects are reachable). Full-scan path: the + // candidate set can include dangling blobs the withheld set never classified, + // so fail closed — replicate a blob only if it is reachable AND + // visibility-allowed (#99). Only computed when something will actually + // replicate; every degraded path logs rather than failing silently. let object_list: Vec = if let Some(withheld_set) = withheld.clone() { let new_tips: Vec = ref_updates .iter() @@ -922,37 +955,14 @@ pub async fn git_receive_pack( ) .await; if pin_set.full_scan { - let p = disk_path.clone(); - let rules_for_blobs = rules_opt.clone().unwrap_or_default(); - let owner = record.owner_did.clone(); - let is_pub = record.is_public; - let candidates = pin_set.candidates; - tokio::task::spawn_blocking(move || -> anyhow::Result> { - let allowed = crate::git::visibility_pack::replicable_blob_set( - &p, - &rules_for_blobs, - is_pub, - &owner, - )?; - let all_blobs = crate::git::push_delta::all_blob_oids(&p)?; - Ok(crate::git::visibility_pack::replicable_objects_fail_closed( - candidates, - &allowed, - &all_blobs, - )) - }) + fail_closed_full_scan_objects( + disk_path.clone(), + rules_opt.clone().unwrap_or_default(), + record.is_public, + record.owner_did.clone(), + pin_set.candidates, + ) .await - .map_err(|e| { - tracing::warn!(err = %e, "fail-closed blob walk task panicked; pinning nothing this push") - }) - .ok() - .and_then(|r| { - r.map_err(|e| { - tracing::warn!(err = %e, "fail-closed blob walk failed; pinning nothing this push") - }) - .ok() - }) - .unwrap_or_default() } else { crate::git::visibility_pack::replicable_objects(pin_set.candidates, &withheld_set) } diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index 874f819..0cd9fa0 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -807,4 +807,119 @@ mod tests { "private repo must not be federated to anonymous callers (#97)" ); } + + #[sqlx::test] + async fn graphql_repos_hides_private_from_anonymous(pool: PgPool) { + // The GraphQL repos query is the third listing surface; an anonymous + // query must not enumerate a private repo (#97). + let owner = "did:key:zGQLOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = state + .graphql_schema + .execute(async_graphql::Request::new("{ repos { name } }")) + .await; + assert!(resp.errors.is_empty(), "graphql errors: {:?}", resp.errors); + let names = names_in(&resp.data.into_json().expect("graphql json")["repos"]); + assert!( + names.contains(&"pub-repo".to_string()), + "public repo listed" + ); + assert!( + !names.contains(&"priv-repo".to_string()), + "private repo must not be enumerable via anonymous GraphQL (#97)" + ); + } + + #[sqlx::test] + async fn list_repos_paged_count_excludes_private(pool: PgPool) { + // The paged path (limit set) is the KTD2 exploit shape: a pre-cut page + + // SQL total would leak the private-repo count. Assert X-Total-Count is the + // visible count and the page is not short (#97). + let owner = "did:key:zPAGEOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-a")) + .await + .expect("seed public a"); + state + .db + .create_repo(&seed_repo(owner, "pub-b")) + .await + .expect("seed public b"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos?limit=10")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert_eq!( + total.as_deref(), + Some("2"), + "paged X-Total-Count must reflect only the 2 visible repos, not leak the private count" + ); + assert_eq!( + names.len(), + 2, + "page must not be short: both public repos present" + ); + assert!(!names.contains(&"priv-repo".to_string())); + } + + #[sqlx::test] + async fn list_repos_hides_public_repo_under_root_deny(pool: PgPool) { + // Proves the gate is visibility_check, not a bare is_public filter, in the + // negative direction: an is_public=true repo with a root deny rule (mode B, + // no readers) is NOT listable to anonymous, while a plain public repo is. + let owner = "did:key:zDENYOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "open-repo")) + .await + .expect("seed open"); + let denied = seed_repo(owner, "deny-repo"); // is_public = true + state.db.create_repo(&denied).await.expect("seed denied"); + state + .db + .set_visibility_rule(&denied.id, "/", crate::db::VisibilityMode::B, &[], owner) + .await + .expect("root deny rule"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos")) + .await + .unwrap(); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"open-repo".to_string()), + "plain public repo listed" + ); + assert!( + !names.contains(&"deny-repo".to_string()), + "is_public=true repo with a root deny must NOT be listed (proves visibility_check, not is_public)" + ); + } } From 33a53d4ef11402ac6fb62898886918ab6e18cfc6 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 25 Jun 2026 09:03:53 -0500 Subject: [PATCH 5/9] fix(review): route the last inline root-visibility check through listable_at_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-2 review follow-up. replication_withheld_set still made the announce decision with an inline visibility_check(.., "/") == Allow — the fourth copy of the listing gate the seam refactor was meant to collapse. Route it through crate::visibility::listable_at_root (caller is always None, the anonymous replication audience), so every reader of the '/' decision now shares one seam. Also drop an em dash from the fail_closed_full_scan_objects doc comment. --- crates/gitlawb-node/src/api/repos.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index d8a09a8..4c7948f 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -46,7 +46,7 @@ async fn replication_withheld_set( disk_path: std::path::PathBuf, ) -> (bool, Option>) { let announce = match &rules { - Some(rules) => visibility_check(rules, is_public, owner_did, None, "/") == Decision::Allow, + Some(rules) => crate::visibility::listable_at_root(rules, is_public, owner_did, None), None => false, }; if !announce { @@ -97,7 +97,7 @@ async fn replication_withheld_set( /// The full-scan candidate set includes dangling objects the reachable-only /// withheld set never classified, so compute the reachable visibility-allowed /// blob set and the all-blob universe off the async worker and keep only -/// non-blobs plus allowed blobs. Any error in either walk — or a task panic — +/// non-blobs plus allowed blobs. Any error in either walk (or a task panic) /// pins nothing this push, mirroring the degraded-path shape of /// `replication_withheld_set`. async fn fail_closed_full_scan_objects( From e76f09a2b626485998589247edcd59db09926f31 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Thu, 25 Jun 2026 12:58:26 -0500 Subject: [PATCH 6/9] fix(node): gate the /api/v1/stats repos count through visibility to close the count oracle (#104) The stats endpoint returned count_repos_deduped(): an unfiltered, unauthenticated count of every logical repo including is_public=false and mode-A hide-existence repos. A polling observer could detect that a hidden repo was created (N to N+1), the exact existence signal mode-A suppresses everywhere else. Count only the repos an anonymous caller could list: over-fetch the deduped set, batch-load visibility rules, and keep rows that pass listable_at_root(.., None), mirroring the listing seam in api/repos.rs. The caller is always None (meta_routes carries no auth layer). Fail closed: any DB error collapses the count to 0, since an under-count cannot leak existence. count_repos_deduped is retained as the tested reference for the DEDUP_CTE dedup count (its tests pin the CASE the live list paths share); it has no production caller after this change and is allowed dead outside tests. Tests: bare-private (no-rule branch), hide-existence mode-A with both repos is_public=true (the Some(rule) branch), public-under-root-deny, list-parity, sibling-fields-preserved, and empty-DB. --- crates/gitlawb-node/src/db/mod.rs | 14 ++- crates/gitlawb-node/src/server.rs | 25 +++- crates/gitlawb-node/src/test_support.rs | 161 ++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 6 deletions(-) diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 2e9c91f..5a4e91e 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -1003,10 +1003,16 @@ impl Db { Ok(rows.into_iter().map(row_to_repo).collect()) } - /// Count of distinct logical repos (mirror + canonical collapsed), for - /// `/api/v1/stats`. Uses the same did:key-aware owner-key grouping as - /// `DEDUP_CTE` (the CASE must stay byte-identical); the marker/tiebreak only - /// decide which row would survive, not the group count, so they are not needed here. + /// Count of distinct logical repos (mirror + canonical collapsed). Uses the + /// same did:key-aware owner-key grouping as `DEDUP_CTE` (the CASE must stay + /// byte-identical); the marker/tiebreak only decide which row would survive, + /// not the group count, so they are not needed here. + /// + /// `/api/v1/stats` no longer calls this — it counts only anonymously-listable + /// repos to avoid a count oracle (#104). Retained as the tested reference + /// implementation of the unfiltered dedup count: its tests pin the `DEDUP_CTE` + /// CASE that the live list paths depend on. Allowed dead outside tests. + #[cfg_attr(not(test), allow(dead_code))] pub async fn count_repos_deduped(&self) -> Result { let row = sqlx::query( "SELECT COUNT(DISTINCT (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name)) AS cnt FROM repos", diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index 122b71f..a0b55fc 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -451,8 +451,29 @@ async fn node_info(State(state): State) -> Json { })) } -async fn stats(State(state): State) -> Json { - let repos = state.db.count_repos_deduped().await.unwrap_or(0); +pub(crate) async fn stats(State(state): State) -> Json { + // Count only the repos an anonymous caller could list, so the aggregate does + // not leak the existence of private/mode-A repos (#104 count oracle). Mirror + // the listing seam (api/repos.rs): over-fetch the deduped set, batch-load the + // visibility rules, and keep rows that pass listable_at_root. The caller is + // always None — meta_routes carries no auth layer (see the route group in this + // file). Fail closed: any DB error collapses the whole count to 0 (an + // under-count never leaks existence), preserving the prior `.unwrap_or(0)`. + let repos = async { + let rows = state.db.list_all_repos_deduped_with_stars(None).await?; + let ids: Vec = rows.iter().map(|(r, _)| r.id.clone()).collect(); + let rules_by_repo = state.db.list_visibility_rules_for_repos(&ids).await?; + let count = rows + .iter() + .filter(|(r, _)| { + let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); + crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, None) + }) + .count() as i64; + Ok::(count) + } + .await + .unwrap_or(0); let agents = state.db.count_agents().await.unwrap_or(0); let pushes = state.db.count_pushes().await.unwrap_or(0); Json(json!({ diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index 0cd9fa0..af03e79 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -922,4 +922,165 @@ mod tests { "is_public=true repo with a root deny must NOT be listed (proves visibility_check, not is_public)" ); } + + // ── /api/v1/stats count oracle (#104) ────────────────────────────────── + // The stats endpoint lives in meta_routes (no auth layer), so the caller is + // always anonymous (None). Its `repos` count must withhold private/mode-A + // repos exactly as the listing surfaces do, or it is a count oracle. + + fn stats_router(state: AppState) -> Router { + Router::new() + .route("/api/v1/stats", axum::routing::get(crate::server::stats)) + .with_state(state) + } + + async fn stats_repos_count(state: AppState) -> i64 { + let resp = stats_router(state) + .oneshot(anon_get("/api/v1/stats")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + json_body(resp).await["repos"] + .as_i64() + .expect("stats.repos is an integer") + } + + #[sqlx::test] + async fn stats_repos_count_excludes_bare_private(pool: PgPool) { + // No-rule branch: an is_public=false repo with no visibility rule is + // denied to anonymous, so stats.repos counts only the public repo. + let owner = "did:key:zSTATSPRIVAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + assert_eq!( + stats_repos_count(state).await, + 1, + "stats.repos must not count the private repo (#104 count oracle)" + ); + } + + #[sqlx::test] + async fn stats_repos_count_excludes_hide_existence_repo(pool: PgPool) { + // Some(rule) branch — the #104 subject. Both repos are is_public=true, so + // the only reason the second is withheld is its root rule with empty + // reader_dids (anonymous excluded). Proves the count goes through + // listable_at_root, not a bare is_public predicate. + let owner = "did:key:zSTATSHIDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "open-repo")) + .await + .expect("seed open"); + let hidden = seed_repo(owner, "hidden-repo"); // is_public = true + state.db.create_repo(&hidden).await.expect("seed hidden"); + state + .db + .set_visibility_rule(&hidden.id, "/", crate::db::VisibilityMode::A, &[], owner) + .await + .expect("root hide-existence rule"); + + assert_eq!( + stats_repos_count(state).await, + 1, + "stats.repos must not count a hide-existence (mode-A, empty readers) repo (#104)" + ); + } + + #[sqlx::test] + async fn stats_repos_count_excludes_public_under_root_deny(pool: PgPool) { + // Inverse the seam was built for: an is_public=true repo with a root deny + // (mode B, no readers) must not be counted — is_public alone would count it. + let owner = "did:key:zSTATSDENYAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "open-repo")) + .await + .expect("seed open"); + let denied = seed_repo(owner, "deny-repo"); // is_public = true + state.db.create_repo(&denied).await.expect("seed denied"); + state + .db + .set_visibility_rule(&denied.id, "/", crate::db::VisibilityMode::B, &[], owner) + .await + .expect("root deny rule"); + + assert_eq!( + stats_repos_count(state).await, + 1, + "stats.repos must not count an is_public=true repo under a root deny (#104)" + ); + } + + #[sqlx::test] + async fn stats_repos_count_matches_list_total(pool: PgPool) { + // R2 parity: stats.repos == anonymous GET /api/v1/repos X-Total-Count. + let owner = "did:key:zSTATSPARITYAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let list_total = { + let resp = list_repos_router(state.clone()) + .oneshot(anon_get("/api/v1/repos")) + .await + .unwrap(); + resp.headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .and_then(|s| s.parse::().ok()) + .expect("X-Total-Count header") + }; + + assert_eq!( + stats_repos_count(state).await, + list_total, + "stats.repos must equal the anonymous list X-Total-Count (R2 parity)" + ); + assert_eq!(list_total, 1, "sanity: only the public repo is visible"); + } + + #[sqlx::test] + async fn stats_preserves_sibling_fields(pool: PgPool) { + // R4: the rewrite must not drop agents/pushes/version. + let state = test_state(pool).await; + let resp = stats_router(state) + .oneshot(anon_get("/api/v1/stats")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let body = json_body(resp).await; + for key in ["repos", "agents", "pushes", "version"] { + assert!(body.get(key).is_some(), "stats must still carry `{key}`"); + } + } + + #[sqlx::test] + async fn stats_repos_count_empty_db_is_zero(pool: PgPool) { + let state = test_state(pool).await; + assert_eq!( + stats_repos_count(state).await, + 0, + "empty DB yields repos == 0 without error" + ); + } } From 37fb047a88500f83873f8e9fe29839cc27b52887 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 27 Jun 2026 19:53:19 -0500 Subject: [PATCH 7/9] test(review): add listing visibility regression guards (#97) Three gaps the review flagged as uncovered: - owner-filtered listing (?owner=) still hides a private repo and its count from anonymous (a distinct SQL bind branch from the unfiltered path). - offset past the visible set returns an empty page while X-Total-Count stays the full visible total (guards against deriving the total from the cut page). - a logical repo present as both a canonical row (with a root-deny rule) and a gossip mirror row stays hidden: pins that the DEDUP_CTE picks the canonical survivor so the batch rule lookup finds the deny. If dedup ever picked the mirror (slash-form id, no rule) the gate would fall back to is_public and leak. --- crates/gitlawb-node/src/test_support.rs | 152 ++++++++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index af03e79..433f138 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -923,6 +923,158 @@ mod tests { ); } + #[sqlx::test] + async fn list_repos_owner_filter_excludes_private_from_anonymous(pool: PgPool) { + // The owner-filtered path (?owner=, SQL $1 bind) must still apply the Rust + // "/" visibility gate: an anonymous caller filtering by an owner sees that + // owner's public repos but never their private ones, and the count does + // not leak (#97). This is a distinct SQL branch from the unfiltered path. + let short = "zOWNERFILTERAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let owner = format!("did:key:{short}"); + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(&owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(&owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(anon_get(&format!("/api/v1/repos?owner={short}&limit=10"))) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"pub-repo".to_string()), + "owner's public repo listed" + ); + assert!( + !names.contains(&"priv-repo".to_string()), + "owner's private repo hidden from anonymous even when owner-filtered (#97)" + ); + assert_eq!( + total.as_deref(), + Some("1"), + "owner-filtered X-Total-Count must exclude the private repo" + ); + } + + #[sqlx::test] + async fn list_repos_pagination_offset_past_end_keeps_total(pool: PgPool) { + // Pagination edge: an offset past the visible set returns an empty page, + // but X-Total-Count still reflects the full visible count -- so paging can + // neither short the page nor leak a different total (#97). Guards against a + // refactor that derives the total from the cut page instead of the set. + let owner = "did:key:zOFFSETOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-a")) + .await + .expect("seed public a"); + state + .db + .create_repo(&seed_repo(owner, "pub-b")) + .await + .expect("seed public b"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos?limit=5&offset=100")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert!(names.is_empty(), "offset past the end yields an empty page"); + assert_eq!( + total.as_deref(), + Some("2"), + "X-Total-Count stays the full visible total regardless of offset" + ); + } + + #[sqlx::test] + async fn list_repos_hides_canonical_under_root_deny_even_with_mirror(pool: PgPool) { + // Regression guard for the dedup-survivor + visibility-rule seam. A logical + // repo present as BOTH a canonical row (carrying a root-deny rule) and a + // gossip mirror row: the DEDUP_CTE must pick the canonical survivor so the + // batch rule lookup (keyed by the survivor's id) finds the deny and + // withholds it. If dedup ever picked the mirror (slash-form id, no rule), + // the gate would fall back to is_public=true and leak the repo. is_public + // is true here, so the rule is the only thing hiding it. + let short = "zMIRRORDENYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let owner = format!("did:key:{short}"); + let state = test_state(pool).await; + let canonical = seed_repo(&owner, "secret"); // is_public = true + state + .db + .create_repo(&canonical) + .await + .expect("seed canonical"); + state + .db + .set_visibility_rule( + &canonical.id, + "/", + crate::db::VisibilityMode::B, + &[], + &owner, + ) + .await + .expect("root deny rule on canonical"); + state + .db + .upsert_mirror_repo(short, "secret", "/tmp/mirror", None) + .await + .expect("seed mirror"); + state + .db + .create_repo(&seed_repo(&owner, "open")) + .await + .expect("seed public sibling"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert!(names.contains(&"open".to_string()), "public sibling listed"); + assert!( + !names.contains(&"secret".to_string()), + "canonical repo with a root deny must stay hidden even when a mirror row exists (#97 dedup-survivor/rule seam)" + ); + assert_eq!( + total.as_deref(), + Some("1"), + "X-Total-Count counts only the visible sibling, not the mirror+canonical pair" + ); + } + // ── /api/v1/stats count oracle (#104) ────────────────────────────────── // The stats endpoint lives in meta_routes (no auth layer), so the caller is // always anonymous (None). Its `repos` count must withhold private/mode-A From 90ab7acc1bde7ae498256477f9a00b927549889c Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 27 Jun 2026 20:18:30 -0500 Subject: [PATCH 8/9] fix(review): address CodeRabbit feedback on #111 - visibility_pack NFC/NFD test: accept a SHA-1 or SHA-256 object id (40 | 64) instead of hardcoding 40, matching the fixture guard later in the file, so the test is not SHA-1-only. - federated listing test: also assert the response `count` reflects only the visible repos, guarding against a regression to the pre-filter total. - add an authenticated-caller GraphQL test: an owner sees their own private repo via `repos`, covering the resolver's caller-DID path that the anonymous-only test did not exercise. - stats: count via the no-stars `list_all_repos_deduped()` (same DEDUP_CTE) instead of the stars-joined variant, since the count never needs stars. --- .../gitlawb-node/src/git/visibility_pack.rs | 7 ++-- crates/gitlawb-node/src/server.rs | 8 +++-- crates/gitlawb-node/src/test_support.rs | 34 +++++++++++++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index a3e0492..578ee40 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -1029,9 +1029,10 @@ mod tests { let secret_oid = oid(&format!("{nfd_dir}/key.pem")); let public_oid = oid("public.txt"); // Guard against a vacuous pass: the NFD-named blob must actually exist. - assert_eq!( - secret_oid.len(), - 40, + // Accept SHA-1 (40) or SHA-256 (64) object ids so the test is + // hash-format agnostic, matching the fixture guard later in this file. + assert!( + matches!(secret_oid.len(), 40 | 64), "secret blob was not stored under the NFD path" ); run( diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index a0b55fc..11a4724 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -460,12 +460,14 @@ pub(crate) async fn stats(State(state): State) -> Json = rows.iter().map(|(r, _)| r.id.clone()).collect(); + // stats only needs the count, so use the no-stars deduped list (same + // DEDUP_CTE) and skip the repo_stars aggregation the listing path needs. + let rows = state.db.list_all_repos_deduped().await?; + let ids: Vec = rows.iter().map(|r| r.id.clone()).collect(); let rules_by_repo = state.db.list_visibility_rules_for_repos(&ids).await?; let count = rows .iter() - .filter(|(r, _)| { + .filter(|r| { let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, None) }) diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index 433f138..b511752 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -798,6 +798,11 @@ mod tests { .unwrap(); let body = json_body(resp).await; let names = names_in(&body["repos"]); + assert_eq!( + body["count"].as_u64(), + Some(1), + "federated count must reflect only the visible repos, not the pre-filter total (#97)" + ); assert!( names.contains(&"pub-repo".to_string()), "public repo federated" @@ -841,6 +846,35 @@ mod tests { ); } + #[sqlx::test] + async fn graphql_repos_shows_authorized_caller_their_private_repo(pool: PgPool) { + // Positive path: the resolver pulls the caller DID from GraphQL request + // data, so the authenticated context must still surface a private repo its + // owner may read. Guards an auth-context regression on the GraphQL surface + // that the anonymous-only test would miss (#97). + let owner = "did:key:zGQLAUTHOWNERAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = state + .graphql_schema + .execute( + async_graphql::Request::new("{ repos { name } }") + .data(AuthenticatedDid(owner.to_string())), + ) + .await; + assert!(resp.errors.is_empty(), "graphql errors: {:?}", resp.errors); + let names = names_in(&resp.data.into_json().expect("graphql json")["repos"]); + assert!( + names.contains(&"priv-repo".to_string()), + "owner must see their own private repo via authenticated GraphQL (#97)" + ); + } + #[sqlx::test] async fn list_repos_paged_count_excludes_private(pool: PgPool) { // The paged path (limit set) is the KTD2 exploit shape: a pre-cut page + From c1636d8723d451beef236f15250a1dd80b372236 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 27 Jun 2026 21:21:43 -0500 Subject: [PATCH 9/9] fix(review): match owner filter to did_matches for bare-owner mirror rows (#111) Addresses jatmn's P2. The shared listing CTE filtered owners with `owner_did = $1 OR owner_did LIKE '%:' || $1`, so a full `did:key:z...` query did not match a mirror-only row stored with the bare owner `z...`. #97 routed the no-limit owner-filtered path (used by `gl repo list`) through this SQL predicate, regressing the full-DID-against-bare-mirror direction that `crate::api::did_matches` handled in the old Rust path. Normalize the bound owner to its did:key short form and compare it against the row's owner key using the same CASE the dedup grouping already applies, so full and short forms match symmetrically and a non-key DID still matches only exactly. The no-arg `list_all_repos_deduped()` binds NULL and is unaffected. Regression test: a bare-owner mirror-only row is returned when queried by both the full `did:key:` form and the bare short form. Confirmed RED (test fails) before the fix. --- crates/gitlawb-node/src/db/mod.rs | 21 +++++++++++-- crates/gitlawb-node/src/test_support.rs | 39 +++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 5a4e91e..9fc65e0 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -934,7 +934,11 @@ impl Db { ) AS updated_at, disk_path, forked_from, machine_id FROM repos - WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1) + -- Match the owner filter on the same did:key-aware owner key the + -- dedup groups on, so a full did:key: form matches a bare-owner + -- mirror row (and vice versa) exactly as crate::api::did_matches + -- does. Callers bind the already-normalized key ($1). + WHERE ($1::text IS NULL OR (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END) = $1) ORDER BY CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name, -- mirror rows carry a slash-form id (\"{owner_short}/{name}\"), -- written only by upsert_mirror_repo; canonical ids are UUIDs. @@ -949,10 +953,23 @@ impl Db { /// set (no SQL pagination): the listing surface filters by per-caller `"/"` /// visibility in Rust and paginates after, so neither the page nor the count /// leaks a repo the caller may not read (#97). + /// + /// The owner filter is normalized to its did:key short form before binding so + /// the SQL predicate matches `crate::api::did_matches`: a full `did:key:z...` + /// query and a bare-owner mirror row (`z...`) match each other, and vice + /// versa. A non-key DID (still has a `:` after the prefix) is left intact and + /// only matches exactly. pub async fn list_all_repos_deduped_with_stars( &self, owner_filter: Option<&str>, ) -> Result> { + // Mirror did_matches: strip `did:key:` only when the remainder is a bare + // key id (no further `:`). The DEDUP_CTE applies the identical CASE to + // owner_did, so the two compare on the same normalized key. + let owner_key = owner_filter.map(|o| match o.strip_prefix("did:key:") { + Some(rest) if !rest.contains(':') => rest, + _ => o, + }); let sql = format!( "{} SELECT @@ -968,7 +985,7 @@ impl Db { Self::DEDUP_CTE ); let rows = sqlx::query(&sql) - .bind(owner_filter) + .bind(owner_key) .fetch_all(&self.pool) .await?; diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index b511752..4313c23 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -1003,6 +1003,45 @@ mod tests { ); } + #[sqlx::test] + async fn list_repos_owner_filter_full_did_matches_bare_mirror(pool: PgPool) { + // A mirror-only repo (known via gossip, no local canonical row) stores the + // bare owner key `z...`. Filtering by the full `did:key:z...` form must + // still return it, matching crate::api::did_matches — the behavior the + // no-limit `gl repo list --owner` path relied on before #97 routed owner + // filtering through SQL (jatmn P2 on #111). Both owner forms must match. + let short = "zMIRRORONLYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .upsert_mirror_repo(short, "mirror-repo", "/tmp/mirror", None) + .await + .expect("seed mirror-only row"); + + // full did:key: form must match the bare-owner mirror row + let resp = list_repos_router(state.clone()) + .oneshot(anon_get(&format!("/api/v1/repos?owner=did:key:{short}"))) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"mirror-repo".to_string()), + "full did:key: owner filter must match a bare-owner mirror row (jatmn #111)" + ); + + // short bare form must still match + let resp = list_repos_router(state) + .oneshot(anon_get(&format!("/api/v1/repos?owner={short}"))) + .await + .unwrap(); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"mirror-repo".to_string()), + "short-form owner filter must still match the mirror row" + ); + } + #[sqlx::test] async fn list_repos_pagination_offset_past_end_keeps_total(pool: PgPool) { // Pagination edge: an offset past the visible set returns an empty page,