diff --git a/Cargo.lock b/Cargo.lock index 2866a23..849e081 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3403,6 +3403,7 @@ dependencies = [ "tower-http", "tracing", "tracing-subscriber", + "unicode-normalization", "uuid", "zstd", ] diff --git a/crates/gitlawb-node/Cargo.toml b/crates/gitlawb-node/Cargo.toml index 881d4bd..2c4a329 100644 --- a/crates/gitlawb-node/Cargo.toml +++ b/crates/gitlawb-node/Cargo.toml @@ -39,6 +39,9 @@ hmac = { workspace = true } http-body-util = "0.1" tokio-util = { version = "0.7", features = ["io"] } dirs-next = "2" +# NFC-normalize visibility globs/paths at the matcher seam so a deny rule +# withholds a canonically-equivalent path regardless of NFC/NFD form (#101). +unicode-normalization = "0.1" reqwest = { workspace = true } libp2p-core = "0.43" libp2p-gossipsub = "0.49.4" diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 6afe68f..4c7948f 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -46,7 +46,7 @@ async fn replication_withheld_set( disk_path: std::path::PathBuf, ) -> (bool, Option>) { let announce = match &rules { - Some(rules) => visibility_check(rules, is_public, owner_did, None, "/") == Decision::Allow, + Some(rules) => crate::visibility::listable_at_root(rules, is_public, owner_did, None), None => false, }; if !announce { @@ -93,6 +93,43 @@ async fn replication_withheld_set( } } +/// The replicable object set for a full-scan pin fallback, failing closed (#99). +/// The full-scan candidate set includes dangling objects the reachable-only +/// withheld set never classified, so compute the reachable visibility-allowed +/// blob set and the all-blob universe off the async worker and keep only +/// non-blobs plus allowed blobs. Any error in either walk (or a task panic) +/// pins nothing this push, mirroring the degraded-path shape of +/// `replication_withheld_set`. +async fn fail_closed_full_scan_objects( + disk_path: std::path::PathBuf, + rules: Vec, + is_public: bool, + owner_did: String, + candidates: Vec, +) -> Vec { + tokio::task::spawn_blocking(move || -> anyhow::Result> { + let allowed = crate::git::visibility_pack::replicable_blob_set( + &disk_path, &rules, is_public, &owner_did, + )?; + let all_blobs = crate::git::push_delta::all_blob_oids(&disk_path)?; + Ok(crate::git::visibility_pack::replicable_objects_fail_closed( + candidates, &allowed, &all_blobs, + )) + }) + .await + .map_err(|e| { + tracing::warn!(err = %e, "fail-closed blob walk task panicked; pinning nothing this push") + }) + .ok() + .and_then(|r| { + r.map_err(|e| { + tracing::warn!(err = %e, "fail-closed blob walk failed; pinning nothing this push") + }) + .ok() + }) + .unwrap_or_default() +} + // ── Request / Response types ─────────────────────────────────────────────── #[derive(Debug, Deserialize)] @@ -207,50 +244,61 @@ pub struct ListReposQuery { /// present, returns one page and the `X-Total-Count` response header carries the /// total matching row count. Without `limit`, falls back to returning every row /// (kept for backwards compat with peer sync and existing CLI tooling). +/// +/// Every returned row passes the per-caller `"/"` visibility gate +/// (`crate::visibility::listable_at_root`), the same decision the per-repo +/// content endpoints make, so neither the page nor `X-Total-Count` leaks a repo +/// (or its mere count) the caller may not read (#97). pub async fn list_repos( State(state): State, Query(query): Query, + auth: Option>, ) -> Result { use axum::http::HeaderValue; use axum::response::IntoResponse; - if let Some(raw_limit) = query.limit { - let limit = raw_limit.clamp(1, 200); - let offset = query.offset.unwrap_or(0).max(0); - let (rows, total) = state - .db - .list_all_repos_paged(query.owner.as_deref(), limit, offset) - .await?; - let body: Vec = rows - .into_iter() - .map(|(r, stars)| to_response(&r, &state, stars)) - .collect(); - let mut response = Json(body).into_response(); - response.headers_mut().insert( - "X-Total-Count", - HeaderValue::from_str(&total.to_string()).unwrap_or(HeaderValue::from_static("0")), - ); - return Ok(response); - } + let caller = auth.as_ref().map(|e| e.0 .0.as_str()); - let repos = state.db.list_all_repos_with_stars().await?; - let filtered: Vec<(RepoRecord, i64)> = repos + // Over-fetch the deduped set (did:key-aware DEDUP_CTE collapses mirror rows), + // then apply the per-repo "/" visibility gate in Rust BEFORE pagination so + // neither the page nor X-Total-Count leaks a repo the caller may not read — + // including its mere count. The "/" decision depends on owner short/full-DID + // matching and JSON reader-DID membership, so it cannot be a clean SQL + // predicate without drifting from visibility_check; the count is derived from + // the visible set (#97). + let owner_filtered = state + .db + .list_all_repos_deduped_with_stars(query.owner.as_deref()) + .await?; + + let ids: Vec = owner_filtered.iter().map(|(r, _)| r.id.clone()).collect(); + let rules_by_repo = state.db.list_visibility_rules_for_repos(&ids).await?; + let visible: Vec<(crate::db::RepoRecord, i64)> = owner_filtered .into_iter() .filter(|(r, _)| { - if let Some(owner) = &query.owner { - crate::api::did_matches(owner.as_str(), &r.owner_did) - } else { - true - } + let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); + crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, caller) }) .collect(); - let deduped = dedupe_canonical_repos(filtered); - let total = deduped.len() as i64; - let resp: Vec<_> = deduped + + let total = visible.len() as i64; + + // Paginate in Rust when a limit is set: SQL LIMIT/OFFSET cannot run before + // the visibility filter without returning short pages and a leaked count. + let page: Vec<(crate::db::RepoRecord, i64)> = match query.limit { + Some(raw_limit) => { + let limit = raw_limit.clamp(1, 200) as usize; + let offset = query.offset.unwrap_or(0).max(0) as usize; + visible.into_iter().skip(offset).take(limit).collect() + } + None => visible, + }; + + let body: Vec = page .into_iter() .map(|(r, stars)| to_response(&r, &state, stars)) .collect(); - let mut response = Json(resp).into_response(); + let mut response = Json(body).into_response(); response.headers_mut().insert( "X-Total-Count", HeaderValue::from_str(&total.to_string()).unwrap_or(HeaderValue::from_static("0")), @@ -882,12 +930,14 @@ pub async fn git_receive_pack( ) .await; - // Compute the per-push pin candidate set once, off the async worker (git - // subprocess). On the happy path this is the push delta (objects this push - // introduced); on any non-commit tip or git error it fails closed to a full - // repo scan. Only needed when something will actually replicate. All - // degraded paths log inside the helper rather than failing silently. - let candidates: Vec = if withheld.is_some() { + // Resolve the per-push pin candidate set once, off the async worker, then + // filter to what may actually replicate. Delta path: the reachable-only + // `withheld` set suffices (delta objects are reachable). Full-scan path: the + // candidate set can include dangling blobs the withheld set never classified, + // so fail closed — replicate a blob only if it is reachable AND + // visibility-allowed (#99). Only computed when something will actually + // replicate; every degraded path logs rather than failing silently. + let object_list: Vec = if let Some(withheld_set) = withheld.clone() { let new_tips: Vec = ref_updates .iter() .map(|u| u.new_sha.clone()) @@ -898,16 +948,32 @@ pub async fn git_receive_pack( .map(|u| u.old_sha.clone()) .filter(|s| s != ZERO_SHA) .collect(); - crate::git::push_delta::resolve_candidates_for_push(disk_path.clone(), new_tips, old_tips) + let pin_set = crate::git::push_delta::resolve_candidates_for_push( + disk_path.clone(), + new_tips, + old_tips, + ) + .await; + if pin_set.full_scan { + fail_closed_full_scan_objects( + disk_path.clone(), + rules_opt.clone().unwrap_or_default(), + record.is_public, + record.owner_did.clone(), + pin_set.candidates, + ) .await + } else { + crate::git::visibility_pack::replicable_objects(pin_set.candidates, &withheld_set) + } } else { Vec::new() }; // Pin new git objects to the local IPFS node (no-op if ipfs_api is empty). // Skipped entirely when the public cannot read the repo (withheld == None). - if let Some(withheld_ipfs) = withheld.clone() { - let candidates_ipfs = candidates.clone(); + if withheld.is_some() { + let object_list_ipfs = object_list.clone(); let ipfs_api = state.config.ipfs_api.clone(); let repo_path_clone = disk_path.clone(); let db_clone = state.db.clone(); @@ -924,9 +990,8 @@ pub async fn git_receive_pack( let pinned = crate::ipfs_pin::pin_new_objects( &ipfs_api, &repo_path_clone, - candidates_ipfs, + object_list_ipfs, &db_clone, - &withheld_ipfs, ) .await; if !pinned.is_empty() { @@ -1030,23 +1095,21 @@ pub async fn git_receive_pack( let owner_did_for_arweave = record.owner_did.clone(); let self_public_url = state.config.public_url.clone(); let node_keypair = Arc::clone(&state.node_keypair); - let withheld_pinata = withheld; - let candidates_pinata = candidates; + let object_list_pinata = object_list; + let do_pinata_replication = withheld.is_some(); tokio::spawn(async move { - let pinned = match &withheld_pinata { - Some(withheld) => { - crate::pinata::pin_new_objects( - &http_client, - &pinata_upload_url, - &pinata_jwt, - &repo_path_clone, - candidates_pinata, - &db_clone, - withheld, - ) - .await - } - None => Vec::new(), + let pinned = if do_pinata_replication { + crate::pinata::pin_new_objects( + &http_client, + &pinata_upload_url, + &pinata_jwt, + &repo_path_clone, + object_list_pinata, + &db_clone, + ) + .await + } else { + Vec::new() }; if !pinned.is_empty() { @@ -1203,8 +1266,24 @@ pub async fn list_refs( /// which node hosts it. Results from unreachable peers are silently omitted. pub async fn list_federated_repos( State(state): State, + auth: Option>, ) -> Result> { + let caller = auth.as_ref().map(|e| e.0 .0.as_str()); let local_repos = dedupe_canonical_repos(state.db.list_all_repos_with_stars().await?); + + // Hide local repos the caller may not read at "/" before federating them, so + // the federated surface does not enumerate private repos (#97). Peer repos + // arrive already filtered by each peer's own /api/v1/repos (anonymous view). + let ids: Vec = local_repos.iter().map(|(r, _)| r.id.clone()).collect(); + let rules_by_repo = state.db.list_visibility_rules_for_repos(&ids).await?; + let local_repos: Vec<(crate::db::RepoRecord, i64)> = local_repos + .into_iter() + .filter(|(r, _)| { + let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); + crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, caller) + }) + .collect(); + let local_node_url = state .config .public_url diff --git a/crates/gitlawb-node/src/db/mod.rs b/crates/gitlawb-node/src/db/mod.rs index 36e44ff..9fc65e0 100644 --- a/crates/gitlawb-node/src/db/mod.rs +++ b/crates/gitlawb-node/src/db/mod.rs @@ -919,7 +919,7 @@ impl Db { /// stay byte-identical or Postgres stops using the index. /// The canonical row wins (mirror rows carry a slash-form `id` written only by /// `upsert_mirror_repo`), ties broken by earliest `created_at` then `id` so a - /// full tie still picks a deterministic survivor. `list_all_repos_paged`, + /// full tie still picks a deterministic survivor. `list_all_repos_deduped_with_stars`, /// `list_all_repos_deduped`, and the marker logic in /// `crate::api::repos::dedupe_canonical_repos` must stay in sync. const DEDUP_CTE: &'static str = "WITH deduped AS ( @@ -934,7 +934,11 @@ impl Db { ) AS updated_at, disk_path, forked_from, machine_id FROM repos - WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1) + -- Match the owner filter on the same did:key-aware owner key the + -- dedup groups on, so a full did:key: form matches a bare-owner + -- mirror row (and vice versa) exactly as crate::api::did_matches + -- does. Callers bind the already-normalized key ($1). + WHERE ($1::text IS NULL OR (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END) = $1) ORDER BY CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name, -- mirror rows carry a slash-form id (\"{owner_short}/{name}\"), -- written only by upsert_mirror_repo; canonical ids are UUIDs. @@ -944,62 +948,54 @@ impl Db { created_at ASC, id ASC )"; - pub async fn list_all_repos_paged( + /// All repos with star counts, mirror-deduplicated via `DEDUP_CTE` and + /// ordered newest-first, optionally filtered to one owner. Returns the full + /// set (no SQL pagination): the listing surface filters by per-caller `"/"` + /// visibility in Rust and paginates after, so neither the page nor the count + /// leaks a repo the caller may not read (#97). + /// + /// The owner filter is normalized to its did:key short form before binding so + /// the SQL predicate matches `crate::api::did_matches`: a full `did:key:z...` + /// query and a bare-owner mirror row (`z...`) match each other, and vice + /// versa. A non-key DID (still has a `:` after the prefix) is left intact and + /// only matches exactly. + pub async fn list_all_repos_deduped_with_stars( &self, owner_filter: Option<&str>, - limit: i64, - offset: i64, - ) -> Result<(Vec<(RepoRecord, i64)>, i64)> { + ) -> Result> { + // Mirror did_matches: strip `did:key:` only when the remainder is a bare + // key id (no further `:`). The DEDUP_CTE applies the identical CASE to + // owner_did, so the two compare on the same normalized key. + let owner_key = owner_filter.map(|o| match o.strip_prefix("did:key:") { + Some(rest) if !rest.contains(':') => rest, + _ => o, + }); let sql = format!( "{} SELECT d.id, d.name, d.owner_did, d.description, d.is_public, d.default_branch, d.created_at, d.updated_at, d.disk_path, d.forked_from, d.machine_id, - COALESCE(s.cnt, 0) AS star_count, - COUNT(*) OVER () AS total_count + COALESCE(s.cnt, 0) AS star_count FROM deduped d LEFT JOIN ( SELECT repo_id, COUNT(*) AS cnt FROM repo_stars GROUP BY repo_id ) s ON s.repo_id = d.id - ORDER BY d.updated_at DESC - LIMIT $2 OFFSET $3", + ORDER BY d.updated_at DESC", Self::DEDUP_CTE ); let rows = sqlx::query(&sql) - .bind(owner_filter) - .bind(limit) - .bind(offset) + .bind(owner_key) .fetch_all(&self.pool) .await?; - let total = rows - .first() - .map(|r| r.get::("total_count")) - .unwrap_or(0); - let out: Vec<(RepoRecord, i64)> = rows + Ok(rows .into_iter() .map(|r| { let stars: i64 = r.get("star_count"); (row_to_repo(r), stars) }) - .collect(); - - let total = if out.is_empty() { - let row = sqlx::query( - "SELECT COUNT(DISTINCT (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name)) AS cnt - FROM repos - WHERE ($1::text IS NULL OR owner_did = $1 OR owner_did LIKE '%:' || $1)", - ) - .bind(owner_filter) - .fetch_one(&self.pool) - .await?; - row.get::("cnt") - } else { - total - }; - - Ok((out, total)) + .collect()) } /// Deduped repo list (no stars, no paging) for the unfiltered read surfaces @@ -1024,10 +1020,16 @@ impl Db { Ok(rows.into_iter().map(row_to_repo).collect()) } - /// Count of distinct logical repos (mirror + canonical collapsed), for - /// `/api/v1/stats`. Uses the same did:key-aware owner-key grouping as - /// `DEDUP_CTE` (the CASE must stay byte-identical); the marker/tiebreak only - /// decide which row would survive, not the group count, so they are not needed here. + /// Count of distinct logical repos (mirror + canonical collapsed). Uses the + /// same did:key-aware owner-key grouping as `DEDUP_CTE` (the CASE must stay + /// byte-identical); the marker/tiebreak only decide which row would survive, + /// not the group count, so they are not needed here. + /// + /// `/api/v1/stats` no longer calls this — it counts only anonymously-listable + /// repos to avoid a count oracle (#104). Retained as the tested reference + /// implementation of the unfiltered dedup count: its tests pin the `DEDUP_CTE` + /// CASE that the live list paths depend on. Allowed dead outside tests. + #[cfg_attr(not(test), allow(dead_code))] pub async fn count_repos_deduped(&self) -> Result { let row = sqlx::query( "SELECT COUNT(DISTINCT (CASE WHEN owner_did LIKE 'did:key:%' AND position(':' in substr(owner_did, 9)) = 0 THEN substr(owner_did, 9) ELSE owner_did END, name)) AS cnt FROM repos", @@ -2468,6 +2470,45 @@ impl Db { }) .collect()) } + + /// All visibility rules for a set of repos, grouped by `repo_id`, in one + /// query. The listing surfaces use this to apply the same `"/"` visibility + /// decision the per-repo endpoints make without an N+1 per-repo rule fetch + /// (#97). Repos with no rules are simply absent from the map. + pub async fn list_visibility_rules_for_repos( + &self, + repo_ids: &[String], + ) -> Result>> { + use std::collections::HashMap; + if repo_ids.is_empty() { + return Ok(HashMap::new()); + } + let rows = sqlx::query( + "SELECT id, repo_id, path_glob, mode, reader_dids, created_by, created_at + FROM visibility_rules WHERE repo_id = ANY($1) ORDER BY path_glob", + ) + .bind(repo_ids) + .fetch_all(&self.pool) + .await?; + let mut out: HashMap> = HashMap::new(); + for r in rows { + let readers: String = r.get("reader_dids"); + let created_at: String = r.get("created_at"); + let rule = VisibilityRule { + id: r.get("id"), + repo_id: r.get("repo_id"), + path_glob: r.get("path_glob"), + mode: VisibilityMode::from_db(&r.get::("mode")), + reader_dids: serde_json::from_str(&readers).unwrap_or_default(), + created_by: r.get("created_by"), + created_at: created_at + .parse::>() + .unwrap_or_else(|_| Utc::now()), + }; + out.entry(rule.repo_id.clone()).or_default().push(rule); + } + Ok(out) + } } // ── Repo Stars ──────────────────────────────────────────────────────────────── diff --git a/crates/gitlawb-node/src/git/push_delta.rs b/crates/gitlawb-node/src/git/push_delta.rs index 21965db..7ab0081 100644 --- a/crates/gitlawb-node/src/git/push_delta.rs +++ b/crates/gitlawb-node/src/git/push_delta.rs @@ -10,16 +10,24 @@ //! ## Correctness framing (do not confuse with the #84 withholding direction) //! //! The pin enumeration is part of the *exposure* set, not the withheld filter. -//! Narrowing it per push only *shrinks* what is pinned, so it cannot create an -//! under-withholding leak — the withheld filter (`visibility_pack`) still -//! subtracts from whatever, smaller, set we feed it. The only risk here is -//! *under-pinning* (a durability gap), which the reconciliation sweep backstops. +//! On the *delta* path, narrowing per push only *shrinks* what is pinned, so it +//! cannot create an under-withholding leak — the reachable-only withheld filter +//! (`visibility_pack`) still subtracts from the smaller set we feed it. The only +//! risk there is *under-pinning* (a durability gap) the sweep backstops. +//! +//! The *full-scan* path is different: `list_all_objects` includes dangling +//! objects the reachable withheld set never classified, so subtracting that set +//! is not enough — a dangling private blob would slip through (#99). The caller +//! signals a full scan via [`PinCandidateSet::full_scan`] and must then apply +//! the fail-closed blob filter (`visibility_pack::replicable_objects_fail_closed`) +//! instead of the plain reachable-only subtraction. //! //! Because the pin candidate set needs only the OID *set* (never the per-path //! information the withheld classifier needs), `git rev-list --objects //! --no-object-names` is safe here. The "rev-list reports one path per object" //! trap from #84 applies only to `visibility_pack`'s per-path `ls-tree` walk. +use std::collections::HashSet; use std::path::{Path, PathBuf}; use std::process::Command; @@ -191,6 +199,61 @@ pub fn list_all_objects(repo_path: &Path) -> Result> { .collect()) } +/// Like [`list_all_objects`] but pairs each OID with its object type, via +/// `--batch-check='%(objectname) %(objecttype)'`. The pin path's fail-closed +/// filter needs to tell blobs (content, withholdable) from commits/trees +/// (structural, never withheld) without typing the candidate list itself. +pub fn list_all_objects_with_type(repo_path: &Path) -> Result> { + let output = Command::new("git") + .args([ + "cat-file", + "--batch-all-objects", + "--batch-check=%(objectname) %(objecttype)", + ]) + .current_dir(repo_path) + .output() + .context("failed to run git cat-file")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + bail!("git cat-file failed: {stderr}"); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + Ok(stdout + .lines() + .filter_map(|l| { + let mut it = l.split_whitespace(); + let oid = it.next()?; + let ty = it.next()?; + Some((oid.to_string(), ty.to_string())) + }) + .collect()) +} + +/// Every blob OID in the repo, including unreachable/dangling ones. The +/// fail-closed pin filter drops any candidate blob absent from the reachable, +/// visibility-allowed set; a dangling private blob is in this set but not the +/// allowed set, so it never replicates (#99). +pub fn all_blob_oids(repo_path: &Path) -> Result> { + Ok(list_all_objects_with_type(repo_path)? + .into_iter() + .filter(|(_, ty)| ty == "blob") + .map(|(oid, _)| oid) + .collect()) +} + +/// The pin candidate OIDs for a push plus whether they came from a full-repo +/// scan. `full_scan` is true when the delta could not be used and the whole +/// object DB (including dangling objects) was enumerated — the caller must then +/// apply the fail-closed blob filter, because the reachable-only withheld set +/// cannot classify dangling blobs (#99). +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PinCandidateSet { + pub candidates: Vec, + pub full_scan: bool, +} + /// Resolve the pin candidate OID set for a push, off the async worker. /// /// Runs [`resolve_push_delta`] in `spawn_blocking` (git subprocess) and applies @@ -202,28 +265,29 @@ pub fn list_all_objects(repo_path: &Path) -> Result> { /// failed full scan, and a panicked blocking task each emit a warning. On a /// failed full scan or a task panic the candidate set is empty (pin nothing /// this push); that is a durability gap the reconciliation sweep backstops, and -/// it can never leak because the withheld filter still runs on whatever set is -/// returned. +/// it can never leak because the withheld/fail-closed filter still runs on +/// whatever set is returned. `full_scan` rides on the returned set so the caller +/// knows when the dangling-inclusive filter is required. pub async fn resolve_candidates_for_push( repo_path: PathBuf, new_tips: Vec, old_tips: Vec, -) -> Vec { +) -> PinCandidateSet { tokio::task::spawn_blocking(move || { let new_refs: Vec<&str> = new_tips.iter().map(String::as_str).collect(); let old_refs: Vec<&str> = old_tips.iter().map(String::as_str).collect(); match resolve_push_delta(&repo_path, &new_refs, &old_refs) { PinCandidates::Delta(objs) => { tracing::info!(delta = objs.len(), repo = %repo_path.display(), "pin candidate set from push delta"); - objs + PinCandidateSet { candidates: objs, full_scan: false } } PinCandidates::FullScanRequired => { tracing::warn!(repo = %repo_path.display(), "pin delta unavailable (non-commit tip, git error, or force-full-scan) — full-scan fallback"); match list_all_objects(&repo_path) { - Ok(objs) => objs, + Ok(objs) => PinCandidateSet { candidates: objs, full_scan: true }, Err(e) => { tracing::warn!(repo = %repo_path.display(), err = %e, "full-scan fallback failed; pinning nothing this push (reconciliation sweep backstops)"); - Vec::new() + PinCandidateSet { candidates: Vec::new(), full_scan: false } } } } @@ -232,7 +296,7 @@ pub async fn resolve_candidates_for_push( .await .unwrap_or_else(|e| { tracing::warn!(err = %e, "pin candidate computation task panicked; pinning nothing this push (reconciliation sweep backstops)"); - Vec::new() + PinCandidateSet { candidates: Vec::new(), full_scan: false } }) } @@ -452,11 +516,11 @@ mod tests { let repo = Repo::new(); let c1 = repo.commit_file("a.txt", "one\n"); let c2 = repo.commit_file("b.txt", "two\n"); - let got: HashSet = + let set = resolve_candidates_for_push(repo.path.clone(), vec![c2.clone()], vec![c1.clone()]) - .await - .into_iter() - .collect(); + .await; + assert!(!set.full_scan, "happy-path delta is not a full scan"); + let got: HashSet = set.candidates.into_iter().collect(); let new_blob = repo.rev("HEAD:b.txt"); assert!( got.contains(&c2) && got.contains(&new_blob), @@ -473,11 +537,9 @@ mod tests { repo.commit_file("a.txt", "one\n"); let blob = repo.rev("HEAD:a.txt"); let all: HashSet = list_all_objects(&repo.path).unwrap().into_iter().collect(); - let got: HashSet = - resolve_candidates_for_push(repo.path.clone(), vec![blob], vec![]) - .await - .into_iter() - .collect(); + let set = resolve_candidates_for_push(repo.path.clone(), vec![blob], vec![]).await; + assert!(set.full_scan, "non-commit tip is signalled as a full scan"); + let got: HashSet = set.candidates.into_iter().collect(); assert_eq!(got, all, "non-commit tip falls back to full repo scan"); } @@ -566,4 +628,36 @@ mod tests { // 2 commits + 2 trees + 2 blobs = 6 objects. assert_eq!(all.len(), 6, "got: {all:?}"); } + + #[test] + fn all_blob_oids_includes_dangling_and_excludes_nonblobs() { + // The all-blob universe must include a dangling blob (the #99 hazard) and + // exclude commits/trees, so the fail-closed filter can tell a blob from a + // structural object without typing the candidate list. + let repo = Repo::new(); + let c1 = repo.commit_file("a.txt", "one\n"); + let reachable_blob = repo.rev("HEAD:a.txt"); + let tree = repo.rev("HEAD^{tree}"); + // Write a blob referenced by no tree/commit — dangling. + std::fs::write(repo.path.join("orphan.bin"), b"dangling\n").unwrap(); + let dangling = repo.git(&["hash-object", "-w", "orphan.bin"]); + + let blobs = all_blob_oids(&repo.path).unwrap(); + assert!(blobs.contains(&reachable_blob), "reachable blob present"); + assert!( + blobs.contains(&dangling), + "dangling blob present in the all-blob universe" + ); + assert!(!blobs.contains(&c1), "commit is not a blob"); + assert!(!blobs.contains(&tree), "tree is not a blob"); + + // The typed lister tags each object; spot-check the dangling blob's type. + let typed = list_all_objects_with_type(&repo.path).unwrap(); + assert!( + typed + .iter() + .any(|(oid, ty)| oid == &dangling && ty == "blob"), + "dangling object is typed as a blob" + ); + } } diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index 9c4d804..578ee40 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -298,6 +298,47 @@ pub fn replicable_objects(all: Vec, withheld: &HashSet) -> Vec Result> { + let pairs = blob_paths(repo_path)?; + let mut allowed = HashSet::new(); + for (oid, path) in &pairs { + if visibility_check(rules, is_public, owner_did, None, path) == Decision::Allow { + allowed.insert(oid.clone()); + } + } + Ok(allowed) +} + +/// Objects safe to replicate, failing closed on blobs (#99). A candidate +/// replicates iff it is NOT a blob (`all_blob_oids` — commits and trees are +/// structural, never content-withheld) OR it is in `allowed_blobs` (reachable +/// and visibility-allowed). This drops both withheld reachable blobs and +/// dangling/unreachable blobs the reachable walk never classified, without +/// tagging the candidate list with per-object types. Used on the full-scan pin +/// path, where the candidate set can contain dangling objects the reachable-only +/// withheld set cannot cover; the delta path keeps `replicable_objects`. +pub fn replicable_objects_fail_closed( + candidates: Vec, + allowed_blobs: &HashSet, + all_blob_oids: &HashSet, +) -> Vec { + candidates + .into_iter() + .filter(|oid| !all_blob_oids.contains(oid) || allowed_blobs.contains(oid)) + .collect() +} + /// For every blob withheld from anonymous, the DIDs allowed to read it: the /// owner plus any reader DID that `visibility_check` Allows at some path the /// blob appears at. Least-privilege: a reader of one private subtree is not a @@ -596,6 +637,112 @@ mod tests { assert_eq!(got, all); } + #[test] + fn fail_closed_keeps_nonblobs_and_allowed_blobs_only() { + // Non-blob objects (commit/tree) always pass; a blob passes only if it + // is in the allowed set. A withheld blob and a dangling blob (both in + // all_blob_oids, neither in allowed) are dropped. + let allowed: HashSet = ["b_pub".to_string()].into_iter().collect(); + let all_blobs: HashSet = ["b_pub", "b_secret", "b_dangling"] + .into_iter() + .map(String::from) + .collect(); + let candidates = vec![ + "commit1".to_string(), + "tree1".to_string(), + "b_pub".to_string(), + "b_secret".to_string(), + "b_dangling".to_string(), + ]; + let got = replicable_objects_fail_closed(candidates, &allowed, &all_blobs); + assert_eq!( + got, + vec![ + "commit1".to_string(), + "tree1".to_string(), + "b_pub".to_string() + ] + ); + } + + #[test] + fn fail_closed_drops_dangling_private_blob() { + // #99: a private blob orphaned by a force-push/amend is unreachable but + // still present in the object DB. The full-scan candidate set includes + // it; the reachable-only allowed walk never classifies it. The + // fail-closed filter must drop it — it is a blob not in the allowed set. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"public bytes\n").unwrap(); + let run = |args: &[&str]| { + assert!( + Command::new("git") + .args(args) + .current_dir(&work) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"]); + run(&["config", "user.email", "t@t"]); + run(&["config", "user.name", "t"]); + run(&["add", "."]); + run(&["commit", "-qm", "init"]); + let oid_of = |rev: &str| { + let out = Command::new("git") + .args(["rev-parse", rev]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let public_oid = oid_of("HEAD:public/a.txt"); + + // Write a blob straight into the object DB, referenced by no tree or + // commit — exactly the dangling state #99 is about. + std::fs::write(work.join("orphan.bin"), b"DANGLING SECRET\n").unwrap(); + let dangling_oid = { + let out = Command::new("git") + .args(["hash-object", "-w", "orphan.bin"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + + let all_blobs = crate::git::push_delta::all_blob_oids(&work).unwrap(); + assert!( + all_blobs.contains(&dangling_oid), + "precondition: the dangling blob is in the all-objects universe" + ); + + let rules: Vec = vec![]; + let allowed = replicable_blob_set(&work, &rules, true, OWNER).unwrap(); + assert!( + !allowed.contains(&dangling_oid), + "dangling blob is unreachable, so never in the allowed set" + ); + assert!( + allowed.contains(&public_oid), + "reachable public blob is in the allowed set" + ); + + // Full-scan candidate set includes the dangling blob; fail-closed drops it. + let candidates = vec![dangling_oid.clone(), public_oid.clone()]; + let replicable = replicable_objects_fail_closed(candidates, &allowed, &all_blobs); + assert!( + !replicable.contains(&dangling_oid), + "#99: a dangling private blob must not replicate" + ); + assert!( + replicable.contains(&public_oid), + "the public blob still replicates" + ); + } + #[test] fn recipients_are_owner_plus_allowed_readers_only() { let (_td, repo, secret_oid, public_oid) = fixture(); @@ -836,6 +983,81 @@ mod tests { ); } + #[test] + fn withholds_secret_blob_across_nfc_nfd_normalization_skew() { + // #101: the secret lives under a directory whose name is committed in NFD + // ("se" + combining acute U+0301), while the deny rule is authored in NFC + // ("é" = U+00E9). The variant byte sits INSIDE the rule-covered directory + // name, so a byte-exact matcher under-withholds and leaks the blob on the + // replication path. NFC normalization at the matcher seam closes it. (The + // sibling café.txt test does not exercise this: there the rule prefix + // "/secret" is pure ASCII and byte-identical regardless of how é is encoded + // in the filename, so it passes for the wrong reason.) + let nfd_dir = "se\u{0301}cret"; // decomposed + let nfc_rule = "/s\u{00e9}cret/**"; // composed + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(work.join(nfd_dir)).unwrap(); + std::fs::write(work.join("public.txt"), b"public\n").unwrap(); + std::fs::write(work.join(nfd_dir).join("key.pem"), b"TOP SECRET\n").unwrap(); + let run = |args: &[&str], dir: &Path| { + assert!( + Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["config", "core.precomposeunicode", "false"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let secret_oid = oid(&format!("{nfd_dir}/key.pem")); + let public_oid = oid("public.txt"); + // Guard against a vacuous pass: the NFD-named blob must actually exist. + // Accept SHA-1 (40) or SHA-256 (64) object ids so the test is + // hash-format agnostic, matching the fixture guard later in this file. + assert!( + matches!(secret_oid.len(), 40 | 64), + "secret blob was not stored under the NFD path" + ); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let rules = [rule(nfc_rule, &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "NFC-authored deny rule must withhold the secret blob under the NFD-named directory" + ); + assert!( + !withheld.contains(&public_oid), + "public blob must NOT be withheld" + ); + } + // TAB/newline are legal filename bytes on unix but rejected by the Windows // filesystem, so building the fixture only makes sense (and only compiles the // OsStr handling) under cfg(unix), matching fails_closed_on_non_utf8_path. diff --git a/crates/gitlawb-node/src/graphql/query.rs b/crates/gitlawb-node/src/graphql/query.rs index c7673f6..2caeefb 100644 --- a/crates/gitlawb-node/src/graphql/query.rs +++ b/crates/gitlawb-node/src/graphql/query.rs @@ -15,8 +15,26 @@ impl QueryRoot { .list_all_repos_deduped() .await .map_err(|e| async_graphql::Error::new(e.to_string()))?; + + // Apply the same "/" visibility gate the REST/per-repo endpoints use so + // this surface does not enumerate private repos (#97). The caller DID is + // threaded onto the context by optional_signature; absent = anonymous. + let caller = ctx + .data::() + .ok() + .map(|d| d.0.as_str()); + let ids: Vec = repos.iter().map(|r| r.id.clone()).collect(); + let rules_by_repo = db + .list_visibility_rules_for_repos(&ids) + .await + .map_err(|e| async_graphql::Error::new(e.to_string()))?; + Ok(repos .into_iter() + .filter(|r| { + let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); + crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, caller) + }) .map(|r| RepoType { name: r.name, owner_did: r.owner_did, diff --git a/crates/gitlawb-node/src/ipfs_pin.rs b/crates/gitlawb-node/src/ipfs_pin.rs index ac1fde0..3b34619 100644 --- a/crates/gitlawb-node/src/ipfs_pin.rs +++ b/crates/gitlawb-node/src/ipfs_pin.rs @@ -7,8 +7,6 @@ //! If `ipfs_api` is empty the functions are no-ops, so the node works fine //! without a local IPFS daemon. -use std::collections::HashSet; - use anyhow::Result; use gitlawb_core::cid::Cid; @@ -88,9 +86,10 @@ pub async fn cat(ipfs_api: &str, cid: &str) -> Result> { /// Pin any of the given candidate git objects that are not yet recorded in /// `pinned_cids`. /// -/// `candidates` is the OID set to consider — the per-push delta on the push -/// path, or the whole-repo list on the reconciliation sweep / full-scan -/// fallback (see `git::push_delta`). `repo_path` is still needed to read each +/// `object_list` is the already-withheld-filtered OID set to pin: the caller +/// applies `visibility_pack::replicable_objects` on the delta path or the +/// `..._fail_closed` filter on the full-scan path before calling, so this +/// function never sees a withheld blob. `repo_path` is still needed to read each /// object's bytes. The twin in `pinata.rs` mirrors this shape — change both in /// lockstep. /// @@ -98,16 +97,13 @@ pub async fn cat(ipfs_api: &str, cid: &str) -> Result> { pub async fn pin_new_objects( ipfs_api: &str, repo_path: &std::path::Path, - candidates: Vec, + object_list: Vec, db: &crate::db::Db, - withheld: &HashSet, ) -> Vec<(String, String)> { if ipfs_api.is_empty() { return vec![]; } - let object_list = crate::git::visibility_pack::replicable_objects(candidates, withheld); - let mut pinned = Vec::new(); for sha in object_list { diff --git a/crates/gitlawb-node/src/pinata.rs b/crates/gitlawb-node/src/pinata.rs index cbd8885..6c9c0bf 100644 --- a/crates/gitlawb-node/src/pinata.rs +++ b/crates/gitlawb-node/src/pinata.rs @@ -7,7 +7,6 @@ //! no-op, so nodes without Pinata backing work fine. use anyhow::Result; -use std::collections::HashSet; /// Pin a single git object's raw bytes on Pinata (v3 API). /// @@ -70,27 +69,25 @@ pub async fn pin_object( /// Pin any of the given candidate git objects that haven't yet been sent to /// Pinata. /// -/// `candidates` is the OID set to consider — the per-push delta on the push -/// path, or the whole-repo list on the reconciliation sweep / full-scan -/// fallback (see `git::push_delta`). `repo_path` is still needed to read each -/// object's bytes. The twin in `ipfs_pin.rs` mirrors this shape — change both -/// in lockstep. Objects already recorded with a `pinata_cid` are skipped. -/// Returns `(sha_hex, cid)` pairs for each newly pinned object. +/// `object_list` is the already-withheld-filtered OID set to pin: the caller +/// applies `visibility_pack::replicable_objects` on the delta path or the +/// `..._fail_closed` filter on the full-scan path before calling. `repo_path` is +/// still needed to read each object's bytes. The twin in `ipfs_pin.rs` mirrors +/// this shape — change both in lockstep. Objects already recorded with a +/// `pinata_cid` are skipped. Returns `(sha_hex, cid)` pairs for each newly +/// pinned object. pub async fn pin_new_objects( client: &reqwest::Client, upload_url: &str, jwt: &str, repo_path: &std::path::Path, - candidates: Vec, + object_list: Vec, db: &crate::db::Db, - withheld: &HashSet, ) -> Vec<(String, String)> { if jwt.is_empty() { return vec![]; } - let object_list = crate::git::visibility_pack::replicable_objects(candidates, withheld); - let mut pinned = Vec::new(); for sha in object_list { diff --git a/crates/gitlawb-node/src/server.rs b/crates/gitlawb-node/src/server.rs index 122b71f..11a4724 100644 --- a/crates/gitlawb-node/src/server.rs +++ b/crates/gitlawb-node/src/server.rs @@ -451,8 +451,31 @@ async fn node_info(State(state): State) -> Json { })) } -async fn stats(State(state): State) -> Json { - let repos = state.db.count_repos_deduped().await.unwrap_or(0); +pub(crate) async fn stats(State(state): State) -> Json { + // Count only the repos an anonymous caller could list, so the aggregate does + // not leak the existence of private/mode-A repos (#104 count oracle). Mirror + // the listing seam (api/repos.rs): over-fetch the deduped set, batch-load the + // visibility rules, and keep rows that pass listable_at_root. The caller is + // always None — meta_routes carries no auth layer (see the route group in this + // file). Fail closed: any DB error collapses the whole count to 0 (an + // under-count never leaks existence), preserving the prior `.unwrap_or(0)`. + let repos = async { + // stats only needs the count, so use the no-stars deduped list (same + // DEDUP_CTE) and skip the repo_stars aggregation the listing path needs. + let rows = state.db.list_all_repos_deduped().await?; + let ids: Vec = rows.iter().map(|r| r.id.clone()).collect(); + let rules_by_repo = state.db.list_visibility_rules_for_repos(&ids).await?; + let count = rows + .iter() + .filter(|r| { + let rules = rules_by_repo.get(&r.id).map(Vec::as_slice).unwrap_or(&[]); + crate::visibility::listable_at_root(rules, r.is_public, &r.owner_did, None) + }) + .count() as i64; + Ok::(count) + } + .await + .unwrap_or(0); let agents = state.db.count_agents().await.unwrap_or(0); let pushes = state.db.count_pushes().await.unwrap_or(0); Json(json!({ diff --git a/crates/gitlawb-node/src/test_support.rs b/crates/gitlawb-node/src/test_support.rs index 154db14..4313c23 100644 --- a/crates/gitlawb-node/src/test_support.rs +++ b/crates/gitlawb-node/src/test_support.rs @@ -607,4 +607,705 @@ mod tests { "stats must count logical repos (mirror+canonical collapsed)" ); } + + // ── #97: repo-listing surfaces are visibility-gated ────────────────────── + + fn seed_private_repo(owner_did: &str, name: &str) -> RepoRecord { + RepoRecord { + is_public: false, + ..seed_repo(owner_did, name) + } + } + + fn anon_get(uri: &str) -> Request { + Request::builder() + .method(Method::GET) + .uri(uri) + .body(Body::empty()) + .expect("request builder") + } + + async fn json_body(resp: axum::response::Response) -> serde_json::Value { + let bytes = axum::body::to_bytes(resp.into_body(), usize::MAX) + .await + .expect("body bytes"); + serde_json::from_slice(&bytes).expect("json body") + } + + fn names_in(v: &serde_json::Value) -> Vec { + v.as_array() + .expect("array body") + .iter() + .filter_map(|r| r["name"].as_str().map(str::to_string)) + .collect() + } + + fn list_repos_router(state: AppState) -> Router { + Router::new() + .route( + "/api/v1/repos", + axum::routing::get(crate::api::repos::list_repos), + ) + .with_state(state) + } + + #[sqlx::test] + async fn list_repos_hides_private_repo_and_count_from_anonymous(pool: PgPool) { + let owner = "did:key:zLISTOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"pub-repo".to_string()), + "public repo listed" + ); + assert!( + !names.contains(&"priv-repo".to_string()), + "private repo must not be enumerable anonymously (#97)" + ); + assert_eq!( + total.as_deref(), + Some("1"), + "X-Total-Count must not leak the private repo's existence" + ); + } + + #[sqlx::test] + async fn list_repos_shows_owner_their_private_repo(pool: PgPool) { + let owner = "did:key:zLISTOWNER2BBBBBBBBBBBBBBBBBBBBBBBBBBBBB"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(signed_request_as( + owner, + Method::GET, + "/api/v1/repos", + Body::empty(), + )) + .await + .unwrap(); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"priv-repo".to_string()) && names.contains(&"pub-repo".to_string()), + "owner sees their own private repo, got {names:?}" + ); + } + + #[sqlx::test] + async fn list_repos_shows_private_repo_to_authorized_root_reader(pool: PgPool) { + // Proves the gate is visibility_check, not a bare is_public filter: an + // is_public=false repo with a root rule granting a reader DID is listable + // to that reader (and not to a stranger). + let owner = "did:key:zLISTOWNER3CCCCCCCCCCCCCCCCCCCCCCCCCCCCC"; + let reader = "did:key:zLISTREADERDDDDDDDDDDDDDDDDDDDDDDDDDDDDD"; + let stranger = "did:key:zLISTSTRANGEREEEEEEEEEEEEEEEEEEEEEEEEEE"; + let state = test_state(pool).await; + let rec = seed_private_repo(owner, "priv-repo"); + state.db.create_repo(&rec).await.expect("seed private"); + state + .db + .set_visibility_rule( + &rec.id, + "/", + crate::db::VisibilityMode::A, + &[reader.to_string()], + owner, + ) + .await + .expect("grant root reader"); + + let names_for = |did: &'static str, st: AppState| async move { + let resp = list_repos_router(st) + .oneshot(signed_request_as( + did, + Method::GET, + "/api/v1/repos", + Body::empty(), + )) + .await + .unwrap(); + names_in(&json_body(resp).await) + }; + + assert!( + names_for(reader, state.clone()) + .await + .contains(&"priv-repo".to_string()), + "authorized root reader must see the private repo" + ); + assert!( + !names_for(stranger, state) + .await + .contains(&"priv-repo".to_string()), + "an unlisted stranger must not see it" + ); + } + + #[sqlx::test] + async fn list_federated_repos_hides_private_from_anonymous(pool: PgPool) { + let owner = "did:key:zFEDOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let router = Router::new() + .route( + "/api/v1/repos/federated", + axum::routing::get(crate::api::repos::list_federated_repos), + ) + .with_state(state); + let resp = router + .oneshot(anon_get("/api/v1/repos/federated")) + .await + .unwrap(); + let body = json_body(resp).await; + let names = names_in(&body["repos"]); + assert_eq!( + body["count"].as_u64(), + Some(1), + "federated count must reflect only the visible repos, not the pre-filter total (#97)" + ); + assert!( + names.contains(&"pub-repo".to_string()), + "public repo federated" + ); + assert!( + !names.contains(&"priv-repo".to_string()), + "private repo must not be federated to anonymous callers (#97)" + ); + } + + #[sqlx::test] + async fn graphql_repos_hides_private_from_anonymous(pool: PgPool) { + // The GraphQL repos query is the third listing surface; an anonymous + // query must not enumerate a private repo (#97). + let owner = "did:key:zGQLOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = state + .graphql_schema + .execute(async_graphql::Request::new("{ repos { name } }")) + .await; + assert!(resp.errors.is_empty(), "graphql errors: {:?}", resp.errors); + let names = names_in(&resp.data.into_json().expect("graphql json")["repos"]); + assert!( + names.contains(&"pub-repo".to_string()), + "public repo listed" + ); + assert!( + !names.contains(&"priv-repo".to_string()), + "private repo must not be enumerable via anonymous GraphQL (#97)" + ); + } + + #[sqlx::test] + async fn graphql_repos_shows_authorized_caller_their_private_repo(pool: PgPool) { + // Positive path: the resolver pulls the caller DID from GraphQL request + // data, so the authenticated context must still surface a private repo its + // owner may read. Guards an auth-context regression on the GraphQL surface + // that the anonymous-only test would miss (#97). + let owner = "did:key:zGQLAUTHOWNERAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = state + .graphql_schema + .execute( + async_graphql::Request::new("{ repos { name } }") + .data(AuthenticatedDid(owner.to_string())), + ) + .await; + assert!(resp.errors.is_empty(), "graphql errors: {:?}", resp.errors); + let names = names_in(&resp.data.into_json().expect("graphql json")["repos"]); + assert!( + names.contains(&"priv-repo".to_string()), + "owner must see their own private repo via authenticated GraphQL (#97)" + ); + } + + #[sqlx::test] + async fn list_repos_paged_count_excludes_private(pool: PgPool) { + // The paged path (limit set) is the KTD2 exploit shape: a pre-cut page + + // SQL total would leak the private-repo count. Assert X-Total-Count is the + // visible count and the page is not short (#97). + let owner = "did:key:zPAGEOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-a")) + .await + .expect("seed public a"); + state + .db + .create_repo(&seed_repo(owner, "pub-b")) + .await + .expect("seed public b"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos?limit=10")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert_eq!( + total.as_deref(), + Some("2"), + "paged X-Total-Count must reflect only the 2 visible repos, not leak the private count" + ); + assert_eq!( + names.len(), + 2, + "page must not be short: both public repos present" + ); + assert!(!names.contains(&"priv-repo".to_string())); + } + + #[sqlx::test] + async fn list_repos_hides_public_repo_under_root_deny(pool: PgPool) { + // Proves the gate is visibility_check, not a bare is_public filter, in the + // negative direction: an is_public=true repo with a root deny rule (mode B, + // no readers) is NOT listable to anonymous, while a plain public repo is. + let owner = "did:key:zDENYOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "open-repo")) + .await + .expect("seed open"); + let denied = seed_repo(owner, "deny-repo"); // is_public = true + state.db.create_repo(&denied).await.expect("seed denied"); + state + .db + .set_visibility_rule(&denied.id, "/", crate::db::VisibilityMode::B, &[], owner) + .await + .expect("root deny rule"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos")) + .await + .unwrap(); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"open-repo".to_string()), + "plain public repo listed" + ); + assert!( + !names.contains(&"deny-repo".to_string()), + "is_public=true repo with a root deny must NOT be listed (proves visibility_check, not is_public)" + ); + } + + #[sqlx::test] + async fn list_repos_owner_filter_excludes_private_from_anonymous(pool: PgPool) { + // The owner-filtered path (?owner=, SQL $1 bind) must still apply the Rust + // "/" visibility gate: an anonymous caller filtering by an owner sees that + // owner's public repos but never their private ones, and the count does + // not leak (#97). This is a distinct SQL branch from the unfiltered path. + let short = "zOWNERFILTERAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let owner = format!("did:key:{short}"); + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(&owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(&owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(anon_get(&format!("/api/v1/repos?owner={short}&limit=10"))) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"pub-repo".to_string()), + "owner's public repo listed" + ); + assert!( + !names.contains(&"priv-repo".to_string()), + "owner's private repo hidden from anonymous even when owner-filtered (#97)" + ); + assert_eq!( + total.as_deref(), + Some("1"), + "owner-filtered X-Total-Count must exclude the private repo" + ); + } + + #[sqlx::test] + async fn list_repos_owner_filter_full_did_matches_bare_mirror(pool: PgPool) { + // A mirror-only repo (known via gossip, no local canonical row) stores the + // bare owner key `z...`. Filtering by the full `did:key:z...` form must + // still return it, matching crate::api::did_matches — the behavior the + // no-limit `gl repo list --owner` path relied on before #97 routed owner + // filtering through SQL (jatmn P2 on #111). Both owner forms must match. + let short = "zMIRRORONLYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .upsert_mirror_repo(short, "mirror-repo", "/tmp/mirror", None) + .await + .expect("seed mirror-only row"); + + // full did:key: form must match the bare-owner mirror row + let resp = list_repos_router(state.clone()) + .oneshot(anon_get(&format!("/api/v1/repos?owner=did:key:{short}"))) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"mirror-repo".to_string()), + "full did:key: owner filter must match a bare-owner mirror row (jatmn #111)" + ); + + // short bare form must still match + let resp = list_repos_router(state) + .oneshot(anon_get(&format!("/api/v1/repos?owner={short}"))) + .await + .unwrap(); + let names = names_in(&json_body(resp).await); + assert!( + names.contains(&"mirror-repo".to_string()), + "short-form owner filter must still match the mirror row" + ); + } + + #[sqlx::test] + async fn list_repos_pagination_offset_past_end_keeps_total(pool: PgPool) { + // Pagination edge: an offset past the visible set returns an empty page, + // but X-Total-Count still reflects the full visible count -- so paging can + // neither short the page nor leak a different total (#97). Guards against a + // refactor that derives the total from the cut page instead of the set. + let owner = "did:key:zOFFSETOWNERAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-a")) + .await + .expect("seed public a"); + state + .db + .create_repo(&seed_repo(owner, "pub-b")) + .await + .expect("seed public b"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos?limit=5&offset=100")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert!(names.is_empty(), "offset past the end yields an empty page"); + assert_eq!( + total.as_deref(), + Some("2"), + "X-Total-Count stays the full visible total regardless of offset" + ); + } + + #[sqlx::test] + async fn list_repos_hides_canonical_under_root_deny_even_with_mirror(pool: PgPool) { + // Regression guard for the dedup-survivor + visibility-rule seam. A logical + // repo present as BOTH a canonical row (carrying a root-deny rule) and a + // gossip mirror row: the DEDUP_CTE must pick the canonical survivor so the + // batch rule lookup (keyed by the survivor's id) finds the deny and + // withholds it. If dedup ever picked the mirror (slash-form id, no rule), + // the gate would fall back to is_public=true and leak the repo. is_public + // is true here, so the rule is the only thing hiding it. + let short = "zMIRRORDENYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let owner = format!("did:key:{short}"); + let state = test_state(pool).await; + let canonical = seed_repo(&owner, "secret"); // is_public = true + state + .db + .create_repo(&canonical) + .await + .expect("seed canonical"); + state + .db + .set_visibility_rule( + &canonical.id, + "/", + crate::db::VisibilityMode::B, + &[], + &owner, + ) + .await + .expect("root deny rule on canonical"); + state + .db + .upsert_mirror_repo(short, "secret", "/tmp/mirror", None) + .await + .expect("seed mirror"); + state + .db + .create_repo(&seed_repo(&owner, "open")) + .await + .expect("seed public sibling"); + + let resp = list_repos_router(state) + .oneshot(anon_get("/api/v1/repos")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let total = resp + .headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .map(str::to_string); + let names = names_in(&json_body(resp).await); + assert!(names.contains(&"open".to_string()), "public sibling listed"); + assert!( + !names.contains(&"secret".to_string()), + "canonical repo with a root deny must stay hidden even when a mirror row exists (#97 dedup-survivor/rule seam)" + ); + assert_eq!( + total.as_deref(), + Some("1"), + "X-Total-Count counts only the visible sibling, not the mirror+canonical pair" + ); + } + + // ── /api/v1/stats count oracle (#104) ────────────────────────────────── + // The stats endpoint lives in meta_routes (no auth layer), so the caller is + // always anonymous (None). Its `repos` count must withhold private/mode-A + // repos exactly as the listing surfaces do, or it is a count oracle. + + fn stats_router(state: AppState) -> Router { + Router::new() + .route("/api/v1/stats", axum::routing::get(crate::server::stats)) + .with_state(state) + } + + async fn stats_repos_count(state: AppState) -> i64 { + let resp = stats_router(state) + .oneshot(anon_get("/api/v1/stats")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + json_body(resp).await["repos"] + .as_i64() + .expect("stats.repos is an integer") + } + + #[sqlx::test] + async fn stats_repos_count_excludes_bare_private(pool: PgPool) { + // No-rule branch: an is_public=false repo with no visibility rule is + // denied to anonymous, so stats.repos counts only the public repo. + let owner = "did:key:zSTATSPRIVAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + assert_eq!( + stats_repos_count(state).await, + 1, + "stats.repos must not count the private repo (#104 count oracle)" + ); + } + + #[sqlx::test] + async fn stats_repos_count_excludes_hide_existence_repo(pool: PgPool) { + // Some(rule) branch — the #104 subject. Both repos are is_public=true, so + // the only reason the second is withheld is its root rule with empty + // reader_dids (anonymous excluded). Proves the count goes through + // listable_at_root, not a bare is_public predicate. + let owner = "did:key:zSTATSHIDEAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "open-repo")) + .await + .expect("seed open"); + let hidden = seed_repo(owner, "hidden-repo"); // is_public = true + state.db.create_repo(&hidden).await.expect("seed hidden"); + state + .db + .set_visibility_rule(&hidden.id, "/", crate::db::VisibilityMode::A, &[], owner) + .await + .expect("root hide-existence rule"); + + assert_eq!( + stats_repos_count(state).await, + 1, + "stats.repos must not count a hide-existence (mode-A, empty readers) repo (#104)" + ); + } + + #[sqlx::test] + async fn stats_repos_count_excludes_public_under_root_deny(pool: PgPool) { + // Inverse the seam was built for: an is_public=true repo with a root deny + // (mode B, no readers) must not be counted — is_public alone would count it. + let owner = "did:key:zSTATSDENYAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "open-repo")) + .await + .expect("seed open"); + let denied = seed_repo(owner, "deny-repo"); // is_public = true + state.db.create_repo(&denied).await.expect("seed denied"); + state + .db + .set_visibility_rule(&denied.id, "/", crate::db::VisibilityMode::B, &[], owner) + .await + .expect("root deny rule"); + + assert_eq!( + stats_repos_count(state).await, + 1, + "stats.repos must not count an is_public=true repo under a root deny (#104)" + ); + } + + #[sqlx::test] + async fn stats_repos_count_matches_list_total(pool: PgPool) { + // R2 parity: stats.repos == anonymous GET /api/v1/repos X-Total-Count. + let owner = "did:key:zSTATSPARITYAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let state = test_state(pool).await; + state + .db + .create_repo(&seed_repo(owner, "pub-repo")) + .await + .expect("seed public"); + state + .db + .create_repo(&seed_private_repo(owner, "priv-repo")) + .await + .expect("seed private"); + + let list_total = { + let resp = list_repos_router(state.clone()) + .oneshot(anon_get("/api/v1/repos")) + .await + .unwrap(); + resp.headers() + .get("X-Total-Count") + .and_then(|h| h.to_str().ok()) + .and_then(|s| s.parse::().ok()) + .expect("X-Total-Count header") + }; + + assert_eq!( + stats_repos_count(state).await, + list_total, + "stats.repos must equal the anonymous list X-Total-Count (R2 parity)" + ); + assert_eq!(list_total, 1, "sanity: only the public repo is visible"); + } + + #[sqlx::test] + async fn stats_preserves_sibling_fields(pool: PgPool) { + // R4: the rewrite must not drop agents/pushes/version. + let state = test_state(pool).await; + let resp = stats_router(state) + .oneshot(anon_get("/api/v1/stats")) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let body = json_body(resp).await; + for key in ["repos", "agents", "pushes", "version"] { + assert!(body.get(key).is_some(), "stats must still carry `{key}`"); + } + } + + #[sqlx::test] + async fn stats_repos_count_empty_db_is_zero(pool: PgPool) { + let state = test_state(pool).await; + assert_eq!( + stats_repos_count(state).await, + 0, + "empty DB yields repos == 0 without error" + ); + } } diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index afe6a7c..335d261 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -5,6 +5,7 @@ //! `is_public` flag. It performs no I/O so it is exhaustively unit tested. use crate::db::VisibilityRule; +use unicode_normalization::UnicodeNormalization; #[derive(Debug, PartialEq, Eq)] pub enum Decision { @@ -12,6 +13,17 @@ pub enum Decision { Deny, } +/// NFC-normalize a glob prefix or path so the matcher compares canonically +/// equivalent strings byte-for-byte. Without this, a deny rule authored NFC +/// (`é` = U+00E9) byte-compares unequal to a path committed NFD (`e` + U+0301) +/// and the blob slips past the rule on the replication path (#101). NFC, not +/// NFKC: compatibility folding (ligatures, full-width forms) would merge paths +/// the filesystem treats as distinct and over-withhold. Both sides of every +/// comparison must pass through this, or the skew just moves. +fn nfc(s: &str) -> String { + s.nfc().collect() +} + /// True if `caller` is the repo owner (matches full did:key or its short form), /// mirroring the owner-match idiom in `api/protect.rs`. fn is_owner(owner_did: &str, caller: &str) -> bool { @@ -36,12 +48,17 @@ fn glob_matches(glob: &str, path: &str) -> bool { if prefix == "/" { return true; } + // Compare in NFC so an NFC rule matches a canonically-equivalent NFD path + // (and vice versa). Both sides normalized here — the single matcher seam. + let prefix = nfc(prefix); + let path = nfc(path); path == prefix || path.starts_with(&format!("{prefix}/")) } -/// Specificity = length of the match prefix; longer is more specific. +/// Specificity = length of the (normalized) match prefix; longer is more +/// specific. Normalized so ranking stays consistent with `glob_matches`. fn specificity(glob: &str) -> usize { - glob_prefix(glob).len() + nfc(glob_prefix(glob)).len() } /// Decide whether `caller` (None = anonymous) may read `path` in a repo. @@ -96,6 +113,20 @@ pub fn visibility_check( } } +/// Whether `caller` (None = anonymous) may see a repo in a listing — the `"/"` +/// visibility decision, shared by every repo-listing surface (REST list, +/// federated list, GraphQL `repos`) so they enforce one rule, not three drifting +/// copies. Not a bare `is_public` test: a repo can be `is_public=false` with a +/// root rule granting readers, or `is_public=true` with a root deny (#97). +pub fn listable_at_root( + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> bool { + visibility_check(rules, is_public, owner_did, caller, "/") == Decision::Allow +} + /// The subtree path globs that `caller` (None = anonymous) may NOT read, given /// the repo's rules. Whole-repo ("/") rules are excluded: a denied whole-repo /// read is handled by the 404 gate before a clone ever starts. Each remaining @@ -371,4 +402,38 @@ mod tests { true )); } + + // #101: a deny rule must withhold a path that denotes the same characters in + // a different Unicode normalization form. Without NFC normalization at the + // matcher seam, an NFC-authored rule byte-compares unequal to an NFD-stored + // path and the blob leaks on the replication path. + #[test] + fn matcher_withholds_across_nfc_nfd_normalization_skew() { + // "é" composed (NFC, U+00E9) in the rule; decomposed (NFD, e + U+0301) + // in the committed path. + let nfc_rule = "/s\u{00e9}cret/**"; + let nfd_path = "/se\u{0301}cret/key.pem"; + let rules = [rule(nfc_rule, VisibilityMode::B, &["did:key:z6MkFriend"])]; + assert_eq!( + visibility_check(&rules, true, OWNER, None, nfd_path), + Decision::Deny, + "NFC-authored deny rule must withhold the canonically-equivalent NFD path" + ); + + // Mirror: rule authored NFD, path committed NFC. + let nfd_rule = "/se\u{0301}cret/**"; + let nfc_path = "/s\u{00e9}cret/key.pem"; + let rules2 = [rule(nfd_rule, VisibilityMode::B, &["did:key:z6MkFriend"])]; + assert_eq!( + visibility_check(&rules2, true, OWNER, None, nfc_path), + Decision::Deny, + "NFD-authored deny rule must withhold the canonically-equivalent NFC path" + ); + + // A genuinely different path is still allowed (no over-withholding). + assert_eq!( + visibility_check(&rules, true, OWNER, None, "/public/x.txt"), + Decision::Allow + ); + } }