diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6b16873..23809edd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,13 @@ on: pull_request: branches: [main] +# liblbug static-links on linux; its #cgo LDFLAGS use -Wl,--whole-archive to +# force liblbug's weak C++ RTTI into the binary for the dlopen'd FTS extension +# (see internal/thirdparty/go-ladybug/cgo_shared.go). --whole-archive isn't on +# cgo's built-in #cgo LDFLAGS allowlist, so permit it for every job's build/test. +env: + CGO_LDFLAGS_ALLOW: '-Wl,--(no-)?whole-archive' + jobs: test: runs-on: ${{ matrix.os }} @@ -20,6 +27,9 @@ jobs: with: go-version: ${{ matrix.go-version }} + - name: Fetch liblbug + run: bash scripts/fetch-lbug.sh + - name: Build run: go build -o gortex ./cmd/gortex/ @@ -47,6 +57,10 @@ jobs: with: go-version: '1.26' + - name: Fetch liblbug (windows dynamic — lbug_shared.dll) + shell: bash + run: bash scripts/fetch-lbug.sh + - name: Build CLI run: go build -o gortex.exe ./cmd/gortex/ @@ -77,6 +91,9 @@ jobs: with: go-version: '1.26' + - name: Fetch liblbug + run: bash scripts/fetch-lbug.sh + - name: Install ONNX Runtime run: | wget -q https://github.com/microsoft/onnxruntime/releases/download/v1.24.4/onnxruntime-linux-x64-1.24.4.tgz diff --git a/.github/workflows/init-smoke.yml b/.github/workflows/init-smoke.yml index 6a6c306a..d8924e17 100644 --- a/.github/workflows/init-smoke.yml +++ b/.github/workflows/init-smoke.yml @@ -18,6 +18,12 @@ on: - "cmd/gortex/init*.go" - "internal/agents/**" +# liblbug static-links on linux with -Wl,--whole-archive (forces its weak C++ +# RTTI into the binary for the dlopen'd FTS extension; see cgo_shared.go). +# Not on cgo's #cgo LDFLAGS allowlist, so permit it for the build step below. +env: + CGO_LDFLAGS_ALLOW: '-Wl,--(no-)?whole-archive' + jobs: dry-run: runs-on: ubuntu-latest @@ -29,6 +35,9 @@ jobs: go-version-file: go.mod cache: true + - name: Fetch liblbug + run: bash scripts/fetch-lbug.sh + - name: Build gortex run: go build -o /tmp/gortex ./cmd/gortex diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a6b01497..9b6f9491 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -79,6 +79,18 @@ jobs: chmod 600 "$SIGNING_DIR"/cert.* "$SIGNING_DIR"/notary.* + # Fetch the static liblbug for every unix target into + # lib/static/-/ on the host. $PWD is bind-mounted into the + # goreleaser-cross container, so the cross-compiles below link them + # in (self-contained binaries, nothing to ship alongside). Pinned by + # LBUG_VERSION inside the script. + - name: Fetch liblbug (linux + darwin, static) + run: | + bash scripts/fetch-lbug.sh linux amd64 + bash scripts/fetch-lbug.sh linux arm64 + bash scripts/fetch-lbug.sh darwin amd64 + bash scripts/fetch-lbug.sh darwin arm64 + - name: Run GoReleaser (cross-compile via Docker) # goreleaser-cross ships osxcross + aarch64/x86_64 gcc toolchains # so all 4 targets (linux/amd64, linux/arm64, darwin/amd64, @@ -208,6 +220,119 @@ jobs: rm -rf /tmp/macos-signing fi + # Windows is built on a NATIVE windows runner because lbug's windows lib + # is MSVC-built and must be linked dynamically — the mingw .exe loads + # lbug_shared.dll via `-l:lbug_shared.dll` (no import lib / gendef + # needed), so it can't be produced by the goreleaser-cross job above. + # This job builds, bundles the .exe with lbug_shared.dll + the mingw and + # VC++ runtime DLLs it needs, zips, cosign-signs, and appends the zip to + # the release the `release` job already created. + release-windows: + needs: release + runs-on: windows-latest + permissions: + contents: write + id-token: write + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6 + with: + go-version: '1.26' + + - uses: sigstore/cosign-installer@6f9f17788090df1f26f669e9d70d6ae9567deba6 # v4.1.2 + with: + cosign-release: v2.4.1 + + # Fetches lbug_shared.dll (the MSVC-built DLL) into + # lib/dynamic/windows/. The mingw-w64 toolchain the runner ships on + # PATH links the .exe directly against it. + - name: Fetch liblbug (windows, dynamic) + shell: bash + run: bash scripts/fetch-lbug.sh windows amd64 + + - name: Build gortex.exe + shell: bash + env: + CGO_ENABLED: "1" + run: | + set -euo pipefail + VER="${GITHUB_REF#refs/tags/}" + go build -ldflags "-s -w -X main.version=${VER} -X main.commit=$(git rev-parse --short HEAD) -X main.date=$(date -u +%Y-%m-%dT%H:%M:%SZ)" -o gortex.exe ./cmd/gortex/ + + - name: Stage exe + runtime DLLs + shell: bash + run: | + set -euo pipefail + mkdir -p stage + cp gortex.exe stage/ + cp internal/thirdparty/go-ladybug/lib/dynamic/windows/lbug_shared.dll stage/ + + # A missing runtime DLL must FAIL the release, never ship a + # zip whose .exe can't start. `gcc -print-file-name` echoes the + # bare name (exit 0) when it can't find the file, and the mingw + # runtime DLLs live in the toolchain's bin/ dir (not the lib/ + # dir -print-file-name searches), so resolve via bin/ and assert + # an absolute, existing path. + find_dll() { + local name="$1" hit + for base in \ + "$(dirname "$(command -v gcc 2>/dev/null || true)")" \ + "$(dirname "$(command -v x86_64-w64-mingw32-gcc 2>/dev/null || true)")" \ + /c/mingw64/bin /c/msys64/mingw64/bin /c/ProgramData/mingw64/mingw64/bin; do + [ -n "$base" ] && [ -f "$base/$name" ] && { echo "$base/$name"; return 0; } + done + hit="$(find /c/mingw64 /c/msys64 -name "$name" 2>/dev/null | head -1 || true)" + [ -n "$hit" ] && { echo "$hit"; return 0; } + return 1 + } + # mingw C/C++ runtime the .exe links dynamically. + for lib in libstdc++-6.dll libgcc_s_seh-1.dll libwinpthread-1.dll; do + p="$(find_dll "$lib")" || { echo "FATAL: mingw runtime $lib not found"; exit 1; } + cp "$p" stage/; echo "bundled $lib <- $p" + done + # VC++ runtime the MSVC-built lbug_shared.dll imports + # (MSVCP140/VCRUNTIME140*). Present on windows-latest (VS). + for d in VCRUNTIME140.dll VCRUNTIME140_1.dll MSVCP140.dll; do + if [ -f "/c/Windows/System32/$d" ]; then cp "/c/Windows/System32/$d" stage/; echo "bundled $d"; + else echo "FATAL: VC++ runtime $d not found on runner"; exit 1; fi + done + ls -la stage/ + + - name: Zip (gortex_windows_amd64.zip) + shell: pwsh + run: Compress-Archive -Path stage/* -DestinationPath gortex_windows_amd64.zip -Force + + - name: Sign + upload to release + shell: bash + env: + COSIGN_YES: "true" + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + cosign sign-blob \ + --output-signature gortex_windows_amd64.zip.sig \ + --output-certificate gortex_windows_amd64.zip.pem \ + gortex_windows_amd64.zip + TAG="${GITHUB_REF#refs/tags/}" + gh release upload "$TAG" \ + gortex_windows_amd64.zip \ + gortex_windows_amd64.zip.sig \ + gortex_windows_amd64.zip.pem \ + --clobber + + # Append the windows zip's sha256 to the release checksums.txt so + # the one-line installer (scripts/install.ps1, which verifies + # against checksums.txt) covers windows too — the unix goreleaser + # run only hashed its own artifacts. needs:release guarantees + # checksums.txt already exists. + sha="$(sha256sum gortex_windows_amd64.zip | awk '{print $1}')" + gh release download "$TAG" --pattern checksums.txt --clobber 2>/dev/null || : > checksums.txt + if ! grep -q "gortex_windows_amd64.zip" checksums.txt; then + printf '%s gortex_windows_amd64.zip\n' "$sha" >> checksums.txt + gh release upload "$TAG" checksums.txt --clobber + fi + # SLSA-3 provenance via the OpenSSF reusable workflow. This runs in a # separate, isolated job that the `release` job can't tamper with — # that isolation is what elevates us from SLSA-2 to SLSA-3. Output is diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 808e6b9a..dfbc56bc 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -12,6 +12,14 @@ permissions: contents: read security-events: write +# liblbug static-links on linux with -Wl,--whole-archive (forces its weak C++ +# RTTI into the binary so the dlopen'd FTS extension resolves — see +# internal/thirdparty/go-ladybug/cgo_shared.go). --whole-archive isn't on cgo's +# #cgo LDFLAGS allowlist, so govulncheck — which loads the cgo packages through +# the Go toolchain — must allow it, the same way ci.yml does. +env: + CGO_LDFLAGS_ALLOW: '-Wl,--(no-)?whole-archive' + jobs: govulncheck: runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 293c1886..07826a2a 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,20 @@ eval/scripts/ eval/logs/ internal_docs/ + +# liblbug native libraries are fetched at build time by +# scripts/fetch-lbug.sh (run by make / CI / release), never committed. +internal/thirdparty/go-ladybug/lib/ + +# Ad-hoc bench/probe tooling — kept locally, not part of the repo. +bench/all-tools-bench/ +bench/daemon-bench/ +bench/edge-diff/ +bench/ladybug-bundle-probe/ +bench/multi-repo-bench/ +bench/node-diff/ +bench/store-bench/ +bench/unresolved-audit/ +bench/run-linux.sh +bench/run-linux-rest.sh +cmd/lbug-probe/ diff --git a/.goreleaser.yml b/.goreleaser.yml index 787e87fc..993313c7 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -1,8 +1,17 @@ version: 2 # Run inside ghcr.io/goreleaser/goreleaser-cross — the Docker image ships -# cross-compile toolchains for all four targets below so CGO (tree-sitter) -# links cleanly on a single Linux runner. See .github/workflows/release.yml. +# cross-compile toolchains so CGO (tree-sitter + the statically-linked +# liblbug) links cleanly on a single Linux runner. This config builds the +# UNIX targets only (linux + darwin, both with liblbug static-linked into +# a self-contained binary). Windows is built separately on a native +# windows runner (see the `release-windows` job in release.yml) because +# lbug's windows lib is MSVC-built and must be linked dynamically + shipped +# as a DLL — it can't be static-linked from mingw. +# +# liblbug static archives are fetched into lib/static/-/ by the +# "Fetch liblbug" step in release.yml before this runs (the repo is +# bind-mounted into the container). before: hooks: - go mod tidy @@ -19,30 +28,19 @@ builds: # Version (see internal/version). Commit lands in the +build slot so # `gortex version` output round-trips as canonical semver. - -s -w -X main.version={{.Version}} -X main.commit={{.ShortCommit}} -X main.date={{.Date}} - # Statically link the mingw-w64 C/C++ runtime (libstdc++, libgcc, - # winpthread) into the Windows binary. CGO is on for tree-sitter and - # some grammar scanners ship C++; without -static the released - # gortex.exe dynamically links libstdc++-6.dll et al., which are not - # present on a stock Windows box — the binary fails to start with a - # missing-DLL error. No-op on linux/darwin, which keep their normal - # dynamic libc/libc++. - - '{{ if eq .Os "windows" }}-extldflags "-static"{{ end }}' env: - CGO_ENABLED=1 + # liblbug static-links on linux with -Wl,--whole-archive (forces its + # weak C++ RTTI into the binary for the dlopen'd FTS extension; see + # internal/thirdparty/go-ladybug/cgo_shared.go). --whole-archive isn't + # on cgo's #cgo LDFLAGS allowlist, so permit it. No-op for darwin. + - 'CGO_LDFLAGS_ALLOW=-Wl,--(no-)?whole-archive' goos: - linux - darwin - - windows goarch: - amd64 - arm64 - ignore: - # windows/arm64 needs an aarch64-w64-mingw32 cross-toolchain that - # the goreleaser-cross image doesn't ship; windows/amd64 covers - # every mainstream Windows dev box. Revisit when the image gains - # the llvm-mingw arm64 target. - - goos: windows - goarch: arm64 # Per-target CC + CXX. goreleaser-cross exposes these cross-toolchains # on PATH; CGO needs both set per target triple because some deps # (tree-sitter yaml scanner, etc.) ship C++. Without CXX, the system @@ -69,11 +67,6 @@ builds: env: - CC=aarch64-linux-gnu-gcc - CXX=aarch64-linux-gnu-g++ - - goos: windows - goarch: amd64 - env: - - CC=x86_64-w64-mingw32-gcc - - CXX=x86_64-w64-mingw32-g++ # Per-target build hook. Fires after each Mach-O / ELF is linked, # before the archive step. The script is a no-op for non-darwin # targets, so we don't need a per-override hook list. @@ -151,20 +144,9 @@ homebrew_casks: executable: gortex shell_parameter_format: cobra -# Scoop manifest — `scoop install gortex` on Windows. goreleaser commits -# the generated manifest (pointing at the signed windows/amd64 .zip in -# this release) to a separate bucket repo on every tagged release, -# exactly like the Homebrew cask above. -scoops: - - name: gortex - repository: - owner: gortexhq - name: scoop-bucket - # GITHUB_TOKEN can only push to the source repo, so the bucket - # needs its own PAT with `repo` scope on gortexhq/scoop-bucket, - # stored as SCOOP_BUCKET_TOKEN in repo secrets. release.yml wires - # it in. - token: "{{ .Env.SCOOP_BUCKET_TOKEN }}" - homepage: "https://github.com/zzet/gortex" - description: "Code intelligence engine that indexes repositories into an in-memory knowledge graph." - license: "Custom" +# NOTE: the Scoop manifest is intentionally NOT generated here. Windows is +# built by the separate `release-windows` job (native runner, dynamic +# liblbug) and isn't an artifact of this goreleaser-cross run, so goreleaser +# has no windows zip to point a scoop manifest at. Re-add a scoop manifest +# (pointing at the windows job's zip) as a follow-up once the windows +# release path is settled. diff --git a/Makefile b/Makefile index a80f421e..52c69dcc 100644 --- a/Makefile +++ b/Makefile @@ -9,17 +9,36 @@ COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) LDFLAGS := -s -w -X main.version=$(VERSION) -X main.commit=$(COMMIT) -X main.date=$(DATE) +# liblbug links statically on linux; the #cgo LDFLAGS use -Wl,--whole-archive +# to force its weak C++ RTTI objects into the binary so the dlopen'd FTS +# extension resolves them (paired with -rdynamic — see cgo_shared.go). +# --whole-archive isn't on cgo's #cgo LDFLAGS allowlist, so it must be +# explicitly permitted. Exported so every go build/test recipe inherits it; +# it's a no-op on darwin/windows (those targets don't use the flag). +export CGO_LDFLAGS_ALLOW := -Wl,--(no-)?whole-archive + .PHONY: build build-onnx build-gomlx build-hugot build-windows \ - test bench bench-rpi bench-rpi-quick bench-rpi-profile bench-compare \ + lbug test bench bench-rpi bench-rpi-quick bench-rpi-profile bench-compare \ lint fmt clean install dev-link tag-release \ deps-onnx deps-gomlx deps-hugot deps-vectors \ claude-plugin claude-plugin-check +# --------------------------------------------------------------------------- +# Native dependency: liblbug (the ladybug storage engine) +# --------------------------------------------------------------------------- +# Fetched at build time, never committed. Static on linux/darwin (baked +# into a self-contained binary); dynamic on windows (lbug's windows build +# is MSVC — the .exe links lbug_shared.dll via a generated mingw import +# lib and ships the DLL alongside). Idempotent: skips if present; set +# LBUG_FORCE=1 to refetch, LBUG_VERSION to pin a version. +lbug: + @bash scripts/fetch-lbug.sh + # --------------------------------------------------------------------------- # Build variants # --------------------------------------------------------------------------- -build: +build: lbug go build -ldflags '$(LDFLAGS)' -tags llama -o $(BINARY) ./cmd/gortex/ build-onnx: deps-onnx @@ -33,7 +52,7 @@ build-gomlx: deps-gomlx build-hugot: deps-hugot go build -ldflags '$(LDFLAGS)' -o $(BINARY) ./cmd/gortex/ -test: +test: lbug go test -race ./... bench: @@ -116,6 +135,7 @@ tag-release: # Cross-compile for Raspberry Pi (ARM64) build-rpi: + @bash scripts/fetch-lbug.sh linux arm64 CGO_ENABLED=1 GOOS=linux GOARCH=arm64 CC=aarch64-linux-gnu-gcc \ go build -ldflags '$(LDFLAGS)' -o gortex-rpi ./cmd/gortex/ @echo "✓ Built gortex-rpi (linux/arm64)" @@ -134,10 +154,11 @@ build-rpi32: # mingw-w64 C/C++ runtime (libstdc++, libgcc, winpthread) into the .exe # so it runs on a stock Windows box without bundled DLLs. build-windows: + @bash scripts/fetch-lbug.sh windows amd64 CGO_ENABLED=1 GOOS=windows GOARCH=amd64 \ CC=x86_64-w64-mingw32-gcc CXX=x86_64-w64-mingw32-g++ \ go build -ldflags '$(LDFLAGS) -extldflags "-static"' -o gortex.exe ./cmd/gortex/ - @echo "✓ Built gortex.exe (windows/amd64)" + @echo "✓ Built gortex.exe (windows/amd64) — ship lbug_shared.dll alongside" # --------------------------------------------------------------------------- # Marketplace plugin bundle diff --git a/cmd/gortex/backend.go b/cmd/gortex/backend.go new file mode 100644 index 00000000..b3d97955 --- /dev/null +++ b/cmd/gortex/backend.go @@ -0,0 +1,82 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/graph" +) + +// openBackend constructs the graph.Store the daemon will run +// against. Picks the implementation by the --backend flag: +// +// - "memory" (default) — in-process *graph.Graph; nothing +// persists across runs; matches every existing test fixture. +// - "ladybug" — embedded Cypher property-graph DB; persists to +// --backend-path; only available when the binary is built +// with `-tags ladybug`. +// +// Returns the store, a cleanup func the caller must defer (closes +// the underlying handle on disk-backed stores), and any error +// constructing or opening the store. +// +// The actual per-backend Open* helpers live in their own +// build-tagged files (backend_memory.go is always built; the +// disk-backed ones are gated by build tags). This file is the +// shared dispatch. +func openBackend(name, path string, bufferPoolMB uint64, logger *zap.Logger) (graph.Store, func(), error) { + switch strings.ToLower(strings.TrimSpace(name)) { + case "", "memory", "mem", "in-memory": + s := graph.New() + return s, func() {}, nil + case "ladybug", "lbug": + resolved, err := resolveBackendPath(path, "store.lbug") + if err != nil { + return nil, nil, err + } + logger.Info("opening ladybug backend", + zap.String("path", resolved), + zap.Uint64("buffer_pool_mb", bufferPoolMB), + ) + return openLadybugBackend(resolved, bufferPoolMB) + default: + return nil, nil, fmt.Errorf("unknown --backend %q (expected: memory, ladybug)", name) + } +} + +// resolveBackendPath turns an empty --backend-path into a default +// at ~/.gortex/. Otherwise expands ~ and returns the +// absolute path. Creates the parent directory if missing — the +// disk-backed stores expect the parent dir to exist. +func resolveBackendPath(in, filename string) (string, error) { + in = strings.TrimSpace(in) + if in == "" { + home, err := os.UserHomeDir() + if err != nil { + return "", fmt.Errorf("resolve home dir: %w", err) + } + in = filepath.Join(home, ".gortex", filename) + } else if strings.HasPrefix(in, "~/") { + home, err := os.UserHomeDir() + if err != nil { + return "", fmt.Errorf("resolve home dir: %w", err) + } + in = filepath.Join(home, in[2:]) + } + abs, err := filepath.Abs(in) + if err != nil { + return "", fmt.Errorf("abs path %q: %w", in, err) + } + // Ladybug Open expects either an existing directory (it reuses + // it) or a non-existing path (it creates the dir). We MkdirAll + // the parent so the path is reachable; the store itself opens + // the leaf. + if err := os.MkdirAll(filepath.Dir(abs), 0o755); err != nil { + return "", fmt.Errorf("mkdir parent %q: %w", filepath.Dir(abs), err) + } + return abs, nil +} diff --git a/cmd/gortex/backend_ladybug.go b/cmd/gortex/backend_ladybug.go new file mode 100644 index 00000000..0b8a299e --- /dev/null +++ b/cmd/gortex/backend_ladybug.go @@ -0,0 +1,40 @@ +package main + +import ( + "fmt" + + "github.com/zzet/gortex/internal/daemon" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_ladybug" +) + +// openLadybugBackend opens (or creates) the ladybug store at +// path. Returns a cleanup func that closes the underlying handle +// — important because ladybug's writer locks the directory and +// a subsequent reopen on the same path would fail until the +// previous handle is closed. +func openLadybugBackend(path string, bufferPoolMB uint64) (graph.Store, func(), error) { + s, err := store_ladybug.OpenWithOptions(path, store_ladybug.Options{ + BufferPoolMB: bufferPoolMB, + }) + if err != nil { + // liblbug collapses every open failure — including "another + // process already holds the lock on this store" — into a single + // generic status with no message (lbug_state is just Success/Error, + // and lbug_database_init exposes no error string). A second gortex + // process on the same store is the most common cause, so name it + // instead of leaving the user the bare, unactionable status code. + hint := "if another gortex daemon or server is using this store, stop it first (`gortex daemon status` / `gortex daemon stop`)" + if pid, ok := daemon.RunningPID(); ok { + hint = fmt.Sprintf("a gortex daemon is already running (pid %d) — stop it with `gortex daemon stop`, or use `gortex daemon restart`", pid) + } + return nil, nil, fmt.Errorf("open ladybug store at %q: %w (%s)", path, err, hint) + } + return s, func() { _ = s.Close() }, nil +} + +// The daemon warm-restart path consults this optional capability +// (cmd/gortex/daemon_state.go: storeNeedsRebuild) to force a full re-index +// when a schema migration crossed a rebuild rung. This assertion keeps the +// concrete store and the daemon's optional-interface check from drifting. +var _ interface{ NeedsRebuild() bool } = (*store_ladybug.Store)(nil) diff --git a/cmd/gortex/daemon.go b/cmd/gortex/daemon.go index c04b469a..d709b185 100644 --- a/cmd/gortex/daemon.go +++ b/cmd/gortex/daemon.go @@ -34,11 +34,14 @@ var ( // (the function has no *cobra.Command of its own) to decide whether // the flag overrides the `embedding:` config block. Set once in // runDaemonStart before buildDaemonState runs. - daemonEmbeddingsChanged bool - daemonStatusWatch bool - daemonStatusInterval time.Duration - daemonHTTPAddr string - daemonHTTPAuthToken string + daemonEmbeddingsChanged bool + daemonStatusWatch bool + daemonStatusInterval time.Duration + daemonHTTPAddr string + daemonHTTPAuthToken string + daemonBackend string + daemonBackendPath string + daemonBackendBufferPoolMB uint64 ) var daemonCmd = &cobra.Command{ @@ -97,6 +100,12 @@ func init() { "also expose the MCP 2026 Streamable HTTP transport on this TCP address (e.g. 127.0.0.1:7411); empty disables") daemonStartCmd.Flags().StringVar(&daemonHTTPAuthToken, "http-auth-token", "", "bearer token required on every Streamable HTTP request (default: read $GORTEX_DAEMON_HTTP_TOKEN; empty allows unauthenticated localhost binds)") + daemonStartCmd.Flags().StringVar(&daemonBackend, "backend", "ladybug", + "storage backend: ladybug (default — embedded Cypher graph DB, persists to --backend-path so warm restarts skip re-indexing) | memory (in-process, no persistence — fastest per-op but pays the full cold-warmup cost on every restart)") + daemonStartCmd.Flags().StringVar(&daemonBackendPath, "backend-path", "", + "directory where the on-disk backend persists its store. Required when --backend != memory. Default: ~/.gortex/.store") + daemonStartCmd.Flags().Uint64Var(&daemonBackendBufferPoolMB, "backend-buffer-pool-mb", 0, + "page-cache cap for the on-disk backend in MiB. 0 reads $GORTEX_DAEMON_BUFFER_POOL_MB or falls back to 4096 (4 GiB); only consulted for --backend=ladybug") daemonLogsCmd.Flags().IntVarP(&daemonTail, "tail", "n", 50, "show only the last N log lines") daemonStatusCmd.Flags().BoolVarP(&daemonStatusWatch, "watch", "w", false, @@ -121,6 +130,17 @@ func runDaemonStart(cmd *cobra.Command, _ []string) error { if daemon.IsRunning() { return fmt.Errorf("daemon already running (socket: %s)", daemon.SocketPath()) } + // IsRunning only probes the socket. A daemon that is mid-shutdown — or + // one whose socket wedged — still owns the PID file and, crucially, still + // holds the store's on-disk lock. Starting over the top of it makes the + // backend open fail with an opaque "failed to open database" lock + // conflict, so refuse early with the PID and an actionable next step. The + // detached child reaches here too, but it hasn't written its own PID file + // yet (that happens in the serve loop), so this can't false-positive on + // the daemon we're in the middle of starting. + if pid, ok := daemon.RunningPID(); ok { + return fmt.Errorf("daemon already running (pid %d) — stop it with `gortex daemon stop`, or use `gortex daemon restart`", pid) + } if daemonDetach && os.Getenv("GORTEX_DAEMON_CHILD") != "1" { return spawnDetachedDaemon() } @@ -174,7 +194,19 @@ func runDaemonStart(cmd *cobra.Command, _ []string) error { if mw != nil { _ = mw.Stop() } - saveSnapshot(state.graph, collectSnapshotRepos(state.multiIndexer), collectSnapshotContracts(state.multiIndexer), collectSnapshotVector(state.multiIndexer), version, logger) + if mg, ok := state.graph.(*graph.Graph); ok { + // Memory backend — snapshot the full in-memory graph; + // the next warmup replays nodes/edges from the gob+gzip + // dump because there's no other persistence layer. + saveSnapshot(mg, collectSnapshotRepos(state.multiIndexer), collectSnapshotContracts(state.multiIndexer), collectSnapshotVector(state.multiIndexer), version, logger) + } + // Persistent backends (ladybug) no longer write a metadata + // snapshot: per-file mtimes live in the FileMtime sidecar + // table, contract records ride on KindContract.Meta, and the + // vector index is served directly by the ladybug native HNSW + // (`CALL QUERY_VECTOR_INDEX`). Warm restart reads everything + // it needs from `store.lbug` — no gob+gzip round-trip + // required. if state.mcpServer != nil { _ = state.mcpServer.FlushSavings() } @@ -309,7 +341,23 @@ func runDaemonStart(cmd *cobra.Command, _ []string) error { // the GC then has to clean up. Skipping snapshots until ready cleared // a stall observed in profile #5 where saveSnapshotTo was the only // runnable goroutine on a daemon mid-warmup. - stopSnapshotter := startPeriodicSnapshots(state.graph, state.multiIndexer, version, 10*time.Minute, controller.IsReady, logger) + // Periodic snapshots. For the memory backend this is the full + // gob+gzip export of the in-memory graph. For persistent backends + // (ladybug) it's metadata-only — repos + contracts + vector — + // since the backend already persists the graph itself. Both + // shapes feed the warm-restart path that uses ReconcileRepoCtx + // instead of full TrackRepoCtx; without the metadata save, warm + // restart had no FileMtimes and crashed in BulkUpsertSymbolFTS. + // Periodic snapshots fire only for the memory backend — that's + // the path that has no other persistence layer for the graph + // itself. Ladybug-backed daemons rely on the backend's own + // durability (graph → store.lbug, FileMtimes → FileMtime sidecar + // table, contracts → KindContract.Meta, vectors → SymbolVec) so + // the gob+gzip snapshot is dead weight in that mode. + stopSnapshotter := func() {} + if mg, ok := state.graph.(*graph.Graph); ok { + stopSnapshotter = startPeriodicSnapshots(mg, state.multiIndexer, version, 10*time.Minute, controller.IsReady, logger) + } defer stopSnapshotter() // Periodic savings flush — 5 minute interval. Bounds on-crash data @@ -364,6 +412,15 @@ func runDaemonStart(cmd *cobra.Command, _ []string) error { // first" against a fully populated state. if state.mcpServer != nil { state.mcpServer.RunAnalysis() + // Co-change pre-warm: fire the git-history mine in the + // background so the first user-visible + // find_co_changing_symbols / search-rerank call sees a + // populated cache. On Ladybug the mine is dominated by + // the AllNodes + per-pair AddEdge disk-persist step that + // mineCoChange already defers into its own goroutine — + // but even the git log itself can take 10–30s on a large + // history, and we want that off every request path. + state.mcpServer.PrewarmCoChange() } elapsed := time.Since(start) controller.MarkReady(elapsed) @@ -609,6 +666,13 @@ func emitDaemonStartSummary(w io.Writer, pid int, elapsed time.Duration) { func runDaemonStop(cmd *cobra.Command, _ []string) error { w := cmd.ErrOrStderr() if !daemon.IsRunning() { + // The socket is gone, but a process may still be alive and holding + // the store lock — a daemon mid-shutdown, or one whose socket wedged. + // killByPID terminates it AND blocks until it has actually exited, + // which is what `daemon restart` relies on to not race the lock. + if _, ok := daemon.RunningPID(); ok { + return killByPID() + } emitDaemonStopAlreadyDown(w) return nil } @@ -617,6 +681,13 @@ func runDaemonStop(cmd *cobra.Command, _ []string) error { // post-stop summary (the socket file vanishes on clean shutdown). socket := daemon.SocketPath() uptime := daemonUptimeBeforeStop() + // Capture the PID too. ControlShutdown only *acks* — the daemon then + // flushes and closes the store (releasing its on-disk lock) and exits + // asynchronously (see server.go: the handler Shutdown()s ~100ms later in + // a goroutine). We must block until that process is gone, or a following + // `daemon start` races the still-held lock and dies with the opaque + // "failed to open database with status 1". + pid, havePID := daemon.RunningPID() c, err := daemon.Dial(daemon.Handshake{Mode: daemon.ModeControl, ClientName: "cli"}) if err != nil { @@ -632,10 +703,39 @@ func runDaemonStop(cmd *cobra.Command, _ []string) error { if !resp.OK { return fmt.Errorf("shutdown rejected: %s %s", resp.ErrorCode, resp.ErrorMsg) } + if havePID { + waitForDaemonExit(pid) + } emitDaemonStopSummary(w, socket, uptime) return nil } +// waitForDaemonExit blocks until the daemon process pid has exited — and thus +// released the store's on-disk lock — force-killing it if a graceful shutdown +// stalls. This is what makes `daemon stop` honest: when it returns, the store +// is free for the next process, which is the foundation `daemon restart` +// stands on. Polls cheaply; the common case (a clean flush) clears in well +// under a second. +func waitForDaemonExit(pid int) { + deadline := time.Now().Add(15 * time.Second) + for time.Now().Before(deadline) { + if !platform.ProcessAlive(pid) { + return + } + time.Sleep(50 * time.Millisecond) + } + // Graceful shutdown stalled (e.g. a wedged cgo call). Don't leave a + // half-exited daemon clutching the lock — force it, then clean up the + // socket/PID so the next start isn't tripped by stale files. + fmt.Fprintln(os.Stderr, "[gortex daemon] graceful shutdown timed out — force-killing") + _ = platform.KillProcess(pid) + for i := 0; i < 60 && platform.ProcessAlive(pid); i++ { + time.Sleep(50 * time.Millisecond) + } + _ = os.Remove(daemon.PIDFilePath()) + _ = os.Remove(daemon.SocketPath()) +} + // daemonUptimeBeforeStop best-effort-fetches the daemon's reported uptime via // a Status control before shutdown so the summary card can show how long the // process ran. Returns 0 on any error — we'd rather degrade the card than @@ -709,15 +809,17 @@ func runDaemonRestart(cmd *cobra.Command, args []string) error { emitDaemonRestartBanner(cmd.ErrOrStderr()) - // Stop is idempotent when not running. + // Stop is idempotent when not running and now blocks until the old + // process has fully exited — releasing the store's on-disk lock — before + // returning. That's what lets the start below reuse the store without + // racing the lock. The old code polled `daemon.IsRunning()` here, which + // watched the wrong resource: the socket is torn down ~100ms after the + // shutdown ack, long before the process exits and the lock clears, so the + // poll fell through early and the restart died on "failed to open + // database with status 1". if err := runDaemonStop(cmd, args); err != nil { return err } - // Give the OS a moment to release the socket file. - deadline := time.Now().Add(3 * time.Second) - for time.Now().Before(deadline) && daemon.IsRunning() { - time.Sleep(50 * time.Millisecond) - } daemonDetach = true return runDaemonStart(cmd, args) } @@ -812,8 +914,10 @@ func renderDaemonHeader(w io.Writer, st daemon.StatusResponse) { t.AppendRow(table.Row{"socket", st.SocketPath}) t.AppendRow(table.Row{"uptime", formatDuration(time.Duration(st.UptimeSeconds) * time.Second)}) if st.Ready { - t.AppendRow(table.Row{"state", - fmt.Sprintf("ready (warmup %s)", formatDuration(time.Duration(st.WarmupSeconds)*time.Second))}) + t.AppendRow(table.Row{ + "state", + fmt.Sprintf("ready (warmup %s)", formatDuration(time.Duration(st.WarmupSeconds)*time.Second)), + }) } else { t.AppendRow(table.Row{"state", "warming up (socket reachable, background re-index in progress)"}) } @@ -1131,6 +1235,21 @@ func daemonControlClient() (*daemon.Client, error) { return c, nil } +// resolveDaemonBufferPoolMB returns the effective buffer-pool cap. +// Precedence: --backend-buffer-pool-mb flag > GORTEX_DAEMON_BUFFER_POOL_MB env > 0 +// (which Open then maps to DefaultBufferPoolMB inside the store). +func resolveDaemonBufferPoolMB() uint64 { + if daemonBackendBufferPoolMB != 0 { + return daemonBackendBufferPoolMB + } + if env := strings.TrimSpace(os.Getenv("GORTEX_DAEMON_BUFFER_POOL_MB")); env != "" { + if v, err := strconv.ParseUint(env, 10, 64); err == nil { + return v + } + } + return 0 +} + // killByPID is the fallback stop path for stale daemons that have a PID // file but don't respond on the socket. Asks the process to terminate, // waits, then force-kills. Silently returns nil if the PID no longer diff --git a/cmd/gortex/daemon_controller.go b/cmd/gortex/daemon_controller.go index 630f0e94..23db5319 100644 --- a/cmd/gortex/daemon_controller.go +++ b/cmd/gortex/daemon_controller.go @@ -14,10 +14,12 @@ import ( "go.uber.org/zap" + "github.com/zzet/gortex/internal/churn" "github.com/zzet/gortex/internal/config" "github.com/zzet/gortex/internal/daemon" "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/indexer" + "github.com/zzet/gortex/internal/releases" "github.com/zzet/gortex/internal/search" "github.com/zzet/gortex/internal/semantic/lsp" ) @@ -31,7 +33,7 @@ import ( // otherwise. The mutex is coarse; finer locking is a later optimization. type realController struct { mu sync.Mutex - graph *graph.Graph + graph graph.Store indexer *indexer.Indexer multiIndexer *indexer.MultiIndexer configManager *config.ConfigManager @@ -112,6 +114,121 @@ func (c *realController) Track(ctx context.Context, p daemon.TrackParams) (json. }) } +// EnrichChurn runs the churn enricher in-process against the daemon's +// graph. We hold c.mu for the duration so a concurrent Track/Untrack +// can't reshape the set of files while the enricher walks them. The +// caller (CLI / git hook) picks the params; an empty Path means "every +// tracked repo", an empty Branch means "resolve each repo's default +// branch from its working tree". +func (c *realController) EnrichChurn(ctx context.Context, p daemon.EnrichChurnParams) (daemon.EnrichChurnResult, error) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.graph == nil { + return daemon.EnrichChurnResult{}, fmt.Errorf("graph not initialized") + } + if c.multiIndexer == nil { + return daemon.EnrichChurnResult{}, fmt.Errorf("multi-repo indexer not initialized") + } + + // Resolve the set of repo roots the call targets. Empty Path = + // every tracked repo. A path or prefix narrows to one. + type target struct { + prefix string + root string + } + var targets []target + want := strings.TrimSpace(p.Path) + for prefix, meta := range c.multiIndexer.AllMetadata() { + if want != "" && want != prefix && want != meta.RootPath { + continue + } + targets = append(targets, target{prefix: prefix, root: meta.RootPath}) + } + if len(targets) == 0 { + return daemon.EnrichChurnResult{}, fmt.Errorf("no tracked repo matches %q", p.Path) + } + + started := time.Now() + var combined daemon.EnrichChurnResult + for _, t := range targets { + branch := strings.TrimSpace(p.Branch) + if branch == "" { + branch = gitDefaultBranch(t.root) + } + if branch == "" { + c.logger.Warn("enrich churn: no default branch resolved", + zap.String("prefix", t.prefix), zap.String("root", t.root)) + continue + } + res, err := churn.EnrichGraph(ctx, c.graph, t.root, churn.Options{Branch: branch}) + if err != nil { + return daemon.EnrichChurnResult{}, fmt.Errorf("enrich %s: %w", t.prefix, err) + } + combined.Files += res.Files + combined.Symbols += res.Symbols + combined.Branch = res.Branch + combined.HeadSHA = res.HeadSHA + } + combined.DurationMS = time.Since(started).Milliseconds() + return combined, nil +} + +// EnrichReleases runs the per-file release enricher against the +// daemon's graph. Mirrors EnrichChurn — c.mu is held for the duration, +// targets resolve via the multi-indexer, and an empty Branch lets +// each repo's default branch be resolved on demand (so feature-branch +// tags don't leak into the timeline). +func (c *realController) EnrichReleases(ctx context.Context, p daemon.EnrichReleasesParams) (daemon.EnrichReleasesResult, error) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.graph == nil { + return daemon.EnrichReleasesResult{}, fmt.Errorf("graph not initialized") + } + if c.multiIndexer == nil { + return daemon.EnrichReleasesResult{}, fmt.Errorf("multi-repo indexer not initialized") + } + + type target struct { + prefix string + root string + } + var targets []target + want := strings.TrimSpace(p.Path) + for prefix, meta := range c.multiIndexer.AllMetadata() { + if want != "" && want != prefix && want != meta.RootPath { + continue + } + targets = append(targets, target{prefix: prefix, root: meta.RootPath}) + } + if len(targets) == 0 { + return daemon.EnrichReleasesResult{}, fmt.Errorf("no tracked repo matches %q", p.Path) + } + _ = ctx // graph mutation is synchronous; no cancellation surface today + + started := time.Now() + var combined daemon.EnrichReleasesResult + for _, t := range targets { + branch := strings.TrimSpace(p.Branch) + if branch == "" { + branch = gitDefaultBranch(t.root) + // Empty branch is still legal — releases.EnrichGraphForBranch + // treats "" as "every tag", which is the right default when + // no default branch can be resolved (e.g. a clone without + // origin/HEAD set yet). + } + count, err := releases.EnrichGraphForBranch(c.graph, t.root, t.prefix, branch) + if err != nil { + return daemon.EnrichReleasesResult{}, fmt.Errorf("enrich %s: %w", t.prefix, err) + } + combined.Files += count + combined.Branch = branch + } + combined.DurationMS = time.Since(started).Milliseconds() + return combined, nil +} + // Untrack evicts a repo from the graph and drops it from config. // PathOrPrefix accepts either an absolute path or a repo prefix. func (c *realController) Untrack(_ context.Context, p daemon.UntrackParams) (json.RawMessage, error) { diff --git a/cmd/gortex/daemon_rebuild_test.go b/cmd/gortex/daemon_rebuild_test.go new file mode 100644 index 00000000..990b0b8c --- /dev/null +++ b/cmd/gortex/daemon_rebuild_test.go @@ -0,0 +1,33 @@ +package main + +import "testing" + +type fakeRebuildYes struct{} + +func (fakeRebuildYes) NeedsRebuild() bool { return true } + +type fakeRebuildNo struct{} + +func (fakeRebuildNo) NeedsRebuild() bool { return false } + +// storeNeedsRebuild must detect the optional NeedsRebuild capability and +// default to false for backends that don't implement it (the in-memory +// store), so the warm-restart fast path is bypassed only on an explicit +// rebuild signal. +func TestStoreNeedsRebuild(t *testing.T) { + cases := []struct { + name string + g any + want bool + }{ + {"implements true", fakeRebuildYes{}, true}, + {"implements false", fakeRebuildNo{}, false}, + {"no capability", struct{}{}, false}, + {"nil", nil, false}, + } + for _, c := range cases { + if got := storeNeedsRebuild(c.g); got != c.want { + t.Errorf("%s: storeNeedsRebuild = %v, want %v", c.name, got, c.want) + } + } +} diff --git a/cmd/gortex/daemon_snapshot.go b/cmd/gortex/daemon_snapshot.go index c263d346..ba078315 100644 --- a/cmd/gortex/daemon_snapshot.go +++ b/cmd/gortex/daemon_snapshot.go @@ -338,7 +338,12 @@ func migrateSnapshotFile(path string, fromVersion int) (io.Reader, error) { // The vec argument carries the workspace-global vector-search index so // a default-on daemon does not re-embed the whole graph on restart. func saveSnapshot(g *graph.Graph, repos []snapshotRepo, snapContracts []snapshotContract, vec snapshotVector, version string, logger *zap.Logger) { - _ = saveSnapshotTo(g, repos, snapContracts, vec, version, daemon.SnapshotPath(), logger) + // Memory backend: the gob+gzip dump IS the persistence layer, so + // route to the per-backend path so a future ladybug-backed daemon + // can't accidentally pick up this snapshot at startup. See + // daemon.BackendSnapshotPath for the memory ↔ ladybug switch + // rationale. + _ = saveSnapshotTo(g, repos, snapContracts, vec, version, daemon.BackendSnapshotPath("memory"), logger) } // saveSnapshotTo writes the snapshot to an explicit path. Used by @@ -585,6 +590,14 @@ func fromSnapshotContract(s snapshotContract) contracts.Contract { // trades "one bad byte poisons the entire cache" for "N bad records // cost at most N files being re-indexed on next warmup." func loadSnapshot(g *graph.Graph, logger *zap.Logger) (snapshotLoadResult, error) { + // Memory backend reads from its own backend-tagged path. Falls + // back transparently to the legacy unsuffixed daemon.gob.gz when + // the override env is set or the new file doesn't exist yet, so + // users upgrading across this change don't have to re-warm. + res, err := loadSnapshotFrom(g, daemon.BackendSnapshotPath("memory"), logger) + if err == nil && (res.Loaded || res.Partial) { + return res, nil + } return loadSnapshotFrom(g, daemon.SnapshotPath(), logger) } @@ -592,7 +605,7 @@ func loadSnapshot(g *graph.Graph, logger *zap.Logger) (snapshotLoadResult, error // Used by `gortex server --snapshot ` so a per-workspace // process can boot from a specific snapshot file produced by the // cloud indexer worker. -func loadSnapshotFrom(g *graph.Graph, path string, logger *zap.Logger) (snapshotLoadResult, error) { +func loadSnapshotFrom(g graph.Store, path string, logger *zap.Logger) (snapshotLoadResult, error) { // Allocate Contracts up front so every early-return path (missing // file, gzip error, header decode error, schema mismatch) hands the // caller a safe-to-read zero-value instead of a nil map. The warmup diff --git a/cmd/gortex/daemon_state.go b/cmd/gortex/daemon_state.go index 48ff7e27..f7bc5e36 100644 --- a/cmd/gortex/daemon_state.go +++ b/cmd/gortex/daemon_state.go @@ -9,6 +9,7 @@ import ( "sort" "strings" "sync" + "sync/atomic" "time" "go.uber.org/zap" @@ -36,7 +37,7 @@ import ( // instance per running daemon; every session the daemon accepts shares // these pointers. type daemonState struct { - graph *graph.Graph + graph graph.Store indexer *indexer.Indexer multiIndexer *indexer.MultiIndexer configManager *config.ConfigManager @@ -177,7 +178,20 @@ func buildDaemonState(logger *zap.Logger) (*daemonState, error) { } } - g := graph.New() + g, backendCleanup, err := openBackend(daemonBackend, daemonBackendPath, resolveDaemonBufferPoolMB(), logger) + if err != nil { + return nil, fmt.Errorf("opening backend %q: %w", daemonBackend, err) + } + // Cleanup runs at daemon shutdown via the returned state's + // teardown chain (see DaemonState.Close); store it on the + // state so deferred close fires after every other shutdown + // step (snapshot save, etc.). + defer func() { + if err != nil { + backendCleanup() + } + }() + reg := parser.NewRegistry() languages.RegisterAll(reg) languages.RegisterCustomGrammars(reg, cfg.Index.Grammars, logger) @@ -190,10 +204,36 @@ func buildDaemonState(logger *zap.Logger) (*daemonState, error) { // have no signal to distinguish "indexed and unchanged" from "new // on disk", treat everything as stale, and produce duplicate // nodes/edges on every restart (bug B1). - loadResult, err := loadSnapshot(g, logger) - if err != nil { - logger.Warn("daemon: snapshot load failed", zap.Error(err)) + // + // Two snapshot shapes: + // + // - Memory backend: full graph replay (loadSnapshot). The + // gob+gzip dump IS the persistence layer; nodes + edges are + // replayed into the empty *graph.Graph. + // + // - Persistent backend (ladybug): metadata-only load + // (loadSnapshotMetadata). The graph already lives in the + // backend's own on-disk store, so the snapshot only needs to + // carry the data the backend doesn't track — per-repo + // FileMtimes, contract registries, vector index. Skipping the + // load entirely (the previous behaviour) left priorMtimes + // empty and routed every warm restart through a full + // TrackRepoCtx → BulkUpsertSymbolFTS path that crashes on an + // already-populated store. + var loadResult snapshotLoadResult + if mg, ok := g.(*graph.Graph); ok { + loadResult, err = loadSnapshot(mg, logger) + if err != nil { + logger.Warn("daemon: snapshot load failed", zap.Error(err)) + } } + // Ladybug-backed daemons don't read a metadata snapshot: per- + // repo FileMtimes live in the FileMtime sidecar table (loaded + // per-repo by priorMtimesFromStore in the parallel_parse loop + // below), KindContract nodes carry the rich contract record on + // Node.Meta (rehydrated via contracts.LoadRegistryFromGraph), + // and vector queries route to ladybug's native HNSW. The legacy + // gob round-trip is now memory-backend-only. idx := indexer.New(g, reg, cfg.Index, logger) @@ -652,11 +692,34 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat jobs := make(chan config.RepoEntry, len(repos)) var wg sync.WaitGroup + // changedRepos counts repos that actually did indexing work this + // warmup: a cold full-track, or a reconcile that re-indexed / evicted + // at least one file. When it stays zero, NOTHING on disk changed + // since the last shutdown, so the persisted graph already holds every + // resolved and derived edge — the global resolution passes below + // (RunDeferredPassesAll / RunGlobalResolve / RunGlobalGraphPasses) are + // pure recomputation and get skipped, which is what makes a true warm + // restart near-instant instead of replaying the full cold-warmup cost. + var changedRepos atomic.Int64 for i := 0; i < workers; i++ { wg.Add(1) go func() { defer wg.Done() for entry := range jobs { + // Per-entry panic guard so one repo's CGo / liblbug + // crash (e.g. the "mutex lock failed: Invalid + // argument" the resolver's stub-merge path surfaces + // on certain warm-restart shapes) doesn't kill the + // worker — the bad repo logs and skips, the worker + // proceeds to the next job, and warmup completes. + func(entry config.RepoEntry) { + defer func() { + if r := recover(); r != nil { + logger.Error("daemon: warmup repo panic recovered", + zap.String("path", entry.Path), + zap.Any("panic", r)) + } + }() // Route repos whose nodes came from the snapshot through // ReconcileRepoCtx — it calls IncrementalReindex, which // evicts files deleted while the daemon was down and @@ -677,20 +740,58 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat // erodes the graph until exported methods show zero // callers despite having dozens of real call sites. repoStart := time.Now() - priorMtimes := priorMtimesForEntry(state.snapshotRepos, entry) + // Prefer mtimes stored in the backend's FileMtime + // sidecar table — that lifts the persistence off the + // gob snapshot for the ladybug backend, which is the + // path that actually rebuilds across restarts. Falls + // back to the snapshot's per-repo FileMtimes when the + // backend doesn't implement the reader (memory) or + // hasn't seen this repo yet. + priorMtimes := priorMtimesFromStore(state.graph, entry, logger) + if len(priorMtimes) == 0 { + priorMtimes = priorMtimesForEntry(state.snapshotRepos, entry) + } if state.snapshotPartial { priorMtimes = nil } + // A backend that crossed a schema-rebuild migration rung + // (NeedsRebuild) has on-disk rows in the old shape that an + // incremental reconcile cannot fix. Drop prior mtimes so every + // file re-indexes into the new schema (the nil branch below + // runs a full TrackRepoCtx and marks the repo changed, so the + // global resolve/derivation passes re-run too). No-op for + // backends without the capability and whenever no rebuild rung + // was crossed — the common case. + if storeNeedsRebuild(state.graph) { + if len(priorMtimes) > 0 { + logger.Info("daemon: backend signalled schema rebuild; forcing full re-index", + zap.String("path", entry.Path)) + } + priorMtimes = nil + } pathFn := "track" if priorMtimes != nil { pathFn = "reconcile" - if _, err := state.multiIndexer.ReconcileRepoCtx(ctx, entry, priorMtimes); err != nil { + res, err := state.multiIndexer.ReconcileRepoCtx(ctx, entry, priorMtimes) + switch { + case err != nil: logger.Warn("daemon: startup reconcile failed", zap.String("path", entry.Path), zap.Error(err)) + // Treat a failed reconcile as "changed" so the global + // passes still run — degrade toward correctness, not + // toward the fast path, when we can't trust the delta. + changedRepos.Add(1) + case res != nil && (res.StaleFileCount > 0 || res.DeletedFileCount > 0 || len(res.FailedFiles) > 0): + changedRepos.Add(1) + } + } else { + // No prior mtimes → full cold (re)index of this repo, + // which is "changed" by definition. + changedRepos.Add(1) + if _, err := state.multiIndexer.TrackRepoCtx(ctx, entry); err != nil { + logger.Warn("daemon: startup track failed", + zap.String("path", entry.Path), zap.Error(err)) } - } else if _, err := state.multiIndexer.TrackRepoCtx(ctx, entry); err != nil { - logger.Warn("daemon: startup track failed", - zap.String("path", entry.Path), zap.Error(err)) } elapsed := time.Since(repoStart) if elapsed > 2*time.Second { @@ -699,6 +800,7 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat zap.String("path_fn", pathFn), zap.Duration("elapsed", elapsed)) } + }(entry) } }() } @@ -715,19 +817,36 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat "elapsed_ms": time.Since(phaseStart).Milliseconds(), }) + // Warm-restart fast path. When the reconcile loop above re-indexed + // nothing, the persistent backend already carries every resolved and + // derived edge from the prior run; the deferred per-repo passes, the + // cross-repo resolve, and the graph-wide derivation passes would all + // just recompute what's on disk. Skipping them is what turns a warm + // restart from a multi-minute replay of the cold-warmup cost into a + // near-instant "open store, reconcile zero files, start watching". + // The in-memory backend reaches here too, but its snapshot replay + // already restored the derived edges, so the skip is equally safe. + anyChanged := changedRepos.Load() > 0 + logger.Info("daemon: warmup change detection", + zap.Int64("changed_repos", changedRepos.Load()), + zap.Int("tracked_repos", len(repos)), + zap.Bool("global_passes", anyChanged)) + // Drain deferred per-repo passes (ResolveAll / semantic enrich / // contract extract+commit) serially across the indexers the parallel // loop populated. Must run before RunGlobalResolve so cross-repo // resolution sees fully-lifted per-repo placeholder edges. - phaseStart = time.Now() - publishReadinessPhase(state, "deferred_passes_all", false, nil) - state.multiIndexer.RunDeferredPassesAll(ctx) - logger.Info("daemon: warmup phase done", - zap.String("phase", "deferred_passes_all"), - zap.Duration("elapsed", time.Since(phaseStart))) - publishReadinessPhase(state, "deferred_passes_all_done", false, map[string]any{ - "elapsed_ms": time.Since(phaseStart).Milliseconds(), - }) + if anyChanged { + phaseStart = time.Now() + publishReadinessPhase(state, "deferred_passes_all", false, nil) + state.multiIndexer.RunDeferredPassesAll(ctx) + logger.Info("daemon: warmup phase done", + zap.String("phase", "deferred_passes_all"), + zap.Duration("elapsed", time.Since(phaseStart))) + publishReadinessPhase(state, "deferred_passes_all_done", false, map[string]any{ + "elapsed_ms": time.Since(phaseStart).Milliseconds(), + }) + } // Rehydrate per-repo contract registries from the snapshot. Only // target indexers whose registry is still nil — a non-nil registry @@ -737,7 +856,7 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat // MergedContractRegistry skips them, so `contracts` returns only // the contracts of repos whose files happened to change since the // last shutdown. - if len(state.snapshotContracts) > 0 { + { phaseStart = time.Now() injectedRepos, injectedCount := 0, 0 for prefix := range state.multiIndexer.AllMetadata() { @@ -745,20 +864,32 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat if idx == nil || idx.ContractRegistry() != nil { continue } - cs, ok := state.snapshotContracts[prefix] - if !ok || len(cs) == 0 { - continue - } - reg := contracts.NewRegistry() - for _, c := range cs { - reg.Add(c) + // Primary path: rebuild the per-repo registry from + // KindContract nodes already in the backend's graph. + // The indexer stamps every contract record onto + // Node.Meta at commit time, so the graph is the + // authoritative source — no gob round-trip needed. + reg := contracts.LoadRegistryFromGraph(state.graph, prefix) + if reg == nil { + // Fallback to the legacy gob-snapshot path for + // daemons upgrading across this change. The + // snapshot copy is read-only by this point so the + // two sources can't drift mid-flight. + cs, ok := state.snapshotContracts[prefix] + if !ok || len(cs) == 0 { + continue + } + reg = contracts.NewRegistry() + for _, c := range cs { + reg.Add(c) + } } idx.SetContractRegistry(reg) injectedRepos++ - injectedCount += len(cs) + injectedCount += len(reg.All()) } if injectedRepos > 0 { - logger.Info("daemon: rehydrated contract registries from snapshot", + logger.Info("daemon: rehydrated contract registries from graph/snapshot", zap.Int("repos", injectedRepos), zap.Int("contracts", injectedCount), zap.Duration("elapsed", time.Since(phaseStart))) @@ -788,24 +919,33 @@ func warmupDaemonState(state *daemonState, logger *zap.Logger) *indexer.MultiWat // for a fresh-start daemon (where there's no snapshot to reconcile // against). After resolution, contract bridge edges may have // changed too, so ReconcileContractEdges runs again. - phaseStart = time.Now() - publishReadinessPhase(state, "global_resolve", false, nil) - state.multiIndexer.RunGlobalResolve() - logger.Info("daemon: warmup phase done", - zap.String("phase", "global_resolve"), - zap.Duration("elapsed", time.Since(phaseStart))) - publishReadinessPhase(state, "global_resolve_done", false, map[string]any{ - "elapsed_ms": time.Since(phaseStart).Milliseconds(), - }) + if anyChanged { + phaseStart = time.Now() + publishReadinessPhase(state, "global_resolve", false, nil) + state.multiIndexer.RunGlobalResolve() + logger.Info("daemon: warmup phase done", + zap.String("phase", "global_resolve"), + zap.Duration("elapsed", time.Since(phaseStart))) + publishReadinessPhase(state, "global_resolve_done", false, map[string]any{ + "elapsed_ms": time.Since(phaseStart).Milliseconds(), + }) + } // Finish the batch: turn off the per-repo skip flag and run the // graph-wide derivation passes once. RunGlobalResolve above just // lifted the last cross-repo placeholder EdgeCalls, so EdgeTests // derivation here picks up cross-repo test→subject pairs that - // were unresolved during the per-repo loop. + // were unresolved during the per-repo loop. On the warm-restart fast + // path (nothing changed) ResetBatch clears the deferred-batch flags + // without re-running those passes — the persisted graph already has + // the derived edges. phaseStart = time.Now() publishReadinessPhase(state, "end_batch", false, nil) - state.multiIndexer.EndBatch() + if anyChanged { + state.multiIndexer.EndBatch() + } else { + state.multiIndexer.ResetBatch() + } logger.Info("daemon: warmup phase done", zap.String("phase", "end_batch"), zap.Duration("elapsed", time.Since(phaseStart))) @@ -865,6 +1005,50 @@ func publishReadinessPhase(state *daemonState, phase string, ready bool, extra m state.mcpServer.PublishReadiness(phase, ready, extra) } +// priorMtimesFromStore asks the backend for its persisted FileMtime +// rows for the repo described by entry. Returns nil when the backend +// doesn't implement the reader (in-memory backend) or has no recorded +// mtimes for the repo (fresh cold start). When non-nil it short- +// circuits the gob-snapshot lookup so the warm path is driven by +// data the backend persisted itself. +func priorMtimesFromStore(g graph.Store, entry config.RepoEntry, logger *zap.Logger) map[string]int64 { + reader, ok := g.(graph.FileMtimeReader) + if !ok { + if logger != nil { + logger.Info("daemon: priorMtimesFromStore: store does not implement FileMtimeReader") + } + return nil + } + prefix := strings.TrimPrefix(config.ResolvePrefix(entry), "/") + if prefix == "" { + if logger != nil { + logger.Info("daemon: priorMtimesFromStore: empty prefix", + zap.String("entry_path", entry.Path), + zap.String("entry_name", entry.Name)) + } + return nil + } + mtimes := reader.LoadFileMtimes(prefix) + if logger != nil { + logger.Info("daemon: priorMtimesFromStore loaded", + zap.String("prefix", prefix), + zap.Int("count", len(mtimes))) + } + return mtimes +} + +// storeNeedsRebuild reports whether the backend signalled, via the optional +// NeedsRebuild capability, that a schema migration crossed a rung ALTER +// could not satisfy — so its persisted rows are in an old shape and the +// warm/incremental reconcile must be bypassed for a full re-index. Backends +// without the capability (the in-memory store) report false. See +// store_ladybug.(*Store).NeedsRebuild and the ladder in +// internal/graph/store_ladybug/migrate.go. +func storeNeedsRebuild(g any) bool { + rb, ok := g.(interface{ NeedsRebuild() bool }) + return ok && rb.NeedsRebuild() +} + // priorMtimesForEntry finds the snapshotted FileMtimes map for a // configured repo entry, matching on absolute RootPath. Falls back to // prefix-based lookup when no path match is found — useful if the diff --git a/cmd/gortex/enrich.go b/cmd/gortex/enrich.go index f2d1743c..cc2b0c20 100644 --- a/cmd/gortex/enrich.go +++ b/cmd/gortex/enrich.go @@ -2,8 +2,10 @@ package main import ( "encoding/json" + "errors" "fmt" "os" + "path/filepath" "github.com/spf13/cobra" @@ -11,6 +13,7 @@ import ( "github.com/zzet/gortex/internal/cochange" "github.com/zzet/gortex/internal/config" "github.com/zzet/gortex/internal/coverage" + "github.com/zzet/gortex/internal/daemon" "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/indexer" "github.com/zzet/gortex/internal/parser" @@ -34,6 +37,7 @@ var ( enrichBlameSnapshot string enrichCoverageSnapshot string enrichReleasesSnapshot string + enrichReleasesBranch string enrichCochangeSnapshot string enrichAllSnapshot string @@ -94,6 +98,8 @@ func init() { "write the enriched graph as a gob.gz snapshot to this path") enrichReleasesCmd.Flags().StringVar(&enrichReleasesSnapshot, "snapshot", "", "write the enriched graph as a gob.gz snapshot to this path") + enrichReleasesCmd.Flags().StringVar(&enrichReleasesBranch, "branch", "", + "restrict to tags reachable from this branch (default: resolve origin/main/master). Empty means every tag in the repo") enrichCochangeCmd.Flags().StringVar(&enrichCochangeSnapshot, "snapshot", "", "write the enriched graph as a gob.gz snapshot to this path") enrichAllCmd.Flags().StringVar(&enrichAllSnapshot, "snapshot", "", @@ -207,6 +213,17 @@ func runEnrichReleases(cmd *cobra.Command, args []string) error { if len(args) >= 1 { path = args[0] } + abs, err := filepath.Abs(path) + if err != nil { + return fmt.Errorf("abs path %q: %w", path, err) + } + + // Daemon path: forward to the running daemon so the enrichment + // runs against its in-process (and possibly LadyBug-backed) + // graph. Mirrors the churn CLI's behaviour. + if daemon.IsRunning() { + return forwardEnrichReleasesToDaemon(cmd, abs) + } cfg, err := config.Load(cfgFile) if err != nil { @@ -222,8 +239,16 @@ func runEnrichReleases(cmd *cobra.Command, args []string) error { return err } + branch := enrichReleasesBranch + if branch == "" { + branch = gitDefaultBranch(idx.RootPath()) + } + sp := newCLISpinner(cmd, "Stamping releases") - count, err := releases.EnrichGraph(g, idx.RootPath()) + if branch != "" { + sp.Set("", branch) + } + count, err := releases.EnrichGraphForBranch(g, idx.RootPath(), "", branch) if err != nil { sp.Fail(err) return fmt.Errorf("releases: %w", err) @@ -233,7 +258,9 @@ func runEnrichReleases(cmd *cobra.Command, args []string) error { result := map[string]any{ "enriched": count, + "branch": branch, "root": idx.RootPath(), + "mode": "standalone", } if enrichReleasesSnapshot != "" { if err := saveSnapshotTo(g, nil, nil, snapshotVector{}, "gortex-enrich-releases", enrichReleasesSnapshot, logger); err != nil { @@ -244,6 +271,49 @@ func runEnrichReleases(cmd *cobra.Command, args []string) error { return printEnrichResult(result) } +// forwardEnrichReleasesToDaemon sends a ControlEnrichReleases RPC +// and renders the response. Same shape as forwardEnrichChurnToDaemon. +func forwardEnrichReleasesToDaemon(cmd *cobra.Command, absPath string) error { + c, err := daemon.Dial(daemon.Handshake{Mode: daemon.ModeControl, ClientName: "cli-enrich-releases"}) + if err != nil { + if errors.Is(err, daemon.ErrDaemonUnavailable) { + return fmt.Errorf("daemon socket detected but dial failed; restart the daemon or run with no daemon (it falls back to in-memory)") + } + return fmt.Errorf("dial daemon: %w", err) + } + defer func() { _ = c.Close() }() + + resp, err := c.Control(daemon.ControlEnrichReleases, daemon.EnrichReleasesParams{ + Path: absPath, + Branch: enrichReleasesBranch, + }) + if err != nil { + return fmt.Errorf("control enrich_releases: %w", err) + } + if !resp.OK { + return fmt.Errorf("daemon rejected enrich_releases [%s]: %s", resp.ErrorCode, resp.ErrorMsg) + } + var out daemon.EnrichReleasesResult + if len(resp.Result) > 0 { + if err := json.Unmarshal(resp.Result, &out); err != nil { + return fmt.Errorf("parse daemon response: %w", err) + } + } + sp := newCLISpinner(cmd, "Enriched via daemon") + sp.Set("", fmt.Sprintf("%d files · %s", out.Files, out.Branch)) + sp.Done() + payload := map[string]any{ + "enriched": out.Files, + "branch": out.Branch, + "duration_ms": out.DurationMS, + "mode": "daemon", + } + if absPath != "" { + payload["path"] = absPath + } + return printEnrichResult(payload) +} + func runEnrichCochange(cmd *cobra.Command, args []string) error { logger := newLogger() defer func() { _ = logger.Sync() }() diff --git a/cmd/gortex/enrich_churn.go b/cmd/gortex/enrich_churn.go new file mode 100644 index 00000000..a77b4dcd --- /dev/null +++ b/cmd/gortex/enrich_churn.go @@ -0,0 +1,180 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "path/filepath" + "time" + + "github.com/spf13/cobra" + + "github.com/zzet/gortex/internal/churn" + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/daemon" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/indexer" + "github.com/zzet/gortex/internal/parser" + "github.com/zzet/gortex/internal/parser/languages" +) + +var ( + enrichChurnBranch string + enrichChurnSnapshot string +) + +var enrichChurnCmd = &cobra.Command{ + Use: "churn [path]", + Short: "Pre-compute per-symbol git churn from a fixed branch (default: origin/main)", + Long: `Walks the indexed repo and stamps meta.churn on every file and +function/method with the commit_count / age_days / churn_rate / +last_author / last_commit_at metrics the get_churn_rate MCP tool reads. + +The signal is computed against a single branch — typically the +repository's default branch — so feature-branch work-in-progress +doesn't pollute the persisted data. Pass --branch to override. + +When a daemon is running on the default socket, this command sends a +control RPC and the daemon does the enrichment against its in-process +graph (avoiding the LadyBug write-lock collision a direct write would +cause). Without a daemon, the command falls back to a one-shot in- +memory pass that can be persisted with --snapshot.`, + Args: cobra.MaximumNArgs(1), + RunE: runEnrichChurn, +} + +func init() { + enrichChurnCmd.Flags().StringVar(&enrichChurnBranch, "branch", "", + "branch / tag / SHA to compute churn against (default: origin/main, falls back to local main/master)") + enrichChurnCmd.Flags().StringVar(&enrichChurnSnapshot, "snapshot", "", + "when no daemon is running, write the enriched in-memory graph as a gob.gz snapshot to this path") + enrichCmd.AddCommand(enrichChurnCmd) +} + +func runEnrichChurn(cmd *cobra.Command, args []string) error { + logger := newLogger() + defer func() { _ = logger.Sync() }() + + path := "." + if len(args) >= 1 { + path = args[0] + } + abs, err := filepath.Abs(path) + if err != nil { + return fmt.Errorf("abs path %q: %w", path, err) + } + + // Daemon path: forward to the running daemon so the enrichment + // runs against its in-process (and possibly LadyBug-backed) + // graph. The daemon already owns the write lock; routing + // through it sidesteps the "can't open the same LadyBug + // directory twice" failure mode. + if daemon.IsRunning() { + return forwardEnrichChurnToDaemon(cmd, abs) + } + + // Standalone path: index in-memory, enrich, optionally snapshot. + // Useful in CI where no daemon is around and the caller wants a + // snapshot artefact. + cfg, err := config.Load(cfgFile) + if err != nil { + return err + } + + g := graph.New() + reg := parser.NewRegistry() + languages.RegisterAll(reg) + idx := indexer.New(g, reg, cfg.Index, loggerForSpinner(cmd, logger)) + + if err := indexWithSpinner(cmd, idx, path); err != nil { + return err + } + + branch := enrichChurnBranch + if branch == "" { + branch = gitDefaultBranch(idx.RootPath()) + } + if branch == "" { + return fmt.Errorf("could not resolve default branch in %s; pass --branch ", idx.RootPath()) + } + + sp := newCLISpinner(cmd, "Stamping churn") + sp.Set("", branch) + started := time.Now() + res, err := churn.EnrichGraph(context.Background(), g, idx.RootPath(), churn.Options{Branch: branch}) + if err != nil { + sp.Fail(err) + return fmt.Errorf("churn: %w", err) + } + sp.Set("", fmt.Sprintf("%d files · %d symbols", res.Files, res.Symbols)) + sp.Done() + + result := map[string]any{ + "files": res.Files, + "symbols": res.Symbols, + "branch": res.Branch, + "head_sha": res.HeadSHA, + "duration_ms": time.Since(started).Milliseconds(), + "root": idx.RootPath(), + "mode": "standalone", + } + if enrichChurnSnapshot != "" { + if err := saveSnapshotTo(g, nil, nil, snapshotVector{}, "gortex-enrich-churn", enrichChurnSnapshot, logger); err != nil { + return fmt.Errorf("write snapshot %s: %w", enrichChurnSnapshot, err) + } + result["snapshot"] = enrichChurnSnapshot + } + return printEnrichResult(result) +} + +// forwardEnrichChurnToDaemon sends a ControlEnrichChurn RPC to the +// running daemon and renders the response. Returns a clear error if +// the daemon rejects the request — including the case where the +// caller's path doesn't match any tracked repo. +func forwardEnrichChurnToDaemon(cmd *cobra.Command, absPath string) error { + c, err := daemon.Dial(daemon.Handshake{Mode: daemon.ModeControl, ClientName: "cli-enrich-churn"}) + if err != nil { + if errors.Is(err, daemon.ErrDaemonUnavailable) { + return fmt.Errorf("daemon socket detected but dial failed; restart the daemon or run with no daemon (it falls back to in-memory)") + } + return fmt.Errorf("dial daemon: %w", err) + } + defer func() { _ = c.Close() }() + + resp, err := c.Control(daemon.ControlEnrichChurn, daemon.EnrichChurnParams{ + Path: absPath, + Branch: enrichChurnBranch, + }) + if err != nil { + return fmt.Errorf("control enrich_churn: %w", err) + } + if !resp.OK { + return fmt.Errorf("daemon rejected enrich_churn [%s]: %s", resp.ErrorCode, resp.ErrorMsg) + } + + var out daemon.EnrichChurnResult + if len(resp.Result) > 0 { + if err := json.Unmarshal(resp.Result, &out); err != nil { + return fmt.Errorf("parse daemon response: %w", err) + } + } + sp := newCLISpinner(cmd, "Enriched via daemon") + sp.Set("", fmt.Sprintf("%d files · %d symbols · %s", out.Files, out.Symbols, out.Branch)) + sp.Done() + payload := map[string]any{ + "files": out.Files, + "symbols": out.Symbols, + "branch": out.Branch, + "head_sha": out.HeadSHA, + "duration_ms": out.DurationMS, + "mode": "daemon", + } + if absPath != "" { + payload["path"] = absPath + } + // printEnrichResult reads payload["root"] for the TTY caption; the + // daemon spans every tracked repo so there is no single root — leave + // it unset and the caption stays silent. + return printEnrichResult(payload) +} diff --git a/cmd/gortex/git.go b/cmd/gortex/git.go index 3cebfc41..f0ff7405 100644 --- a/cmd/gortex/git.go +++ b/cmd/gortex/git.go @@ -5,6 +5,7 @@ import ( "os/exec" "strings" + "github.com/zzet/gortex/internal/churn" "github.com/zzet/gortex/internal/indexer" ) @@ -50,3 +51,12 @@ func gitBranch(dir string) string { func canonicalRepo(dir string) string { return indexer.ResolveWorktree(dir).MainRepoPath } + +// gitDefaultBranch returns the repository's default branch as a +// rev-parseable reference. Thin wrapper over churn.DefaultBranch so +// the CLI, daemon controller, and MCP tool resolve the same branch +// the same way. +func gitDefaultBranch(dir string) string { + return churn.DefaultBranch(dir) +} + diff --git a/cmd/gortex/githook.go b/cmd/gortex/githook.go index 58531dca..26ba9da1 100644 --- a/cmd/gortex/githook.go +++ b/cmd/gortex/githook.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "github.com/spf13/cobra" @@ -11,20 +12,24 @@ import ( ) var ( - githookRegenMermaid bool - githookRegenWiki bool - githookRegenDocs bool - githookMermaidOutDir string - githookWikiOutDir string - githookDocsOutPath string - githookBinary string + githookRegenMermaid bool + githookRegenWiki bool + githookRegenDocs bool + githookRegenChurn bool + githookRegenReleases bool + githookMermaidOutDir string + githookWikiOutDir string + githookDocsOutPath string + githookChurnBranch string + githookReleasesBranch string + githookBinary string ) var githookCmd = &cobra.Command{ Use: "githook", Short: "Manage local git hooks that regenerate gortex artefacts", - Long: `Install, uninstall, and inspect the post-commit hook that re-runs -gortex commands after each commit. + Long: `Install, uninstall, and inspect git hooks that re-run gortex +commands. Supported hooks: post-commit, post-merge. The hook is idempotent: re-running install replaces only the gortex block, leaving any other hook content intact. Uninstall removes the @@ -33,7 +38,7 @@ block and deletes the hook file when it contains nothing else.`, var githookInstallCmd = &cobra.Command{ Use: "install ", - Short: "Install a git hook (currently: post-commit)", + Short: "Install a git hook (post-commit or post-merge)", Args: cobra.ExactArgs(1), RunE: runGithookInstall, } @@ -46,8 +51,9 @@ var githookUninstallCmd = &cobra.Command{ } var githookStatusCmd = &cobra.Command{ - Use: "status", - Short: "Report whether the post-commit hook is gortex-managed", + Use: "status [hook]", + Short: "Report whether the named hook is gortex-managed (default: post-commit)", + Args: cobra.MaximumNArgs(1), RunE: runGithookStatus, } @@ -58,6 +64,14 @@ func init() { "include `gortex wiki .` in the hook") githookInstallCmd.Flags().BoolVar(&githookRegenDocs, "regen-docs", false, "include `gortex docs . --out CHANGELOG_AUTO.md` in the hook") + githookInstallCmd.Flags().BoolVar(&githookRegenChurn, "regen-churn", false, + "include `gortex enrich churn` so get_churn_rate stays fresh without an at-read-time git subprocess") + githookInstallCmd.Flags().StringVar(&githookChurnBranch, "churn-branch", "", + "branch / tag / SHA the churn enricher pins to (default: resolve at hook run-time)") + githookInstallCmd.Flags().BoolVar(&githookRegenReleases, "regen-releases", false, + "include `gortex enrich releases` so analyze kind=releases reads pre-computed Meta") + githookInstallCmd.Flags().StringVar(&githookReleasesBranch, "releases-branch", "", + "branch / tag / SHA the releases enricher restricts to (default: resolve at hook run-time)") githookInstallCmd.Flags().StringVar(&githookMermaidOutDir, "mermaid-out-dir", "docs/architecture/", "output directory for mermaid diagrams") githookInstallCmd.Flags().StringVar(&githookWikiOutDir, "wiki-out-dir", "wiki", @@ -73,48 +87,62 @@ func init() { rootCmd.AddCommand(githookCmd) } +// supportedHook validates the hook arg. We mirror the package-level +// SupportedHooks list rather than importing it so the CLI surface +// stays decoupled from the install package's internals. +func supportedHook(name string) error { + if name == "post-commit" || name == "post-merge" { + return nil + } + return fmt.Errorf("unsupported hook %q (supported: post-commit, post-merge)", name) +} + func runGithookInstall(cmd *cobra.Command, args []string) error { hook := args[0] - if hook != "post-commit" { - return fmt.Errorf("only the post-commit hook is supported (got %q)", hook) + if err := supportedHook(hook); err != nil { + return err } repoRoot, err := resolveGithookRepoRoot() if err != nil { return err } - if !githookRegenMermaid && !githookRegenWiki && !githookRegenDocs { + if !githookRegenMermaid && !githookRegenWiki && !githookRegenDocs && !githookRegenChurn && !githookRegenReleases { // Default to mermaid when nothing was chosen — minimum // useful behaviour. githookRegenMermaid = true } - path, err := githooks.InstallPostCommit(repoRoot, githooks.InstallOpts{ - Binary: githookBinary, - RegenMermaid: githookRegenMermaid, - RegenWiki: githookRegenWiki, - RegenDocs: githookRegenDocs, - MermaidOutDir: githookMermaidOutDir, - WikiOutDir: githookWikiOutDir, - DocsOutPath: githookDocsOutPath, + path, err := githooks.InstallHook(repoRoot, hook, githooks.InstallOpts{ + Binary: githookBinary, + RegenMermaid: githookRegenMermaid, + RegenWiki: githookRegenWiki, + RegenDocs: githookRegenDocs, + RegenChurn: githookRegenChurn, + ChurnBranch: githookChurnBranch, + RegenReleases: githookRegenReleases, + ReleasesBranch: githookReleasesBranch, + MermaidOutDir: githookMermaidOutDir, + WikiOutDir: githookWikiOutDir, + DocsOutPath: githookDocsOutPath, }) if err != nil { return err } _, _ = fmt.Fprintf(cmd.OutOrStdout(), - "installed post-commit hook at %s\nactions: mermaid=%t wiki=%t docs=%t\n", - path, githookRegenMermaid, githookRegenWiki, githookRegenDocs) + "installed %s hook at %s\nactions: mermaid=%t wiki=%t docs=%t churn=%t releases=%t\n", + hook, path, githookRegenMermaid, githookRegenWiki, githookRegenDocs, githookRegenChurn, githookRegenReleases) return nil } func runGithookUninstall(cmd *cobra.Command, args []string) error { hook := args[0] - if hook != "post-commit" { - return fmt.Errorf("only the post-commit hook is supported (got %q)", hook) + if err := supportedHook(hook); err != nil { + return err } repoRoot, err := resolveGithookRepoRoot() if err != nil { return err } - path, removed, err := githooks.UninstallPostCommit(repoRoot) + path, removed, err := githooks.UninstallHook(repoRoot, hook) if err != nil { return err } @@ -126,19 +154,40 @@ func runGithookUninstall(cmd *cobra.Command, args []string) error { return nil } -func runGithookStatus(cmd *cobra.Command, _ []string) error { +func runGithookStatus(cmd *cobra.Command, args []string) error { + hook := "post-commit" + if len(args) > 0 { + if err := supportedHook(args[0]); err != nil { + return err + } + hook = args[0] + } repoRoot, err := resolveGithookRepoRoot() if err != nil { return err } - rep, err := githooks.Status(repoRoot) + hookPath, err := githooks.HookPathFor(repoRoot, hook) if err != nil { return err } + // Read directly; Status() is post-commit-locked and we want per-hook + // detail. Mirrors Status() but parameterised on hook. + body, ferr := os.ReadFile(hookPath) + exists := ferr == nil + managed := false + if exists { + bs := string(body) + begin := "# gortex-managed:" + hook + ":begin" + end := "# gortex-managed:" + hook + ":end" + if strings.Contains(bs, begin) && strings.Contains(bs, end) { + managed = true + } + } out := cmd.OutOrStdout() - _, _ = fmt.Fprintf(out, "hook_path: %s\n", rep.HookPath) - _, _ = fmt.Fprintf(out, "exists: %t\n", rep.Exists) - _, _ = fmt.Fprintf(out, "managed: %t\n", rep.Managed) + _, _ = fmt.Fprintf(out, "hook: %s\n", hook) + _, _ = fmt.Fprintf(out, "hook_path: %s\n", hookPath) + _, _ = fmt.Fprintf(out, "exists: %t\n", exists) + _, _ = fmt.Fprintf(out, "managed: %t\n", managed) return nil } diff --git a/cmd/gortex/server.go b/cmd/gortex/server.go index 2ca2cdb1..d12fead7 100644 --- a/cmd/gortex/server.go +++ b/cmd/gortex/server.go @@ -1,6 +1,7 @@ package main import ( + "context" "fmt" "net" "net/http" @@ -14,9 +15,9 @@ import ( "strings" "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/progress" "github.com/zzet/gortex/internal/contracts" "github.com/zzet/gortex/internal/daemon" - "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/indexer" gortexmcp "github.com/zzet/gortex/internal/mcp" "github.com/zzet/gortex/internal/mcp/streamable" @@ -66,7 +67,10 @@ var ( // the in-memory graph before the HTTP listener accepts traffic. // Used by gortex-cloud's per-workspace supervisor to boot a // hosted gortex server from R2/Hetzner-OS-cached state. - serverSnapshot string + serverSnapshot string + serverBackend string + serverBackendPath string + serverBackendBufferPoolMB uint64 ) var serverCmd = &cobra.Command{ @@ -96,6 +100,10 @@ func init() { serverCmd.Flags().BoolVar(&serverNoSemantic, "no-semantic", false, "disable semantic enrichment") serverCmd.Flags().StringVar(&serverSemanticMode, "semantic-mode", "typecheck", "Go analysis mode: typecheck or callgraph") serverCmd.Flags().StringVar(&serverSnapshot, "snapshot", "", "load a snapshot file at startup (gob+gzip; the format `gortex index --snapshot` writes). Used by gortex-cloud's per-workspace supervisor to boot from a precomputed snapshot.") + serverCmd.Flags().StringVar(&serverBackend, "backend", "memory", "storage backend: memory (in-process, default — fastest, no persistence) | ladybug (embedded Cypher graph DB — persists to --backend-path, slower per-op but cold-loads from disk)") + serverCmd.Flags().Uint64Var(&serverBackendBufferPoolMB, "backend-buffer-pool-mb", 0, + "page-cache cap for the on-disk backend in MiB. 0 falls back to 4096 (4 GiB); only consulted for --backend=ladybug") + serverCmd.Flags().StringVar(&serverBackendPath, "backend-path", "", "directory where the on-disk backend persists its store. Required when --backend != memory. Default: ~/.gortex/.store") rootCmd.AddCommand(serverCmd) } @@ -137,7 +145,11 @@ func runServer(cmd *cobra.Command, _ []string) error { } // Build graph/parser/indexer/query/MCP stack. - g := graph.New() + g, backendCleanup, err := openBackend(serverBackend, serverBackendPath, serverBackendBufferPoolMB, logger) + if err != nil { + return fmt.Errorf("opening backend %q: %w", serverBackend, err) + } + defer backendCleanup() reg := parser.NewRegistry() languages.RegisterAll(reg) languages.RegisterCustomGrammars(reg, cfg.Index.Grammars, logger) @@ -321,7 +333,7 @@ func runServer(cmd *cobra.Command, _ []string) error { } // Multi-repo support. - cm, err := config.NewConfigManager("") + cm, err := config.NewConfigManager(cfgFile) if err != nil { fmt.Fprintf(os.Stderr, "[gortex] warning: could not load global config: %v\n", err) } @@ -415,11 +427,24 @@ func runServer(cmd *cobra.Command, _ []string) error { srv.SetLSPDiagnosticsBroadcasting() } - // Create persistence store. + // Create persistence store. The snapshot cache exists for the + // in-memory backend, where heap state is lost on restart — load + // from snapshot skips the parse phase on a warm restart. For + // the ladybug on-disk backend the store IS already persistent + // across restarts: re-opening the same path hands back the + // previous run's graph in milliseconds, and replaying a snapshot + // via per-row g.AddNode would just re-write everything we already + // have at glacial per-row Cypher speed. Skip the cache entirely + // on those backends. var store persistence.Store - if serverNoCache { + persistentBackend := !strings.EqualFold(strings.TrimSpace(serverBackend), "memory") && strings.TrimSpace(serverBackend) != "" + switch { + case serverNoCache: store = persistence.NopStore{} - } else { + case persistentBackend: + fmt.Fprintf(os.Stderr, "[gortex] server: snapshot cache disabled (backend=%s persists across restarts)\n", serverBackend) + store = persistence.NopStore{} + default: var err error store, err = persistence.NewFileStore(serverCacheDir, version) if err != nil { @@ -587,9 +612,35 @@ func runServer(cmd *cobra.Command, _ []string) error { // Background: index, multi-repo, analyze — graph populates while HTTP is live. go func() { - // When MultiIndexer is available (global config has repos), use it exclusively. - // Single --index flag is only used when no multi-repo config exists. - if mi != nil { + // Live progress logging — the daemon runs without a TTY so + // the Spinner reporter is silent. Hook a zap-logging reporter + // + a graph-size heartbeat so the log shows what's happening. + hbCtx, hbCancel := context.WithCancel(context.Background()) + defer hbCancel() + progress.StartHeartbeat(hbCtx, logger, "indexing", 5*time.Second, func() map[string]any { + // idx.Graph() follows the indexer's active store — + // during cold-start the indexer swaps to an in-memory + // shadow, so reading via idx.Graph() shows the live + // growing count. g.NodeCount() would always read the + // disk store and stay at 0 until FlushBulk drains. + cur := idx.Graph() + if cur == nil { + cur = g + } + return map[string]any{ + "nodes": cur.NodeCount(), + "edges": cur.EdgeCount(), + "disk_nodes": g.NodeCount(), + "disk_edges": g.EdgeCount(), + } + }) + // When the active config has repos AND no explicit --index was + // requested, use MultiIndexer (it handles the per-repo flow). + // When --index is set the user wants single-repo behaviour, + // even when a multi-repo config exists — bypass MultiIndexer. + hasActiveRepos := cm != nil && len(cm.ActiveRepos()) > 0 + useMulti := mi != nil && hasActiveRepos && serverIndex == "" + if useMulti { if serverWorkspace != "" || serverScopeProject != "" { fmt.Fprintf(os.Stderr, "[gortex] server: multi-repo indexing (scope: workspace=%q project=%q)...\n", serverWorkspace, serverScopeProject) } else { diff --git a/go.mod b/go.mod index 7436767c..7c82c40c 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module github.com/zzet/gortex go 1.26.2 require ( + github.com/LadybugDB/go-ladybug v0.13.1 github.com/alexaandru/go-sitter-forest/ada v1.9.0 github.com/alexaandru/go-sitter-forest/agda v1.9.0 github.com/alexaandru/go-sitter-forest/aiken v1.9.0 @@ -344,6 +345,7 @@ require ( github.com/sagikazarmark/locafero v0.12.0 // indirect github.com/sahilm/fuzzy v0.1.2 // indirect github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 // indirect + github.com/shopspring/decimal v1.4.0 // indirect github.com/spf13/afero v1.15.0 // indirect github.com/spf13/cast v1.10.0 // indirect github.com/spf13/pflag v1.0.10 // indirect @@ -354,7 +356,8 @@ require ( github.com/x448/float16 v0.8.4 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/yosida95/uritemplate/v3 v3.0.2 // indirect - go.etcd.io/bbolt v1.4.3 // indirect + github.com/zeebo/assert v1.3.0 // indirect + go.etcd.io/bbolt v1.4.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.52.0 // indirect @@ -380,3 +383,10 @@ replace github.com/mattn/go-pointer => ./internal/thirdparty/go-pointer // blocked the Windows build because github.com/coder/hnsw imports it // unconditionally. See internal/thirdparty/renameio. replace github.com/google/renameio => ./internal/thirdparty/renameio + +// Vendored copy of github.com/LadybugDB/go-ladybug v0.13.1 with a +// missing lbug_value_destroy added to FlatTuple.GetValue. Upstream +// leaks one C-side allocation per column of every materialised row; +// observed as 15.8GB / 211M allocations in the DefaultMallocZone on +// a daemon after warmup + 27 tool calls. See internal/thirdparty/go-ladybug. +replace github.com/LadybugDB/go-ladybug => ./internal/thirdparty/go-ladybug diff --git a/go.sum b/go.sum index df168aa3..74e5ad46 100644 --- a/go.sum +++ b/go.sum @@ -448,8 +448,6 @@ github.com/blevesearch/bleve_index_api v1.3.11 h1:x29vbV8OjWfLcrDVd7Lr1q+BkLNS0J github.com/blevesearch/bleve_index_api v1.3.11/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko= github.com/blevesearch/geo v0.2.5 h1:yJg9FX1oRwLnjXSXF+ECHfXFTF4diF02Ca/qUGVjJhE= github.com/blevesearch/geo v0.2.5/go.mod h1:Jhq7WE2K6mJTx1xS44M2pUO6Io+wjCSHh1+co3YOgH4= -github.com/blevesearch/go-faiss v1.1.1 h1:oUignystYUkdYBrVh6PkTkBlfCNql2QcS+fc0fTjtVQ= -github.com/blevesearch/go-faiss v1.1.1/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/go-faiss v1.1.2 h1:ojv2S7ot3orbk8wMfJWryq37G4eIL8Y8PLLZYd8ZLHY= github.com/blevesearch/go-faiss v1.1.2/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= @@ -500,8 +498,6 @@ github.com/charmbracelet/x/exp/golden v0.0.0-20241011142426-46044092ad91 h1:payR github.com/charmbracelet/x/exp/golden v0.0.0-20241011142426-46044092ad91/go.mod h1:wDlXFlCrmJ8J+swcL/MnGUuYnqgQdW9rhSD61oNMb6U= github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk= github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI= -github.com/chewxy/math32 v1.11.1 h1:b7PGHlp8KjylDoU8RrcEsRuGZhJuz8haxnKfuMMRqy8= -github.com/chewxy/math32 v1.11.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= github.com/chewxy/math32 v1.11.2 h1:IufN08Zwr1NKuWfY+4Tz55BcwKmyKKNdOP7KtumehnM= github.com/chewxy/math32 v1.11.2/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= github.com/clipperhouse/displaywidth v0.11.0 h1:lBc6kY44VFw+TDx4I8opi/EtL9m20WSEFgwIwO+UVM8= @@ -586,12 +582,8 @@ github.com/jedib0t/go-pretty/v6 v6.7.10 h1:B/2qW2Bkv2L6n14PP8o1kx75kWzHOQ3YTluWz github.com/jedib0t/go-pretty/v6 v6.7.10/go.mod h1:YwC5CE4fJ1HFUDeivSV1r//AmANFHyqczZk+U6BDALU= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= -github.com/klauspost/cpuid/v2 v2.0.12 h1:p9dKCg8i4gmOxtv35DvrYoWqYzQrvEVdjQ762Y0OqZE= -github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c= github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= -github.com/knights-analytics/hugot v0.7.2 h1:zDXXAa7c1d4VOcKbqiIVvkLLpzeqjc9K8BApnAQKcVc= -github.com/knights-analytics/hugot v0.7.2/go.mod h1:BQ9lXqBv6g0ykhpDfyxJ8I7/is+GxLl15JKPKBvrVAQ= github.com/knights-analytics/hugot v0.7.3 h1:39UqU52s4nAmNIE4JG5ViASCvd8dhue7XGtt5RhK3T4= github.com/knights-analytics/hugot v0.7.3/go.mod h1:86tRz/GzyoNFHuUUzgiYnALQNZU8Vzd5F0pApYizwrs= github.com/knights-analytics/ortgenai v0.3.1 h1:0Awe43Zu+giDxzlpoNvx9ekbez/zxc8XMzKU++sOUB8= @@ -653,10 +645,10 @@ github.com/santhosh-tekuri/jsonschema/v6 v6.0.2 h1:KRzFb2m7YtdldCEkzs6KqmJw4nqEV github.com/santhosh-tekuri/jsonschema/v6 v6.0.2/go.mod h1:JXeL+ps8p7/KNMjDQk3TCwPpBy0wYklyWTfbkIzdIFU= github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc= github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= -github.com/sgtdi/fswatcher v1.2.0 h1:uSJuMc3/Eo/vaPnZWpJ42EFYb5j38cZENmkszOV0yhw= -github.com/sgtdi/fswatcher v1.2.0/go.mod h1:smzXnaqu0SYJQNIwGLLkvRkpH4RdEACB7avMSsSaqjQ= github.com/sgtdi/fswatcher v1.3.0 h1:2tFEnBml5EipRF4TvUP0x+T4ty2OSYlmvcnQ6dSTp04= github.com/sgtdi/fswatcher v1.3.0/go.mod h1:I4FUeG0e27WFw+ogs5OjZSgPKobnGrUa17EwjRjZQaY= +github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= +github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= github.com/spf13/afero v1.15.0/go.mod h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg= github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY= @@ -739,14 +731,14 @@ github.com/yalue/onnxruntime_go v1.30.1 h1:NaEng5lWbsHZ/8X1dtaw1mIj7eV1ozyjbFo// github.com/yalue/onnxruntime_go v1.30.1/go.mod h1:b4X26A8pekNb1ACJ58wAXgNKeUCGEAQ9dmACut9Sm/4= github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4= -github.com/zeebo/assert v1.1.0 h1:hU1L1vLTHsnO8x8c9KAR5GmM5QscxHg5RNU5z5qbUWY= -github.com/zeebo/assert v1.1.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= +github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= github.com/zeebo/blake3 v0.2.4 h1:KYQPkhpRtcqh0ssGYcKLG1JYvddkEA8QwCM/yBqhaZI= github.com/zeebo/blake3 v0.2.4/go.mod h1:7eeQ6d2iXWRGF6npfaxl2CU+xy2Fjo2gxeyZGCRUjcE= github.com/zeebo/pcg v1.0.1 h1:lyqfGeWiv4ahac6ttHs+I5hwtH/+1mrhlCtVNQM2kHo= github.com/zeebo/pcg v1.0.1/go.mod h1:09F0S9iiKrwn9rlI5yjLkmrug154/YRW6KnnXVDM/l4= -go.etcd.io/bbolt v1.4.3 h1:dEadXpI6G79deX5prL3QRNP6JB8UxVkqo4UPnHaNXJo= -go.etcd.io/bbolt v1.4.3/go.mod h1:tKQlpPaYCVFctUIgFKFnAlvbmB3tpy1vkTnDWohtc0E= +go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= +go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -755,14 +747,10 @@ go.uber.org/zap v1.28.0 h1:IZzaP1Fv73/T/pBMLk4VutPl36uNC+OSUh3JLG3FIjo= go.uber.org/zap v1.28.0/go.mod h1:rDLpOi171uODNm/mxFcuYWxDsqWSAVkFdX4XojSKg/Q= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= -golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= -golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988= golang.org/x/crypto v0.52.0/go.mod h1:1QgfPxDqh0T2M/elOJtp9RvuR95kVjir0e6/BvEmGbc= golang.org/x/exp v0.0.0-20260508232706-74f9aab9d74a h1:+3jdDGGB8NGb1Zktc737jlt3/A5f6UlwSzmvqUuufxw= golang.org/x/exp v0.0.0-20260508232706-74f9aab9d74a/go.mod h1:d2fgXJLVs4dYDHUk5lwMIfzRzSrWCfGZb0ZqeLa/Vcw= -golang.org/x/image v0.40.0 h1:Tw4GyDXMo+daZN1znreBRC3VayR1aLFUyUEOLUdW1a8= -golang.org/x/image v0.40.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA= golang.org/x/image v0.41.0 h1:8wS72eGJMJaBxK6okTzd4WaXumUlTVlb753MlsSvTCo= golang.org/x/image v0.41.0/go.mod h1:uIc348UZMSvS5Z65CVZ7iDPaNobNFEPeJ4kbqTOszmA= golang.org/x/mod v0.36.0 h1:JJjpVx6myfUsUdAzZuOSTTmRE0PfZeNWzzvKrP7amb4= @@ -770,8 +758,6 @@ golang.org/x/mod v0.36.0/go.mod h1:moc6ELqsWcOw5Ef3xVprK5ul/MvtVvkIXLziUOICjUQ= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= -golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= diff --git a/internal/analysis/analysis_test.go b/internal/analysis/analysis_test.go index 7fffe5c8..9d648ac0 100644 --- a/internal/analysis/analysis_test.go +++ b/internal/analysis/analysis_test.go @@ -146,11 +146,11 @@ func TestAnalyzeImpact_DropsHeuristicNoiseAtTransitiveDepths(t *testing.T) { } func TestAnalyzeImpact_RiskLevels(t *testing.T) { - assert.Equal(t, RiskLow, assessRisk(0, 0, 0)) - assert.Equal(t, RiskLow, assessRisk(1, 1, 0)) - assert.Equal(t, RiskMedium, assessRisk(2, 3, 0)) - assert.Equal(t, RiskHigh, assessRisk(5, 5, 0)) - assert.Equal(t, RiskCritical, assessRisk(10, 10, 0)) + assert.Equal(t, RiskLow, assessRisk(0, 0)) + assert.Equal(t, RiskLow, assessRisk(1, 1)) + assert.Equal(t, RiskMedium, assessRisk(2, 3)) + assert.Equal(t, RiskHigh, assessRisk(5, 5)) + assert.Equal(t, RiskCritical, assessRisk(10, 10)) } func TestScoreEntryPoint(t *testing.T) { diff --git a/internal/analysis/architecture.go b/internal/analysis/architecture.go index d4beb662..0f2010e5 100644 --- a/internal/analysis/architecture.go +++ b/internal/analysis/architecture.go @@ -19,7 +19,7 @@ import ( // reports a violation when a cross-layer dependency breaks the source // layer's allow/deny rules. Symbols in no declared layer, and edges // to such symbols, are unconstrained. -func EvaluateArchitecture(g *graph.Graph, arch config.ArchitectureConfig, changedSymbolIDs []string) []GuardViolation { +func EvaluateArchitecture(g graph.Store, arch config.ArchitectureConfig, changedSymbolIDs []string) []GuardViolation { if g == nil || arch.IsEmpty() { return nil } @@ -76,7 +76,7 @@ func EvaluateArchitecture(g *graph.Graph, arch config.ArchitectureConfig, change // evaluateArchRules checks the per-layer / per-pattern dependency-cone // rules — fan-out caps and caller-boundary restrictions — for a set // of changed symbols. -func evaluateArchRules(g *graph.Graph, arch config.ArchitectureConfig, changedSymbolIDs, layerNames []string) []GuardViolation { +func evaluateArchRules(g graph.Store, arch config.ArchitectureConfig, changedSymbolIDs, layerNames []string) []GuardViolation { if len(arch.Rules) == 0 { return nil } @@ -169,7 +169,7 @@ func callerWithinBoundary(callerPath string, rule config.ArchRule, callerLayer s // distinctCallTargets counts the distinct symbols a node calls or // references — the dependency-cone size. -func distinctCallTargets(g *graph.Graph, id string) int { +func distinctCallTargets(g graph.Store, id string) int { seen := make(map[string]bool) for _, e := range g.GetOutEdges(id) { if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { diff --git a/internal/analysis/betweenness.go b/internal/analysis/betweenness.go index c07d207c..352c038b 100644 --- a/internal/analysis/betweenness.go +++ b/internal/analysis/betweenness.go @@ -72,33 +72,77 @@ const ( // // Pivot sampling is seeded with a fixed seed, so results are // reproducible run to run. -func ComputeBetweenness(g *graph.Graph) *BetweennessResult { +func ComputeBetweenness(g graph.Store) *BetweennessResult { if g == nil { return &BetweennessResult{Scores: map[string]float64{}} } - nodes := g.AllNodes() - n := len(nodes) + // Betweenness measures shortest-path centrality across the + // call / reference subgraph; only function and method nodes carry + // those edges. The scoring kernel only ever touches node IDs, so + // the unfiltered AllNodes() pull was wasted on the other 90% of + // the node table AND on the 9 unused columns of every retained + // row. NodeIDsByKinds returns just the id column from a single + // Cypher query; NodesByKindsScanner is the legacy fallback for + // backends that haven't shipped the id projection yet. + betweennessKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + bcNodeKinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod} + var ids []string + if scan, ok := g.(graph.NodeIDsByKinds); ok { + ids = scan.NodeIDsByKinds(bcNodeKinds) + } else if scan, ok := g.(graph.NodesByKindsScanner); ok { + ns := scan.NodesByKinds(bcNodeKinds) + ids = make([]string, 0, len(ns)) + for _, nd := range ns { + ids = append(ids, nd.ID) + } + } else { + all := g.AllNodes() + ids = make([]string, 0, len(all)) + for _, nd := range all { + if nd.Kind == graph.KindFunction || nd.Kind == graph.KindMethod { + ids = append(ids, nd.ID) + } + } + } + n := len(ids) if n == 0 { return &BetweennessResult{Scores: map[string]float64{}} } // Stable node ordering: betweenness itself is order-independent, // but a deterministic order makes the sampled pivot pick - // reproducible regardless of the map-iteration order AllNodes - // happens to return. - ids := make([]string, n) - for i, nd := range nodes { - ids[i] = nd.ID - } + // reproducible regardless of the iteration order + // NodeIDsByKinds happens to return. sort.Strings(ids) // Forward adjacency over the call / reference subgraph. + // EdgeAdjacencyForKinds returns only the (from, to) projection of + // function/method endpoints — the disk path collapses to one + // Cypher join with both endpoint kinds enforced server-side, so + // neither the cross-kind edges nor the ~10 unused columns ever + // cross cgo. Falls back to EdgesByKinds (and then EdgesByKind per + // kind) on backends that don't implement the adjacency capability. adj := make(map[string][]string, n) - for _, e := range g.AllEdges() { - if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { - continue + if adjScan, ok := g.(graph.EdgeAdjacencyForKinds); ok { + for pair := range adjScan.EdgeAdjacencyForKinds(betweennessKinds, bcNodeKinds) { + adj[pair[0]] = append(adj[pair[0]], pair[1]) + } + } else if es, ok := g.(graph.EdgesByKindsScanner); ok { + for e := range es.EdgesByKinds(betweennessKinds) { + if e == nil { + continue + } + adj[e.From] = append(adj[e.From], e.To) + } + } else { + for _, kind := range betweennessKinds { + for e := range g.EdgesByKind(kind) { + if e == nil { + continue + } + adj[e.From] = append(adj[e.From], e.To) + } } - adj[e.From] = append(adj[e.From], e.To) } score := make(map[string]float64, n) @@ -156,7 +200,7 @@ func samplePivots(ids []string, k int) []string { rng := rand.New(rand.NewSource(betweennessSeed)) perm := rng.Perm(len(ids)) out := make([]string, k) - for i := 0; i < k; i++ { + for i := range k { out[i] = ids[perm[i]] } return out diff --git a/internal/analysis/communities.go b/internal/analysis/communities.go index df26ef9c..1290eebb 100644 --- a/internal/analysis/communities.go +++ b/internal/analysis/communities.go @@ -5,6 +5,7 @@ import ( "math" "path/filepath" "sort" + "strconv" "strings" "github.com/zzet/gortex/internal/graph" @@ -50,13 +51,13 @@ type CommunityResult struct { // The Louvain implementation is preserved as // DetectCommunitiesLouvain so we can benchmark, A/B, or fall back // without re-deriving the algorithm. -func DetectCommunities(g *graph.Graph) *CommunityResult { +func DetectCommunities(g graph.Store) *CommunityResult { return DetectCommunitiesLeiden(g) } // DetectCommunitiesLouvain is the original Louvain implementation, // retained for benchmarking and as a known-good fallback. -func DetectCommunitiesLouvain(g *graph.Graph) *CommunityResult { +func DetectCommunitiesLouvain(g graph.Store) *CommunityResult { nodes := g.AllNodes() edges := g.AllEdges() @@ -123,105 +124,7 @@ func DetectCommunitiesLouvain(g *graph.Graph) *CommunityResult { } sort.Strings(commIDs) // deterministic visitation comm, commNodes := louvainLocalMoves(commIDs, neighbors, degree, totalWeight) - - // Build result - nodeMap := make(map[string]*graph.Node) - for _, n := range nodes { - nodeMap[n.ID] = n - } - - result := &CommunityResult{ - NodeToComm: make(map[string]string), - } - - // Renumber communities. We sort by old id so renumbering is - // stable across reruns (the underlying ids are member ids, which - // were sorted to drive the local-moves loop deterministically). - oldIDs := make([]string, 0, len(commNodes)) - for cid := range commNodes { - if len(commNodes[cid]) >= 2 { - oldIDs = append(oldIDs, cid) - } - } - sort.Strings(oldIDs) - commRemap := make(map[string]string, len(oldIDs)) - for i, cid := range oldIDs { - commRemap[cid] = fmt.Sprintf("community-%d", i) - } - - for nodeID, cid := range comm { - if newID, ok := commRemap[cid]; ok { - result.NodeToComm[nodeID] = newID - } - } - - // Build Community objects - for oldID, members := range commNodes { - newID, ok := commRemap[oldID] - if !ok { - continue - } - - fileSet := make(map[string]bool) - for _, mid := range members { - if n, ok := nodeMap[mid]; ok { - fileSet[n.FilePath] = true - } - } - - files := make([]string, 0, len(fileSet)) - for f := range fileSet { - files = append(files, f) - } - sort.Strings(files) - - label := inferCommunityLabel(members, nodeMap, files) - cohesion := computeCohesion(members, neighbors) - hub := findHub(members, nodeMap, neighbors) - - c := Community{ - ID: newID, - Label: label, - Members: members, - Files: files, - Size: len(members), - Cohesion: cohesion, - Hub: hub, - } - result.Communities = append(result.Communities, c) - } - - // Multi-pass label disambiguation: Louvain often splits a single - // directory into many call-density-based sub-clusters (e.g. 48 - // different clusters whose files all live in parser/languages/). - // The directory-based label is identical for all of them, which - // reads as duplicate cards in the UI. We tag colliding labels - // with the cluster's hub symbol — the function/type that - // everything else in the cluster connects through — which is the - // most semantically meaningful disambiguator. - disambiguateLabels(result.Communities) - - // Sibling grouping. Louvain genuinely produces dozens of peer - // communities under a single dominant directory (48 clusters all - // rooted at parser/languages/ in this codebase). Formally those - // peers are not sub-communities at the *modularity* level — we - // confirmed phase-2 Louvain doesn't merge them — but in - // navigation terms they obviously belong together. We surface - // that by computing ParentID from the cluster's directory head - // (the part of the label before " · sample" and " +N dirs"): - // any two clusters whose head matches get the same ParentID, so - // the UI can render them under a shared section header. - assignDirectoryParents(result.Communities) - - // Sort by size descending - sort.Slice(result.Communities, func(i, j int) bool { - return result.Communities[i].Size > result.Communities[j].Size - }) - - // Compute modularity - result.Modularity = computeModularity(comm, neighbors, degree, totalWeight) - - return result + return finaliseCommunityPartition(nodes, comm, commNodes, neighbors, degree, totalWeight) } // disambiguateLabels makes every cluster label unique. The @@ -785,3 +688,174 @@ func namePrefixLabel(members []string, nodeMap map[string]*graph.Node) string { } return bestPrefix } + +// finaliseCommunityPartition converts a (nodeID → community label) +// partition into a fully-shaped CommunityResult: renumbered IDs, +// per-cluster files / cohesion / hub, label disambiguation, and +// sibling-group parent assignment. Shared by the in-process Louvain +// path (which builds the partition itself) and the backend-delegated +// path (DetectCommunitiesLouvainBackend, which takes the partition +// from graph.CommunityDetector). +// +// commNodes can be nil; when it is, the function inverts comm to +// recover the per-community member list (one extra pass — only used +// on the backend path where commNodes isn't pre-built). +func finaliseCommunityPartition( + nodes []*graph.Node, + comm map[string]string, + commNodes map[string][]string, + neighbors map[string]map[string]float64, + degree map[string]float64, + totalWeight float64, +) *CommunityResult { + if commNodes == nil { + commNodes = make(map[string][]string, len(comm)) + for nid, cid := range comm { + commNodes[cid] = append(commNodes[cid], nid) + } + } + + nodeMap := make(map[string]*graph.Node, len(nodes)) + for _, n := range nodes { + nodeMap[n.ID] = n + } + + result := &CommunityResult{ + NodeToComm: make(map[string]string), + } + + // Renumber: keep clusters of size >= 2, sort old labels for + // determinism, mint sequential "community-N" names. + oldIDs := make([]string, 0, len(commNodes)) + for cid := range commNodes { + if len(commNodes[cid]) >= 2 { + oldIDs = append(oldIDs, cid) + } + } + sort.Strings(oldIDs) + commRemap := make(map[string]string, len(oldIDs)) + for i, cid := range oldIDs { + commRemap[cid] = fmt.Sprintf("community-%d", i) + } + + for nodeID, cid := range comm { + if newID, ok := commRemap[cid]; ok { + result.NodeToComm[nodeID] = newID + } + } + + for oldID, members := range commNodes { + newID, ok := commRemap[oldID] + if !ok { + continue + } + fileSet := make(map[string]bool) + for _, mid := range members { + if n, ok := nodeMap[mid]; ok { + fileSet[n.FilePath] = true + } + } + files := make([]string, 0, len(fileSet)) + for f := range fileSet { + files = append(files, f) + } + sort.Strings(files) + + c := Community{ + ID: newID, + Label: inferCommunityLabel(members, nodeMap, files), + Members: members, + Files: files, + Size: len(members), + Cohesion: computeCohesion(members, neighbors), + Hub: findHub(members, nodeMap, neighbors), + } + result.Communities = append(result.Communities, c) + } + + disambiguateLabels(result.Communities) + assignDirectoryParents(result.Communities) + sort.Slice(result.Communities, func(i, j int) bool { + return result.Communities[i].Size > result.Communities[j].Size + }) + result.Modularity = computeModularity(comm, neighbors, degree, totalWeight) + return result +} + +// DetectCommunitiesLouvainBackend runs Louvain via the backend's +// engine-native implementation (graph.CommunityDetector — today +// only store_ladybug) and threads the resulting partition through +// the same post-processing the in-process DetectCommunitiesLouvain +// uses. The output is shape-identical: every Community label, +// hub, cohesion, parent, and modularity field is populated from +// the partition, so downstream consumers (UI, rerank pipeline) +// can't tell which path produced it. +// +// Returns nil when the backend errors — callers should fall +// through to the in-process path rather than surface a half-done +// CommunityResult. +func DetectCommunitiesLouvainBackend(g graph.Store, cd graph.CommunityDetector) *CommunityResult { + if g == nil || cd == nil { + return nil + } + hits, err := cd.Louvain(graph.CommunityOpts{}) + if err != nil || len(hits) == 0 { + return nil + } + + nodes := g.AllNodes() + symbolNodes := make(map[string]bool, len(nodes)) + for _, n := range nodes { + if n.Kind != graph.KindFile && n.Kind != graph.KindImport { + symbolNodes[n.ID] = true + } + } + + // Rebuild the same weighted neighbor view DetectCommunitiesLouvain + // uses — needed for cohesion / hub / modularity. The work is + // O(V + E) per call; small relative to the engine-native + // partitioning save. + type edgeKey struct{ a, b string } + weights := make(map[edgeKey]float64) + for _, e := range g.AllEdges() { + if !symbolNodes[e.From] || !symbolNodes[e.To] { + continue + } + w := edgeWeight(e.Kind) + if w == 0 { + continue + } + weights[edgeKey{e.From, e.To}] += w + weights[edgeKey{e.To, e.From}] += w + } + neighbors := make(map[string]map[string]float64) + for k, w := range weights { + if neighbors[k.a] == nil { + neighbors[k.a] = make(map[string]float64) + } + neighbors[k.a][k.b] = w + } + var totalWeight float64 + for _, w := range weights { + totalWeight += w + } + totalWeight /= 2 + degree := make(map[string]float64, len(symbolNodes)) + for id := range symbolNodes { + for _, w := range neighbors[id] { + degree[id] += w + } + } + + comm := make(map[string]string, len(hits)) + for _, h := range hits { + if !symbolNodes[h.NodeID] { + continue + } + comm[h.NodeID] = strconv.FormatInt(h.CommunityID, 10) + } + if len(comm) == 0 { + return nil + } + return finaliseCommunityPartition(nodes, comm, nil, neighbors, degree, totalWeight) +} diff --git a/internal/analysis/components.go b/internal/analysis/components.go new file mode 100644 index 00000000..b11016aa --- /dev/null +++ b/internal/analysis/components.go @@ -0,0 +1,294 @@ +package analysis + +import ( + "sort" + + "github.com/zzet/gortex/internal/graph" +) + +// ComponentResult is one connected component returned by +// ComputeWCC / ComputeSCC. Members are sorted ascending so the +// output is deterministic across runs. +type ComponentResult struct { + ID int `json:"id"` + Members []string `json:"members"` + Size int `json:"size"` +} + +// ComponentOptions filters the working set the algorithm runs +// against. Empty NodeKinds / EdgeKinds means "all kinds". +type ComponentOptions struct { + NodeKinds []graph.NodeKind + EdgeKinds []graph.EdgeKind + // MinSize trims trivial singleton components from the + // response — common for SCC where every non-cyclic symbol + // is its own 1-element SCC. + MinSize int +} + +// ComputeWCC returns the weakly connected components of g — pairs +// of nodes reachable from each other when every edge is treated +// as undirected. Components are sorted by size descending; ties +// broken by member ID for determinism. +// +// O(V + E). Used as the fallback when the backing graph.Store +// does not implement graph.ComponentFinder. +func ComputeWCC(g graph.Store, opts ComponentOptions) []ComponentResult { + if g == nil { + return nil + } + nodeAllow := makeComponentKindAllow(opts.NodeKinds) + edgeAllow := makeComponentEdgeAllow(opts.EdgeKinds) + + // Build a dense int index over allowed nodes. + nodes := g.AllNodes() + idx := make(map[string]int, len(nodes)) + dense := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n == nil || !nodeAllow(n.Kind) { + continue + } + idx[n.ID] = len(dense) + dense = append(dense, n.ID) + } + if len(dense) == 0 { + return nil + } + + // Undirected adjacency over allowed edges. + adj := make([][]int, len(dense)) + for _, e := range g.AllEdges() { + if e == nil || !edgeAllow(e.Kind) { + continue + } + i, ok1 := idx[e.From] + j, ok2 := idx[e.To] + if !ok1 || !ok2 || i == j { + continue + } + adj[i] = append(adj[i], j) + adj[j] = append(adj[j], i) + } + + // Union-find equivalence: BFS from each unseen node, mark + // every reachable node with the same component label. + comp := make([]int, len(dense)) + for i := range comp { + comp[i] = -1 + } + next := 0 + queue := make([]int, 0, 64) + for i := range dense { + if comp[i] != -1 { + continue + } + label := next + next++ + comp[i] = label + queue = append(queue[:0], i) + for len(queue) > 0 { + cur := queue[0] + queue = queue[1:] + for _, nb := range adj[cur] { + if comp[nb] == -1 { + comp[nb] = label + queue = append(queue, nb) + } + } + } + } + + return collectComponents(dense, comp, opts.MinSize) +} + +// ComputeSCC returns the strongly connected components of g — +// pairs of nodes mutually reachable along directed edges. Uses +// an iterative Tarjan's algorithm to avoid blowing the recursion +// stack on a deep call graph. O(V + E). +func ComputeSCC(g graph.Store, opts ComponentOptions) []ComponentResult { + if g == nil { + return nil + } + nodeAllow := makeComponentKindAllow(opts.NodeKinds) + edgeAllow := makeComponentEdgeAllow(opts.EdgeKinds) + + nodes := g.AllNodes() + idx := make(map[string]int, len(nodes)) + dense := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n == nil || !nodeAllow(n.Kind) { + continue + } + idx[n.ID] = len(dense) + dense = append(dense, n.ID) + } + if len(dense) == 0 { + return nil + } + + // Directed adjacency. Only out-edges — SCC walks one way. + adj := make([][]int, len(dense)) + for _, e := range g.AllEdges() { + if e == nil || !edgeAllow(e.Kind) { + continue + } + i, ok1 := idx[e.From] + j, ok2 := idx[e.To] + if !ok1 || !ok2 { + continue + } + adj[i] = append(adj[i], j) + } + + // Iterative Tarjan. State arrays sized to the dense node + // count; the call stack is replaced by an explicit (node, + // neighbour-iteration-index) stack. + n := len(dense) + const undefined = -1 + idxArr := make([]int, n) + lowlink := make([]int, n) + onStack := make([]bool, n) + for i := range idxArr { + idxArr[i] = undefined + } + stack := make([]int, 0, n) + type frame struct { + v int + ni int // next-neighbour index to visit + } + work := make([]frame, 0, n) + + var index int + comp := make([]int, n) + for i := range comp { + comp[i] = -1 + } + nextComp := 0 + + for start := 0; start < n; start++ { + if idxArr[start] != undefined { + continue + } + // Initialise the explicit DFS for this root. + idxArr[start] = index + lowlink[start] = index + index++ + stack = append(stack, start) + onStack[start] = true + work = append(work, frame{v: start, ni: 0}) + + for len(work) > 0 { + top := &work[len(work)-1] + v := top.v + neighbors := adj[v] + if top.ni < len(neighbors) { + w := neighbors[top.ni] + top.ni++ + if idxArr[w] == undefined { + // Descend into w. + idxArr[w] = index + lowlink[w] = index + index++ + stack = append(stack, w) + onStack[w] = true + work = append(work, frame{v: w, ni: 0}) + } else if onStack[w] { + if idxArr[w] < lowlink[v] { + lowlink[v] = idxArr[w] + } + } + continue + } + // All neighbours consumed; pop the frame and propagate + // the lowlink upward. + work = work[:len(work)-1] + if len(work) > 0 { + parent := &work[len(work)-1] + if lowlink[v] < lowlink[parent.v] { + lowlink[parent.v] = lowlink[v] + } + } + // Emit an SCC if v is its lowlink root. + if lowlink[v] == idxArr[v] { + label := nextComp + nextComp++ + for { + w := stack[len(stack)-1] + stack = stack[:len(stack)-1] + onStack[w] = false + comp[w] = label + if w == v { + break + } + } + } + } + } + + return collectComponents(dense, comp, opts.MinSize) +} + +// collectComponents groups dense node IDs by component label, +// applies MinSize, sorts members for determinism, and returns +// the slice ordered by size descending. +func collectComponents(dense []string, comp []int, minSize int) []ComponentResult { + groups := make(map[int][]string) + for i, id := range dense { + c := comp[i] + if c < 0 { + continue + } + groups[c] = append(groups[c], id) + } + out := make([]ComponentResult, 0, len(groups)) + for c, members := range groups { + if minSize > 0 && len(members) < minSize { + continue + } + sort.Strings(members) + out = append(out, ComponentResult{ID: c, Members: members, Size: len(members)}) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Size != out[j].Size { + return out[i].Size > out[j].Size + } + if len(out[i].Members) > 0 && len(out[j].Members) > 0 { + return out[i].Members[0] < out[j].Members[0] + } + return out[i].ID < out[j].ID + }) + // Renumber sequentially so the output IDs are 0..N-1 in + // size-descending order. Stable for snapshot tests. + for i := range out { + out[i].ID = i + } + return out +} + +func makeComponentKindAllow(kinds []graph.NodeKind) func(graph.NodeKind) bool { + if len(kinds) == 0 { + return func(graph.NodeKind) bool { return true } + } + set := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + set[k] = struct{}{} + } + return func(k graph.NodeKind) bool { + _, ok := set[k] + return ok + } +} + +func makeComponentEdgeAllow(kinds []graph.EdgeKind) func(graph.EdgeKind) bool { + if len(kinds) == 0 { + return func(graph.EdgeKind) bool { return true } + } + set := make(map[graph.EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + set[k] = struct{}{} + } + return func(k graph.EdgeKind) bool { + _, ok := set[k] + return ok + } +} diff --git a/internal/analysis/components_test.go b/internal/analysis/components_test.go new file mode 100644 index 00000000..f91ba637 --- /dev/null +++ b/internal/analysis/components_test.go @@ -0,0 +1,107 @@ +package analysis + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// seedComponentTestGraph builds the same hub-and-spoke graph the +// ladybug probe / conformance tests use: two SCC triangles + one +// hub every node points at. Gives predictable WCC + SCC answers. +func seedComponentTestGraph() *graph.Graph { + g := graph.New() + for _, id := range []string{"a", "b", "c", "d", "e", "f", "hub"} { + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: id + ".go"}) + } + edges := [][2]string{ + {"a", "b"}, {"b", "c"}, {"c", "a"}, // triangle 1 + {"d", "e"}, {"e", "f"}, {"f", "d"}, // triangle 2 + {"c", "d"}, // bridge + {"a", "hub"}, {"b", "hub"}, {"c", "hub"}, + {"d", "hub"}, {"e", "hub"}, {"f", "hub"}, + } + for _, e := range edges { + g.AddEdge(&graph.Edge{From: e[0], To: e[1], Kind: graph.EdgeCalls, FilePath: "x.go"}) + } + return g +} + +func TestComputeWCC_OneComponent(t *testing.T) { + g := seedComponentTestGraph() + res := ComputeWCC(g, ComponentOptions{}) + require.Len(t, res, 1, "all 7 nodes form one WCC; got %v", res) + assert.Equal(t, 7, res[0].Size) +} + +func TestComputeWCC_HonoursEdgeFilter(t *testing.T) { + g := seedComponentTestGraph() + // Filter out the call edges entirely → no surviving edges → every node + // becomes its own singleton component. + res := ComputeWCC(g, ComponentOptions{ + EdgeKinds: []graph.EdgeKind{graph.EdgeReferences}, + }) + assert.Len(t, res, 7, + "with no surviving edges every node should be a singleton; got %v", res) +} + +func TestComputeSCC_ThreeComponents(t *testing.T) { + g := seedComponentTestGraph() + res := ComputeSCC(g, ComponentOptions{}) + // 7 SCCs: {a,b,c}, {d,e,f}, {hub} (singleton). But the hub is + // trivial — without MinSize, expect 3 with sizes [3, 3, 1]. + require.GreaterOrEqual(t, len(res), 3) + + bySize := map[int]int{} + for _, r := range res { + bySize[r.Size]++ + } + assert.Equal(t, 2, bySize[3], "should find two 3-node SCCs (the triangles); got %v", res) +} + +func TestComputeSCC_MinSize_DropsSingletons(t *testing.T) { + g := seedComponentTestGraph() + res := ComputeSCC(g, ComponentOptions{MinSize: 2}) + for _, r := range res { + assert.GreaterOrEqual(t, r.Size, 2, + "MinSize=2 should drop singleton SCCs; got %v", r) + } +} + +// TestComputeSCC_Iterative_NoStackOverflow constructs a deep +// straight-line graph (1 -> 2 -> 3 -> ... -> N) to make sure the +// iterative Tarjan stays in heap and doesn't blow the goroutine +// call stack. N = 10k; recursive Tarjan would fall over. +func TestComputeSCC_Iterative_NoStackOverflow(t *testing.T) { + const n = 10000 + g := graph.New() + for i := 0; i < n; i++ { + id := charID(i) + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: "x.go"}) + } + for i := 0; i < n-1; i++ { + g.AddEdge(&graph.Edge{ + From: charID(i), To: charID(i + 1), Kind: graph.EdgeCalls, FilePath: "x.go", + }) + } + res := ComputeSCC(g, ComponentOptions{}) + // A DAG of N nodes has N singleton SCCs. + assert.Equal(t, n, len(res)) +} + +func charID(i int) string { + // fmt.Sprintf is fine but we want zero allocs in the loop body — just + // build a deterministic string ID. + const hex = "0123456789abcdef" + out := make([]byte, 0, 8) + for x := i; ; x /= 16 { + out = append([]byte{hex[x%16]}, out...) + if x < 16 { + break + } + } + return "n_" + string(out) +} diff --git a/internal/analysis/connectivity.go b/internal/analysis/connectivity.go index 51eddfc3..59938a20 100644 --- a/internal/analysis/connectivity.go +++ b/internal/analysis/connectivity.go @@ -109,7 +109,14 @@ const connectivityNote = "Connectivity health is a graph-EXTRACTION diagnostic, // fileLimit caps how many files DeadWeightByFile carries — files are // ranked by dead-weight descending, ties broken by path; pass 0 or a // negative value for no cap. -func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) GraphConnectivityReport { +// +// Backends that implement graph.NodeDegreeAggregator serve every +// per-node count from one bulk Cypher pass; the fallback path runs +// the legacy per-node GetInEdges + GetOutEdges + ClassifyZeroEdge +// trio. The arithmetic is identical either way — the capability +// inlines ClassifyZeroEdge's "no incoming usage edge" check into the +// same row. +func GraphConnectivity(g graph.Store, nodes []*graph.Node, fileLimit int) GraphConnectivityReport { report := GraphConnectivityReport{Note: connectivityNote} if g == nil { return report @@ -127,6 +134,14 @@ func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) Graph byKind := map[graph.NodeKind]*kindAgg{} byFile := map[string]*fileAgg{} + // Bulk per-node count fetch when the backend supports it; one + // Cypher pair vs. 3N per-node round-trips for the legacy path + // (the killer on Ladybug — see the NodeDegreeAggregator doc-comment + // for the workspace-scale numbers). Returns a map keyed on node ID + // or nil when the capability isn't available; the fallback path + // re-queries per node via the closure below. + counts := collectConnectivityCounts(g, nodes) + for _, n := range nodes { if n == nil { continue @@ -140,8 +155,15 @@ func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) Graph } ka.total++ - inCount := len(g.GetInEdges(n.ID)) - outCount := len(g.GetOutEdges(n.ID)) + var inCount, outCount int + if counts != nil { + row := counts[n.ID] + inCount = row.InCount + outCount = row.OutCount + } else { + inCount = len(g.GetInEdges(n.ID)) + outCount = len(g.GetOutEdges(n.ID)) + } degree := inCount + outCount if degree > 0 { @@ -149,10 +171,12 @@ func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) Graph } // Isolated == zero edges of any kind. ClassifyZeroEdge returns - // ZeroEdgePossibleExtractionGap for exactly this case, so the - // "isolated" definition stays bound to the shared zero-edge - // classification used for per-symbol caveats. - isolated := graph.ClassifyZeroEdge(g, n.ID) == graph.ZeroEdgePossibleExtractionGap + // ZeroEdgePossibleExtractionGap for exactly this case (for a + // known node), so the "isolated" definition stays bound to the + // shared zero-edge classification used for per-symbol caveats. + // We derive it from the counts directly; the underlying + // classifier's check is in == 0 && out == 0 for a known id. + isolated := degree == 0 leaf := degree == 1 if isolated { @@ -230,3 +254,36 @@ func GraphConnectivity(g *graph.Graph, nodes []*graph.Node, fileLimit int) Graph return report } + +// collectConnectivityCounts returns per-node in/out/usage counts for +// the supplied node slice via the backend's NodeDegreeAggregator +// capability. Returns nil when the backend doesn't implement the +// capability — GraphConnectivity then falls back to the legacy +// per-node g.GetInEdges/g.GetOutEdges path so semantics never differ. +// +// We pass UsageInboundEdgeKinds so the server fills UsageInCount — +// today GraphConnectivity only consumes In/Out totals, but the usage +// count rides on the same row at no extra round-trip cost and makes +// the capability self-contained for callers that need it next. +func collectConnectivityCounts(g graph.Store, nodes []*graph.Node) map[string]graph.NodeDegreeRow { + agg, ok := g.(graph.NodeDegreeAggregator) + if !ok { + return nil + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n == nil || n.ID == "" { + continue + } + ids = append(ids, n.ID) + } + if len(ids) == 0 { + return map[string]graph.NodeDegreeRow{} + } + rows := agg.NodeDegreeCounts(ids, graph.UsageInboundEdgeKinds()) + out := make(map[string]graph.NodeDegreeRow, len(rows)) + for _, r := range rows { + out[r.NodeID] = r + } + return out +} diff --git a/internal/analysis/contracts.go b/internal/analysis/contracts.go index 593b09c0..c2854a04 100644 --- a/internal/analysis/contracts.go +++ b/internal/analysis/contracts.go @@ -43,7 +43,7 @@ type parsedSignature struct { // VerifyChanges checks proposed signature changes against all callers and interface // implementors, returning any contract violations found. -func VerifyChanges(g *graph.Graph, engine *query.Engine, changes []SignatureChange) *VerifyResult { +func VerifyChanges(g graph.Store, engine *query.Engine, changes []SignatureChange) *VerifyResult { result := &VerifyResult{} for _, change := range changes { @@ -151,7 +151,7 @@ func VerifyChanges(g *graph.Graph, engine *query.Engine, changes []SignatureChan // checkInterfaceViolations checks if the changed symbol is a method that belongs to // an interface, and if so, verifies all other implementors still conform. // Traversal: EdgeMemberOf → parent type → EdgeImplements → interface → all implementors -func checkInterfaceViolations(g *graph.Graph, engine *query.Engine, node *graph.Node, newSig *parsedSignature, result *VerifyResult) { +func checkInterfaceViolations(g graph.Store, engine *query.Engine, node *graph.Node, newSig *parsedSignature, result *VerifyResult) { if node.Kind != graph.KindMethod { return } @@ -232,7 +232,7 @@ func checkInterfaceViolations(g *graph.Graph, engine *query.Engine, node *graph. } // findMemberMethods returns all method nodes that are members of the given type. -func findMemberMethods(g *graph.Graph, typeID string) []*graph.Node { +func findMemberMethods(g graph.Store, typeID string) []*graph.Node { inEdges := g.GetInEdges(typeID) var methods []*graph.Node for _, edge := range inEdges { diff --git a/internal/analysis/cycles.go b/internal/analysis/cycles.go index b9573af2..d7b37f2a 100644 --- a/internal/analysis/cycles.go +++ b/internal/analysis/cycles.go @@ -20,9 +20,8 @@ type Cycle struct { // DetectCycles finds all dependency cycles in the graph using Tarjan's SCC algorithm. // If scope is non-empty, only nodes whose FilePath starts with scope are considered. // Cycles are classified by edge type and community membership, then sorted by severity descending. -func DetectCycles(g *graph.Graph, communities *CommunityResult, scope string) []Cycle { +func DetectCycles(g graph.Store, communities *CommunityResult, scope string) []Cycle { nodes := g.AllNodes() - edges := g.AllEdges() // Build set of in-scope node IDs inScope := make(map[string]bool, len(nodes)) @@ -36,24 +35,35 @@ func DetectCycles(g *graph.Graph, communities *CommunityResult, scope string) [] inScope[n.ID] = true } - // Build adjacency list and track edge kinds between pairs + // Build adjacency list and track edge kinds between pairs. + // + // Edge collection streams only EdgeImports + EdgeCalls via + // EdgesByKind (two MATCH (...)-[e:Edge {kind: $kind}]->(...) on + // disk backends) instead of materialising every edge in the graph + // just to filter for two kinds -- ~500k edge rows over cgo dropped + // to the import-and-call subset (a few tens of thousands on the + // gortex workspace). adj := make(map[string][]string) edgeKinds := make(map[edgePair][]graph.EdgeKind) - for _, e := range edges { - if e.Kind != graph.EdgeImports && e.Kind != graph.EdgeCalls { - continue - } - if !inScope[e.From] || !inScope[e.To] { - continue - } - pair := edgePair{e.From, e.To} - // Avoid duplicate adjacency entries - if _, exists := edgeKinds[pair]; !exists { - adj[e.From] = append(adj[e.From], e.To) + collect := func(kind graph.EdgeKind) { + for e := range g.EdgesByKind(kind) { + if e == nil { + continue + } + if !inScope[e.From] || !inScope[e.To] { + continue + } + pair := edgePair{e.From, e.To} + // Avoid duplicate adjacency entries + if _, exists := edgeKinds[pair]; !exists { + adj[e.From] = append(adj[e.From], e.To) + } + edgeKinds[pair] = append(edgeKinds[pair], kind) } - edgeKinds[pair] = append(edgeKinds[pair], e.Kind) } + collect(graph.EdgeImports) + collect(graph.EdgeCalls) // Run Tarjan's SCC sccs := tarjanSCC(inScope, adj) @@ -89,7 +99,7 @@ func DetectCycles(g *graph.Graph, communities *CommunityResult, scope string) [] // WouldCreateCycle checks if adding an edge from fromID to toID would create a cycle. // It performs DFS from toID to see if fromID is reachable. If so, adding fromID→toID // would close a cycle. Returns the cycle path from toID to fromID when found. -func WouldCreateCycle(g *graph.Graph, fromID, toID string) (bool, []string) { +func WouldCreateCycle(g graph.Store, fromID, toID string) (bool, []string) { edges := g.AllEdges() // Build adjacency from calls and imports edges diff --git a/internal/analysis/deadcode.go b/internal/analysis/deadcode.go index 2305212a..faa10205 100644 --- a/internal/analysis/deadcode.go +++ b/internal/analysis/deadcode.go @@ -3,6 +3,7 @@ package analysis import ( "math" "path/filepath" + "slices" "sort" "strings" "unicode" @@ -210,23 +211,48 @@ func isEntryPointNode(n *graph.Node) bool { return v } +// candidateNodeKinds enumerates the node kinds FindDeadCode is willing +// to flag (modulo the opt-in switches for fields / variables / +// constants). Used both for the per-kind allowlist handed to the +// DeadCodeCandidator capability and as the source of truth for the +// Go-fallback loop. Kept in lockstep with neverDeadCodeKinds: a kind +// MUST appear in exactly one of the two lists. +var candidateNodeKinds = []graph.NodeKind{ + graph.KindFunction, + graph.KindMethod, + graph.KindType, + graph.KindInterface, + graph.KindField, + graph.KindVariable, + graph.KindConstant, +} + // FindDeadCode returns all symbols with zero incoming calls or references, // excluding entry points, test functions, exported symbols, and user-excluded patterns. // By default, variables are excluded (see FindDeadCodeOptions for rationale). -func FindDeadCode(g *graph.Graph, processes *ProcessResult, excludePatterns []string, opts ...FindDeadCodeOptions) []DeadCodeEntry { +func FindDeadCode(g graph.Store, processes *ProcessResult, excludePatterns []string, opts ...FindDeadCodeOptions) []DeadCodeEntry { var opt FindDeadCodeOptions if len(opts) > 0 { opt = opts[0] } - nodes := g.AllNodes() - allEdges := g.AllEdges() - // Build set of interface-required method names per type. // If a type implements an interface, all methods that the interface // requires are alive even if never called directly (they satisfy the // contract). We index: typeID → set of required method names. - ifaceRequiredMethods := buildIfaceRequiredMethods(g, nodes, allEdges) + // Backends that implement graph.IfaceImplementsScanner serve this + // from one Cypher join; the fallback walks NodesByKind + EdgesByKind + // just like before. + ifaceRequiredMethods := buildIfaceRequiredMethods(g) + + // Pick the candidate-set source. When the backend implements + // DeadCodeCandidator, the WHERE-NOT-EXISTS filter runs server-side + // and only the surviving ~hundreds of true candidates cross the + // cgo boundary — see graph.DeadCodeCandidator's doc-comment for the + // 1.3M-row-vs-hundreds rationale. Otherwise the legacy + // AllNodes + GetInEdgesByNodeIDs fallback runs, identical to the + // pre-capability path. + candidates, incomingByID := collectDeadCodeCandidates(g, opt) // Build set of entry point node IDs from processes entryPoints := make(map[string]bool) @@ -242,19 +268,24 @@ func FindDeadCode(g *graph.Graph, processes *ProcessResult, excludePatterns []st // Files holding a framework entry point (Alembic migrations, // Next.js pages, ASP.NET host files) — every symbol inside is - // reachable from a runtime, not application-dead. + // reachable from a runtime, not application-dead. Computed via + // NodesByKind(KindFile) so on disk backends we don't have to + // materialise AllNodes() just to find the entry-point files. entryPointFiles := make(map[string]bool) - for _, n := range nodes { - if n.Kind == graph.KindFile && isEntryPointNode(n) { + for n := range g.NodesByKind(graph.KindFile) { + if n != nil && isEntryPointNode(n) { entryPointFiles[n.FilePath] = true } } var result []DeadCodeEntry - for _, n := range nodes { + for _, n := range candidates { // Skip kinds the analyzer never reports — structural, // extracted metadata, infra, function-shape, and value-only // nodes. See neverDeadCodeKinds for the full list and why. + // (The server-side candidator only ships nodes whose kind is + // in candidateNodeKinds, but the Go fallback path scans + // AllNodes so we keep the explicit gate.) if neverDeadCodeKinds[n.Kind] { continue } @@ -311,20 +342,19 @@ func FindDeadCode(g *graph.Graph, processes *ProcessResult, excludePatterns []st continue } - // Count incoming edges that indicate the symbol is used. - // The allowlist is per-kind: fields/variables/constants are - // exercised by Reads/Writes; functions/methods by Calls/ - // References; types by References/Instantiates/MemberOf/ - // Implements/Extends/Composes/TypedAs. See incomingUsageKinds - // for the rationale. - allowed := incomingUsageKinds(n.Kind) - inEdges := g.GetInEdges(n.ID) + // Re-check the per-kind incoming-edge allowlist when we still + // have the in-edge map from the Go fallback path. The + // server-side DeadCodeCandidator has already applied the + // equivalent filter, so incomingByID is nil for that path and + // the count check short-circuits to 0 (matching the + // candidator's contract). incomingCount := 0 - for _, e := range inEdges { - for _, k := range allowed { - if e.Kind == k { + if incomingByID != nil { + allowed := incomingUsageKinds(n.Kind) + inEdges := incomingByID[n.ID] + for _, e := range inEdges { + if slices.Contains(allowed, e.Kind) { incomingCount++ - break } } } @@ -413,35 +443,83 @@ func FindDeadCode(g *graph.Graph, processes *ProcessResult, excludePatterns []st return result } +// collectDeadCodeCandidates is the candidate-set splitter for +// FindDeadCode. When the backend implements DeadCodeCandidator the +// WHERE-NOT-EXISTS filter runs server-side and we never materialise +// the in-edge map (returned nil). Otherwise we fall back to today's +// AllNodes + batched-GetInEdgesByNodeIDs path, identical pre-Part-2 +// behaviour. The post-filter loop in FindDeadCode handles both shapes +// uniformly — incomingByID==nil means "filter already applied". +func collectDeadCodeCandidates(g graph.Store, opt FindDeadCodeOptions) (candidates []*graph.Node, incomingByID map[string][]*graph.Edge) { + if dc, ok := g.(graph.DeadCodeCandidator); ok { + kinds := candidateNodeKinds[:0:0] + for _, k := range candidateNodeKinds { + // Honour the IncludeFields / IncludeVariables / IncludeConstants + // opt-in switches at the candidate-source: kinds the caller + // explicitly excluded never need to cross cgo. The post- + // filter loop still re-checks these for the fallback path + // (which sees every kind) so the contract holds either way. + switch k { + case graph.KindField: + if !opt.IncludeFields { + continue + } + case graph.KindVariable: + if !opt.IncludeVariables { + continue + } + case graph.KindConstant: + if !opt.IncludeConstants { + continue + } + } + kinds = append(kinds, k) + } + allowed := make(map[graph.NodeKind][]graph.EdgeKind, len(kinds)) + for _, k := range kinds { + allowed[k] = incomingUsageKinds(k) + } + return dc.DeadCodeCandidates(kinds, allowed), nil + } + + // Fallback: pull every node and the batched in-edge map up front. + // Same shape as before the DeadCodeCandidator capability landed. + nodes := g.AllNodes() + nodeIDs := make([]string, 0, len(nodes)) + for _, n := range nodes { + nodeIDs = append(nodeIDs, n.ID) + } + return nodes, g.GetInEdgesByNodeIDs(nodeIDs) +} + // buildIfaceRequiredMethods returns a map from type ID → set of method names // that the type must implement to satisfy its interfaces. This is computed by: // 1. Collecting all interfaces with their required method names (from Meta["methods"]). // 2. Collecting all EdgeImplements edges (type → interface). // 3. For each type that implements an interface, merging all required method names. -func buildIfaceRequiredMethods(g *graph.Graph, nodes []*graph.Node, edges []*graph.Edge) map[string]map[string]bool { - // Step 1: interface ID → required method names +// +// On backends that implement graph.IfaceImplementsScanner this is a +// single Cypher join; otherwise the fallback iterates +// NodesByKind(KindInterface) + EdgesByKind(EdgeImplements). Both paths +// produce the same map. +func buildIfaceRequiredMethods(g graph.Store) map[string]map[string]bool { + if scanner, ok := g.(graph.IfaceImplementsScanner); ok { + return buildIfaceRequiredMethodsFromRows(scanner.IfaceImplementsRows()) + } + + // Fallback: walk interfaces + EdgeImplements edges Go-side. Uses + // NodesByKind(KindInterface) so disk backends still issue one + // MATCH per kind instead of pulling AllNodes. ifaceMethods := make(map[string]map[string]bool) - for _, n := range nodes { - if n.Kind != graph.KindInterface || n.Meta == nil { + for n := range g.NodesByKind(graph.KindInterface) { + if n == nil || n.Meta == nil { continue } raw, ok := n.Meta["methods"] if !ok { continue } - methods := make(map[string]bool) - switch v := raw.(type) { - case []string: - for _, m := range v { - methods[m] = true - } - case []any: - for _, m := range v { - if s, ok := m.(string); ok { - methods[s] = true - } - } - } + methods := decodeMethodNames(raw) if len(methods) > 0 { ifaceMethods[n.ID] = methods } @@ -451,12 +529,8 @@ func buildIfaceRequiredMethods(g *graph.Graph, nodes []*graph.Node, edges []*gra return nil } - // Step 2: type ID → set of required method names (from all implemented interfaces) result := make(map[string]map[string]bool) - for _, e := range edges { - if e.Kind != graph.EdgeImplements { - continue - } + for e := range g.EdgesByKind(graph.EdgeImplements) { // EdgeImplements: From=type, To=interface iface, ok := ifaceMethods[e.To] if !ok { @@ -473,6 +547,67 @@ func buildIfaceRequiredMethods(g *graph.Graph, nodes []*graph.Node, edges []*gra return result } +// buildIfaceRequiredMethodsFromRows reduces the server-side +// IfaceImplementsScanner row set to the typeID → method-name-set +// shape the rest of FindDeadCode consumes. Same join logic as the +// fallback path, just folded over rows that already carry the +// interface Meta. +func buildIfaceRequiredMethodsFromRows(rows []graph.IfaceImplementsRow) map[string]map[string]bool { + if len(rows) == 0 { + return nil + } + // Cache decoded method-name sets per interface so repeated rows + // (one per implementing type) don't re-decode the same Meta. + ifaceMethods := make(map[string]map[string]bool) + result := make(map[string]map[string]bool) + for _, r := range rows { + methods, ok := ifaceMethods[r.IfaceID] + if !ok { + raw, hasRaw := r.IfaceMeta["methods"] + if !hasRaw { + ifaceMethods[r.IfaceID] = nil + continue + } + methods = decodeMethodNames(raw) + ifaceMethods[r.IfaceID] = methods + } + if len(methods) == 0 { + continue + } + if result[r.TypeID] == nil { + result[r.TypeID] = make(map[string]bool) + } + for m := range methods { + result[r.TypeID][m] = true + } + } + if len(result) == 0 { + return nil + } + return result +} + +// decodeMethodNames normalises a Node.Meta["methods"] value into a +// set of method names. Accepts []string (in-memory backend) and +// []any (gob-decoded payload from Ladybug); anything else is treated +// as "no methods declared". +func decodeMethodNames(raw any) map[string]bool { + methods := make(map[string]bool) + switch v := raw.(type) { + case []string: + for _, m := range v { + methods[m] = true + } + case []any: + for _, m := range v { + if s, ok := m.(string); ok { + methods[s] = true + } + } + } + return methods +} + // hotspotBetweennessWeight scales the betweenness component of a // hotspot's raw score. Betweenness arrives normalized to 0-100 (same // range as the fan-in/out/crossing terms after their own @@ -488,9 +623,34 @@ const hotspotBetweennessWeight = 0.4 // centrality component — how often the symbol lies on a shortest path between // other symbols — that augments the fan-in/out signals rather than replacing them. // If threshold <= 0, the default threshold is mean + 2*stddev. -func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float64) []HotspotEntry { - nodes := g.AllNodes() - edges := g.AllEdges() +func FindHotspots(g graph.Store, communities *CommunityResult, threshold float64) []HotspotEntry { + // Pull only function/method node IDs — the hotspots ranking is + // callable-only, and the scoring math doesn't touch any column + // beyond the id. NodeIDsByKinds returns the projection from a + // single Cypher query (one C string per row instead of the ~10 + // columns NodesByKinds would ship). The full *Node rows are + // fetched in one batched GetNodesByIDs call AFTER the threshold + // filter, so a typical run materialises ~100 survivors rather + // than the whole ~4k function/method bucket. + hotspotKinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod} + var candidateIDs []string + if scan, ok := g.(graph.NodeIDsByKinds); ok { + candidateIDs = scan.NodeIDsByKinds(hotspotKinds) + } else if scan, ok := g.(graph.NodesByKindsScanner); ok { + ns := scan.NodesByKinds(hotspotKinds) + candidateIDs = make([]string, 0, len(ns)) + for _, n := range ns { + candidateIDs = append(candidateIDs, n.ID) + } + } else { + all := g.AllNodes() + candidateIDs = make([]string, 0, len(all)) + for _, n := range all { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + candidateIDs = append(candidateIDs, n.ID) + } + } + } // Build lookup maps for community membership nodeToComm := make(map[string]string) @@ -498,31 +658,45 @@ func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float6 nodeToComm = communities.NodeToComm } - // Build edge maps for fan-in and fan-out computation - // fan_in: incoming calls + references - // fan_out: outgoing calls - fanIn := make(map[string]int) - fanOut := make(map[string]int) - - for _, e := range edges { - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { - fanIn[e.To]++ - } - if e.Kind == graph.EdgeCalls { - fanOut[e.From]++ - } + // Restrict the fan-count pass to the kinds hotspots cares about + // (function + method). NodeFanAggregator expects the candidate id + // list -- it never returns rows for ids the caller didn't ask + // for, so the cgo payload stays bounded by the candidate count + // rather than the whole graph. + fanIn, fanOut := CollectFanCounts(g, candidateIDs, + []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, + []graph.EdgeKind{graph.EdgeCalls}, + ) + + // Community crossings per node: outgoing edges (Calls or + // References) whose target sits in a different community than + // the source. CommunityCrossingsByKind ships only the (from, to) + // projection from a single IN-list join — the disk path stops + // re-materialising the full edge row per kind. Backends that + // don't implement the capability fall back to the per-kind + // EdgesByKind walk that mirrors the in-memory reference. + crossingKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + var crossings map[string]int + if cc, ok := g.(graph.CommunityCrossingsByKind); ok { + crossings = cc.CommunityCrossingsByKind(crossingKinds, nodeToComm) } - - // Compute community crossings per node: outgoing edges to nodes in different communities - crossings := make(map[string]int) - for _, e := range edges { - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { - fromComm := nodeToComm[e.From] - toComm := nodeToComm[e.To] - if fromComm != "" && toComm != "" && fromComm != toComm { - crossings[e.From]++ + if crossings == nil { + crossings = make(map[string]int) + countCrossings := func(kind graph.EdgeKind) { + for e := range g.EdgesByKind(kind) { + if e == nil { + continue + } + fromComm := nodeToComm[e.From] + toComm := nodeToComm[e.To] + if fromComm != "" && toComm != "" && fromComm != toComm { + crossings[e.From]++ + } } } + for _, k := range crossingKinds { + countCrossings(k) + } } // Betweenness centrality — exact on small graphs, sampled on @@ -536,9 +710,13 @@ func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float6 } } - // Compute raw scores for function/method nodes only + // Compute raw scores for function/method nodes only. Keyed by id + // so the full *Node fetch is deferred until after the threshold + // filter — on a ~4k candidate set the surviving share is the top + // few percent, so this materialises ~100 nodes instead of the + // whole bucket. type rawEntry struct { - node *graph.Node + id string fanIn int fanOut int crossing int @@ -546,20 +724,16 @@ func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float6 rawScore float64 } - var entries []rawEntry - for _, n := range nodes { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } - - fi := fanIn[n.ID] - fo := fanOut[n.ID] - cc := crossings[n.ID] - bw := betweenness[n.ID] + entries := make([]rawEntry, 0, len(candidateIDs)) + for _, id := range candidateIDs { + fi := fanIn[id] + fo := fanOut[id] + cc := crossings[id] + bw := betweenness[id] raw := float64(fi)*2.0 + float64(fo)*1.5 + float64(cc)*3.0 + bw*hotspotBetweennessWeight entries = append(entries, rawEntry{ - node: n, + id: id, fanIn: fi, fanOut: fo, crossing: cc, @@ -607,25 +781,49 @@ func FindHotspots(g *graph.Graph, communities *CommunityResult, threshold float6 threshold = mean + 2.0*stddev } - // Filter and build result - var result []HotspotEntry - for i, e := range entries { + // Filter by threshold first to identify the surviving id set, so + // the full *Node materialisation is bounded by the result size, + // not the candidate count. + type survivor struct { + entryIdx int + score float64 + } + survivors := make([]survivor, 0, len(entries)) + for i := range entries { score := math.Round(normalized[i]*100) / 100 // round to 2 decimal places if score < threshold { continue } + survivors = append(survivors, survivor{entryIdx: i, score: score}) + } + if len(survivors) == 0 { + return nil + } + + survivorIDs := make([]string, 0, len(survivors)) + for _, s := range survivors { + survivorIDs = append(survivorIDs, entries[s.entryIdx].id) + } + nodesByID := g.GetNodesByIDs(survivorIDs) + result := make([]HotspotEntry, 0, len(survivors)) + for _, s := range survivors { + e := entries[s.entryIdx] + n := nodesByID[e.id] + if n == nil { + continue + } result = append(result, HotspotEntry{ - ID: e.node.ID, - Name: e.node.Name, - Kind: string(e.node.Kind), - FilePath: e.node.FilePath, - Line: e.node.StartLine, + ID: n.ID, + Name: n.Name, + Kind: string(n.Kind), + FilePath: n.FilePath, + Line: n.StartLine, FanIn: e.fanIn, FanOut: e.fanOut, CommunityCrossings: e.crossing, Betweenness: math.Round(e.betweenness*100) / 100, - ComplexityScore: score, + ComplexityScore: s.score, }) } @@ -811,3 +1009,90 @@ func matchesExcludePattern(filePath, nodeID string, patterns []string) bool { } return false } + +// CollectFanCounts returns per-id fan-in / fan-out counts filtered by +// edge kind. Backends that implement graph.NodeFanAggregator serve +// both counts from one bulk Cypher per direction (~candidateCount +// rows over cgo instead of the full edge set); the fallback path +// streams the requested kinds via EdgesByKind, accumulating into the +// fan maps Go-side -- still no AllEdges materialisation, just an +// in-memory walk of the per-kind edge buckets. +// +// Used by FindHotspots and the health_score analyzer. Both pass the +// same fanInKinds / fanOutKinds pair today; the function signature +// keeps them per-call so a future analyzer with a different kind +// split can share the same plumbing. +func CollectFanCounts(g graph.Store, ids []string, fanInKinds []graph.EdgeKind, fanOutKinds []graph.EdgeKind) (fanIn, fanOut map[string]int) { + fanIn = make(map[string]int, len(ids)) + fanOut = make(map[string]int, len(ids)) + if len(ids) == 0 { + return fanIn, fanOut + } + if agg, ok := g.(graph.NodeFanAggregator); ok { + for _, r := range agg.NodeFanCounts(ids, fanInKinds, fanOutKinds) { + if r.FanIn != 0 { + fanIn[r.NodeID] = r.FanIn + } + if r.FanOut != 0 { + fanOut[r.NodeID] = r.FanOut + } + } + return fanIn, fanOut + } + + // Fallback path: stream the requested kinds via EdgesByKind and + // tally Go-side. ID-set membership keeps the maps bounded to + // candidate ids, matching the capability contract. + idSet := make(map[string]struct{}, len(ids)) + for _, id := range ids { + if id != "" { + idSet[id] = struct{}{} + } + } + streamed := make(map[graph.EdgeKind]struct{}, len(fanInKinds)+len(fanOutKinds)) + stream := func(kind graph.EdgeKind, toIn, toOut bool) { + if _, ok := streamed[kind]; ok { + return + } + streamed[kind] = struct{}{} + for e := range g.EdgesByKind(kind) { + if e == nil { + continue + } + if toIn { + if _, ok := idSet[e.To]; ok { + fanIn[e.To]++ + } + } + if toOut { + if _, ok := idSet[e.From]; ok { + fanOut[e.From]++ + } + } + } + } + inKinds := make(map[graph.EdgeKind]struct{}, len(fanInKinds)) + for _, k := range fanInKinds { + inKinds[k] = struct{}{} + } + outKinds := make(map[graph.EdgeKind]struct{}, len(fanOutKinds)) + for _, k := range fanOutKinds { + outKinds[k] = struct{}{} + } + allKinds := make([]graph.EdgeKind, 0, len(inKinds)+len(outKinds)) + for k := range inKinds { + allKinds = append(allKinds, k) + } + for k := range outKinds { + if _, dup := inKinds[k]; dup { + continue + } + allKinds = append(allKinds, k) + } + for _, k := range allKinds { + _, toIn := inKinds[k] + _, toOut := outKinds[k] + stream(k, toIn, toOut) + } + return fanIn, fanOut +} diff --git a/internal/analysis/diffmap.go b/internal/analysis/diffmap.go index e9662760..bcf6214b 100644 --- a/internal/analysis/diffmap.go +++ b/internal/analysis/diffmap.go @@ -38,7 +38,7 @@ type DiffResult struct { // scope: "unstaged", "staged", "all", "compare" // baseRef: used when scope is "compare" (e.g., "main") // repoRoot: absolute path to the repository root -func MapGitDiff(g *graph.Graph, repoRoot, scope, baseRef string) (*DiffResult, error) { +func MapGitDiff(g graph.Store, repoRoot, scope, baseRef string) (*DiffResult, error) { args := buildDiffArgs(scope, baseRef) cmd := exec.Command("git", args...) cmd.Dir = repoRoot diff --git a/internal/analysis/guards.go b/internal/analysis/guards.go index 721faabd..e2180c46 100644 --- a/internal/analysis/guards.go +++ b/internal/analysis/guards.go @@ -30,7 +30,7 @@ type GuardViolation struct { // For "boundary" rules: reports a violation when any changed symbol whose file path // matches the Source prefix has outgoing call or reference edges to symbols whose // file paths match the Target prefix. -func EvaluateGuards(g *graph.Graph, rules []config.GuardRule, changedSymbolIDs []string) []GuardViolation { +func EvaluateGuards(g graph.Store, rules []config.GuardRule, changedSymbolIDs []string) []GuardViolation { var violations []GuardViolation // Pre-resolve changed symbols to nodes for efficient lookup. @@ -88,7 +88,7 @@ func evaluateCoChange(rule config.GuardRule, changedNodes []*graph.Node) []Guard // evaluateBoundary checks whether any changed symbol in the source prefix has // outgoing call or reference edges targeting symbols in the target prefix. -func evaluateBoundary(g *graph.Graph, rule config.GuardRule, changedNodes []*graph.Node) []GuardViolation { +func evaluateBoundary(g graph.Store, rule config.GuardRule, changedNodes []*graph.Node) []GuardViolation { var violations []GuardViolation seen := make(map[string]bool) diff --git a/internal/analysis/hierarchy.go b/internal/analysis/hierarchy.go index 685826a5..a5af19da 100644 --- a/internal/analysis/hierarchy.go +++ b/internal/analysis/hierarchy.go @@ -129,7 +129,7 @@ func hierarchyLeafKinds(k graph.NodeKind) bool { // The base graph is read-only here — BuildHierarchy never mutates g // and never persists a second graph. An unknown level yields an empty // view carrying that level, so callers can surface a clean error. -func BuildHierarchy(g *graph.Graph, level ResolutionLevel, communities *CommunityResult) *HierarchyView { +func BuildHierarchy(g graph.Store, level ResolutionLevel, communities *CommunityResult) *HierarchyView { view := &HierarchyView{Level: level, SelfLoops: map[string]int{}} if g == nil || !ValidResolutionLevel(level) { return view diff --git a/internal/analysis/hits.go b/internal/analysis/hits.go index 36168573..40e62ddc 100644 --- a/internal/analysis/hits.go +++ b/internal/analysis/hits.go @@ -65,7 +65,7 @@ const hitsIterations = 40 // // then L2-normalises both vectors so the scores stay bounded. A nil // or empty graph yields an empty, safe-to-query result. -func ComputeHITS(g *graph.Graph) *HITSResult { +func ComputeHITS(g graph.Store) *HITSResult { if g == nil { return &HITSResult{Authorities: map[string]float64{}, Hubs: map[string]float64{}} } diff --git a/internal/analysis/impact.go b/internal/analysis/impact.go index d8f7dbb0..6f39974f 100644 --- a/internal/analysis/impact.go +++ b/internal/analysis/impact.go @@ -54,7 +54,7 @@ type ImpactResult struct { // edges, matching the live walk's behavior. Fall back to live BFS // when any seed lacks the index — the slow path is identical to the // pre-index implementation so consumer semantics never diverge. -func AnalyzeImpact(g *graph.Graph, symbolIDs []string, communities *CommunityResult, processes *ProcessResult) *ImpactResult { +func AnalyzeImpact(g graph.Store, symbolIDs []string, communities *CommunityResult, processes *ProcessResult) *ImpactResult { result := &ImpactResult{ ByDepth: make(map[int][]ImpactEntry), } @@ -95,7 +95,7 @@ func AnalyzeImpact(g *graph.Graph, symbolIDs []string, communities *CommunityRes // Determine risk level d1 := len(result.ByDepth[1]) d2 := len(result.ByDepth[2]) - result.Risk = assessRisk(d1, d2, len(result.TestFiles)) + result.Risk = assessRisk(d1, d2) // Find affected processes if processes != nil { @@ -174,7 +174,7 @@ func AnalyzeImpact(g *graph.Graph, symbolIDs []string, communities *CommunityRes // per discovered node, attributing the in-edge that introduced it to // EdgeConfidence / ConfidenceLabel. Kept as the always-correct // fallback for fillImpactFromReach. -func fillImpactLive(g *graph.Graph, result *ImpactResult, symbolIDs []string) { +func fillImpactLive(g graph.Store, result *ImpactResult, symbolIDs []string) { visited := make(map[string]bool) for _, id := range symbolIDs { visited[id] = true @@ -228,7 +228,7 @@ func fillImpactLive(g *graph.Graph, result *ImpactResult, symbolIDs []string) { // deterministic-by-shard-iteration choice closely enough for tests // that compare ByDepth ID sets, which is the contract consumers rely // on. EdgeConfidence is set from that representative edge. -func fillImpactFromReach(g *graph.Graph, result *ImpactResult, symbolIDs []string) bool { +func fillImpactFromReach(g graph.Store, result *ImpactResult, symbolIDs []string) bool { if len(symbolIDs) == 0 { return true } @@ -347,7 +347,7 @@ func filterHeuristicEntries(entries []ImpactEntry) []ImpactEntry { return kept } -func assessRisk(directDeps, transitiveDeps, testFiles int) RiskLevel { +func assessRisk(directDeps, transitiveDeps int) RiskLevel { if directDeps >= 10 || (directDeps >= 5 && transitiveDeps >= 20) { return RiskCritical } diff --git a/internal/analysis/impact_reach_test.go b/internal/analysis/impact_reach_test.go index 29c3a0fd..94345065 100644 --- a/internal/analysis/impact_reach_test.go +++ b/internal/analysis/impact_reach_test.go @@ -215,12 +215,23 @@ func TestAnalyzeImpact_FastPathSubMillisecond(t *testing.T) { reach.BuildIndex(g) const absoluteCeiling = 15 * time.Millisecond - // Per BenchmarkAnalyzeImpact_FastPath vs LiveWalk the steady- - // state speedup on this fixture is ~1.8x. We gate at 1.3x to - // absorb wall-clock noise (short timed loops have more variance - // than the benchmark harness's adaptive sampling) while still - // catching a regression that drops in a live walk. - const minSpeedup = 1.3 + // The reach live walk (compute) now batches its whole-BFS-level + // edge + node fetches into GetInEdgesByNodeIDs / GetNodesByIDs + // instead of issuing one GetInEdges + one GetNode per node. On the + // in-memory backend those batched reads are nearly as cheap as the + // precomputed fast path (both are then dominated by the identical + // per-entry GetNode rendering in fillImpactFromReach), so the old + // ~1.8x relative speedup no longer holds here — it collapses to + // ~1.0x. The precompute's large win is now realised on disk + // backends (Ladybug), where each per-node query the batching + // eliminates was a cgo round-trip, not a map read. + // + // We therefore keep the absolute sub-ms guarantee (the user-facing + // contract: a blast-radius query stays interactive) and a loose + // regression guard that the fast path is not materially SLOWER than + // the batched live walk — without re-asserting the obsolete + // in-memory speedup premise. + const minSpeedup = 0.9 speedup := float64(avgLive) / float64(avgFast) t.Logf("AnalyzeImpact on 1000-caller fan-in: fast=%v live=%v speedup=%.2fx (over %d iters)", @@ -229,8 +240,11 @@ func TestAnalyzeImpact_FastPathSubMillisecond(t *testing.T) { if avgFast > absoluteCeiling { t.Errorf("fast-path AnalyzeImpact too slow: avg=%v (absolute ceiling=%v)", avgFast, absoluteCeiling) } + if avgLive > absoluteCeiling { + t.Errorf("live-walk AnalyzeImpact too slow: avg=%v (absolute ceiling=%v)", avgLive, absoluteCeiling) + } if speedup < minSpeedup { - t.Errorf("fast-path speedup regressed: %.2fx (want >= %.2fx)", speedup, minSpeedup) + t.Errorf("fast-path is materially slower than the live walk: %.2fx (want >= %.2fx)", speedup, minSpeedup) } } diff --git a/internal/analysis/incremental_communities.go b/internal/analysis/incremental_communities.go index c1bc4448..d7714518 100644 --- a/internal/analysis/incremental_communities.go +++ b/internal/analysis/incremental_communities.go @@ -76,7 +76,7 @@ type leidenGraph struct { // the resulting weighted graph. Returns nil when the graph has no // clustering-relevant edges — the caller then yields an empty // partition. -func buildLeidenGraph(g *graph.Graph) *leidenGraph { +func buildLeidenGraph(g graph.Store) *leidenGraph { nodes := g.AllNodes() edges := g.AllEdges() @@ -166,6 +166,18 @@ type LeidenPartitionCache struct { edgeIdentityRevisions int } +// PackageFingerprints returns the cached per-package fingerprint map. +// Callers MUST treat the returned value as read-only — it is the live +// map the cache reuses on the next call. Used by the MCP server to +// report total_packages from a cache hit without re-running the +// fingerprint pass. +func (c *LeidenPartitionCache) PackageFingerprints() map[string]uint64 { + if c == nil { + return nil + } + return c.pkgFingerprint +} + // IncrementalCommunityStats reports what the incremental path did on // a single call — useful for tests and for surfacing on the wire. type IncrementalCommunityStats struct { @@ -217,7 +229,7 @@ func packageKey(filePath string) string { // kind change, or edge added/removed/reweighted flips the // fingerprint of every package it touches and leaves all others // bit-identical. -func fingerprintPackages(g *graph.Graph) map[string]uint64 { +func fingerprintPackages(g graph.Store) map[string]uint64 { nodes := g.AllNodes() edges := g.AllEdges() @@ -315,7 +327,7 @@ func diffPackageFingerprints(old, cur map[string]uint64) map[string]bool { // - the graph's edge-provenance revision moved under the cache, or // - the changed-package fraction exceeds changedFractionFullRecompute. func DetectCommunitiesLeidenIncremental( - g *graph.Graph, + g graph.Store, cache *LeidenPartitionCache, ) (*CommunityResult, *LeidenPartitionCache, IncrementalCommunityStats) { curFP := fingerprintPackages(g) @@ -399,7 +411,7 @@ type incrementalResult struct { // community into the gain calculation but never move themselves, so // every unchanged package's assignment is preserved bit-for-bit. func incrementalLeiden( - g *graph.Graph, + g graph.Store, lg *leidenGraph, cache *LeidenPartitionCache, changedPkgs map[string]bool, diff --git a/internal/analysis/kcore.go b/internal/analysis/kcore.go new file mode 100644 index 00000000..c34b256d --- /dev/null +++ b/internal/analysis/kcore.go @@ -0,0 +1,156 @@ +package analysis + +import ( + "sort" + + "github.com/zzet/gortex/internal/graph" +) + +// KCoreHit is one row of the k-core decomposition output: a node +// plus its k-degree (the largest k for which it stays in the +// k-core after iterative degree-< k pruning). High k-degree +// signals a node sits inside a densely connected core; a chain of +// leaves all have k-degree 1, a triangle has k-degree 2, a +// 4-clique has k-degree 3. +type KCoreHit struct { + NodeID string + KDegree int +} + +// KCoreOptions filters the working set. Empty NodeKinds / +// EdgeKinds means "all kinds". Edges are treated as undirected +// (k-core is defined on undirected graphs; matches Ladybug's +// engine-native behaviour). +type KCoreOptions struct { + NodeKinds []graph.NodeKind + EdgeKinds []graph.EdgeKind +} + +// ComputeKCore returns the k-core decomposition of g. Classic +// algorithm — Batagelj & Zaversnik 2003, O(V + E): +// +// 1. compute every node's undirected degree +// 2. process nodes in degree-ascending order +// 3. when a node is removed, decrement its still-present +// neighbours' degrees so they can be picked up at the right +// level +// +// Used as the fallback when the backing graph.Store does not +// implement graph.KCorer. +func ComputeKCore(g graph.Store, opts KCoreOptions) []KCoreHit { + if g == nil { + return nil + } + nodeAllow := makeComponentKindAllow(opts.NodeKinds) + edgeAllow := makeComponentEdgeAllow(opts.EdgeKinds) + + // Dense index over allowed nodes. + nodes := g.AllNodes() + idx := make(map[string]int, len(nodes)) + dense := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n == nil || !nodeAllow(n.Kind) { + continue + } + idx[n.ID] = len(dense) + dense = append(dense, n.ID) + } + if len(dense) == 0 { + return nil + } + + // Undirected adjacency; dedupe self-loops + parallel edges. + type edge struct{ a, b int } + seenEdge := make(map[edge]bool) + adj := make([][]int, len(dense)) + for _, e := range g.AllEdges() { + if e == nil || !edgeAllow(e.Kind) { + continue + } + i, ok1 := idx[e.From] + j, ok2 := idx[e.To] + if !ok1 || !ok2 || i == j { + continue + } + key := edge{i, j} + if i > j { + key = edge{j, i} + } + if seenEdge[key] { + continue + } + seenEdge[key] = true + adj[i] = append(adj[i], j) + adj[j] = append(adj[j], i) + } + + n := len(dense) + degree := make([]int, n) + maxDeg := 0 + for i := range dense { + degree[i] = len(adj[i]) + if degree[i] > maxDeg { + maxDeg = degree[i] + } + } + + // Bucket sort by degree (Batagelj & Zaversnik). bucket[d] + // holds dense-indices currently at degree d; pos[v] is v's + // position in its bucket; vertOrder is the global processing + // order populated as we drain the buckets. + bucket := make([][]int, maxDeg+1) + pos := make([]int, n) + for v, d := range degree { + pos[v] = len(bucket[d]) + bucket[d] = append(bucket[d], v) + } + + kdeg := make([]int, n) + processed := make([]bool, n) + for d := 0; d <= maxDeg; d++ { + for len(bucket[d]) > 0 { + // Pop the back of bucket[d] (O(1)). + v := bucket[d][len(bucket[d])-1] + bucket[d] = bucket[d][:len(bucket[d])-1] + if processed[v] { + continue + } + processed[v] = true + kdeg[v] = d + for _, w := range adj[v] { + if processed[w] { + continue + } + if degree[w] > d { + // Move w one bucket down. + old := degree[w] + // O(1) removal: swap with the back element + // of the old bucket and adjust its pos. + i := pos[w] + last := len(bucket[old]) - 1 + if i != last { + other := bucket[old][last] + bucket[old][i] = other + pos[other] = i + } + bucket[old] = bucket[old][:last] + degree[w] = old - 1 + pos[w] = len(bucket[degree[w]]) + bucket[degree[w]] = append(bucket[degree[w]], w) + } + } + } + } + + out := make([]KCoreHit, 0, n) + for v, id := range dense { + out = append(out, KCoreHit{NodeID: id, KDegree: kdeg[v]}) + } + sort.Slice(out, func(i, j int) bool { + if out[i].KDegree != out[j].KDegree { + return out[i].KDegree > out[j].KDegree + } + return out[i].NodeID < out[j].NodeID + }) + return out +} diff --git a/internal/analysis/kcore_test.go b/internal/analysis/kcore_test.go new file mode 100644 index 00000000..e341b761 --- /dev/null +++ b/internal/analysis/kcore_test.go @@ -0,0 +1,93 @@ +package analysis + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +func TestComputeKCore_KnownStructure(t *testing.T) { + // 4-clique + leaf attached to one of its members: + // a -- b + // | / | + // | / | + // c -- d + // | + // leaf + // Every clique node has k-degree 3 (the 4-clique is a 3-core); + // leaf has k-degree 1. + g := graph.New() + for _, id := range []string{"a", "b", "c", "d", "leaf"} { + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: "x.go"}) + } + for _, e := range [][2]string{ + {"a", "b"}, {"a", "c"}, {"a", "d"}, + {"b", "c"}, {"b", "d"}, + {"c", "d"}, {"c", "leaf"}, + } { + g.AddEdge(&graph.Edge{From: e[0], To: e[1], Kind: graph.EdgeCalls, FilePath: "x.go"}) + } + + hits := ComputeKCore(g, KCoreOptions{}) + require.Len(t, hits, 5) + byID := map[string]int{} + for _, h := range hits { + byID[h.NodeID] = h.KDegree + } + for _, id := range []string{"a", "b", "c", "d"} { + assert.Equal(t, 3, byID[id], + "4-clique members should have k-degree 3; got %v", byID) + } + assert.Equal(t, 1, byID["leaf"], + "leaf should have k-degree 1; got %v", byID) +} + +func TestComputeKCore_LineGraph(t *testing.T) { + // 1 -- 2 -- 3 -- 4: every node has at most 2 neighbours, + // and after peeling the two endpoints the remaining pair + // drops below k=2, so k-degree is 1 across the board. + g := graph.New() + for _, id := range []string{"1", "2", "3", "4"} { + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: "x.go"}) + } + for _, e := range [][2]string{ + {"1", "2"}, {"2", "3"}, {"3", "4"}, + } { + g.AddEdge(&graph.Edge{From: e[0], To: e[1], Kind: graph.EdgeCalls, FilePath: "x.go"}) + } + hits := ComputeKCore(g, KCoreOptions{}) + for _, h := range hits { + assert.Equal(t, 1, h.KDegree, + "line graph nodes all have k-degree 1; got %v", hits) + } +} + +func TestComputeKCore_EmptyGraph(t *testing.T) { + g := graph.New() + hits := ComputeKCore(g, KCoreOptions{}) + assert.Empty(t, hits) +} + +func TestComputeKCore_EdgeFilter(t *testing.T) { + g := graph.New() + for _, id := range []string{"a", "b", "c"} { + g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: "x.go"}) + } + g.AddEdge(&graph.Edge{From: "a", To: "b", Kind: graph.EdgeCalls, FilePath: "x.go"}) + g.AddEdge(&graph.Edge{From: "b", To: "c", Kind: graph.EdgeReferences, FilePath: "x.go"}) + + // Only call edges survive — a-b stays, b-c drops. + hits := ComputeKCore(g, KCoreOptions{ + EdgeKinds: []graph.EdgeKind{graph.EdgeCalls}, + }) + byID := map[string]int{} + for _, h := range hits { + byID[h.NodeID] = h.KDegree + } + assert.Equal(t, 1, byID["a"]) + assert.Equal(t, 1, byID["b"]) + assert.Equal(t, 0, byID["c"], "c is isolated under the filter") +} diff --git a/internal/analysis/leiden.go b/internal/analysis/leiden.go index 425be412..55a64867 100644 --- a/internal/analysis/leiden.go +++ b/internal/analysis/leiden.go @@ -31,7 +31,7 @@ import ( // // Result has the same shape as DetectCommunities so the call site // can swap them out without other changes. -func DetectCommunitiesLeiden(g *graph.Graph) *CommunityResult { +func DetectCommunitiesLeiden(g graph.Store) *CommunityResult { result, _ := detectCommunitiesLeidenRaw(g) return result } @@ -45,7 +45,7 @@ func DetectCommunitiesLeiden(g *graph.Graph) *CommunityResult { // ids and drops singletons, neither of which can drive a restricted // re-optimization. The returned partition is nil when the graph has // no clustering-relevant edges (the result is then empty too). -func detectCommunitiesLeidenRaw(g *graph.Graph) (*CommunityResult, *leidenPartition) { +func detectCommunitiesLeidenRaw(g graph.Store) (*CommunityResult, *leidenPartition) { lg := buildLeidenGraph(g) if lg == nil { return &CommunityResult{NodeToComm: make(map[string]string)}, nil @@ -386,7 +386,7 @@ func leidenAggregate( // label / hub / disambiguation / parent-grouping pipeline so the UI // can render Leiden output identically. func buildCommunityResult( - g *graph.Graph, + g graph.Store, finalComm map[string]string, neighbors map[string]map[string]float64, totalWeight float64, diff --git a/internal/analysis/pagerank.go b/internal/analysis/pagerank.go index b39fdc24..afd65d4d 100644 --- a/internal/analysis/pagerank.go +++ b/internal/analysis/pagerank.go @@ -40,7 +40,7 @@ const ( // Dangling nodes (no outgoing call/reference edge — leaf utilities) // redistribute their mass uniformly each iteration so the scores stay // a proper probability distribution. -func ComputePageRank(g *graph.Graph) *PageRankResult { +func ComputePageRank(g graph.Store) *PageRankResult { if g == nil { return &PageRankResult{Scores: map[string]float64{}} } diff --git a/internal/analysis/processes.go b/internal/analysis/processes.go index 1f9463cf..468047b2 100644 --- a/internal/analysis/processes.go +++ b/internal/analysis/processes.go @@ -37,7 +37,7 @@ type ProcessResult struct { } // DiscoverProcesses finds execution flows by identifying entry points and tracing forward. -func DiscoverProcesses(g *graph.Graph) *ProcessResult { +func DiscoverProcesses(g graph.Store) *ProcessResult { nodes := g.AllNodes() edges := g.AllEdges() diff --git a/internal/analysis/scaffold.go b/internal/analysis/scaffold.go index 98211834..175bf892 100644 --- a/internal/analysis/scaffold.go +++ b/internal/analysis/scaffold.go @@ -20,7 +20,7 @@ import ( // // This interface avoids a circular dependency with the indexer package. type SourceReader interface { - Graph() *graph.Graph + Graph() graph.Store ResolveFilePath(graphPath string) string } @@ -152,7 +152,7 @@ func filterCallerNodes(sg *query.SubGraph, exampleID string) []*graph.Node { // generateRegistrationCode creates a registration/wiring edit by analyzing how // the example symbol is called by its depth-1 callers. -func generateRegistrationCode(g *graph.Graph, callers []*graph.Node, example *graph.Node, newName string) *ScaffoldEdit { +func generateRegistrationCode(g graph.Store, callers []*graph.Node, example *graph.Node, newName string) *ScaffoldEdit { if len(callers) == 0 { return nil } @@ -190,7 +190,7 @@ func generateRegistrationCode(g *graph.Graph, callers []*graph.Node, example *gr // generateTestStub creates a test stub edit by finding the test file and test // functions associated with the example symbol. -func generateTestStub(g *graph.Graph, reader SourceReader, example *graph.Node, newName string) *ScaffoldEdit { +func generateTestStub(g graph.Store, reader SourceReader, example *graph.Node, newName string) *ScaffoldEdit { testFilePath := deriveTestFilePath(example.FilePath) // Check if the test file exists on disk. Resolve abs path through diff --git a/internal/analysis/scaffold_test.go b/internal/analysis/scaffold_test.go index 46f7c855..1ddd0d1b 100644 --- a/internal/analysis/scaffold_test.go +++ b/internal/analysis/scaffold_test.go @@ -22,7 +22,7 @@ type mockSourceReader struct { rootPath string } -func (m *mockSourceReader) Graph() *graph.Graph { return m.g } +func (m *mockSourceReader) Graph() graph.Store { return m.g } func (m *mockSourceReader) ResolveFilePath(relPath string) string { if filepath.IsAbs(relPath) { return relPath diff --git a/internal/analysis/spectral.go b/internal/analysis/spectral.go index 65b60a6f..fdae9cdd 100644 --- a/internal/analysis/spectral.go +++ b/internal/analysis/spectral.go @@ -33,7 +33,7 @@ const ( // // The result has the same shape as DetectCommunities so analyze // kind=clusters can swap algorithms transparently. -func SpectralClusters(g *graph.Graph) *CommunityResult { +func SpectralClusters(g graph.Store) *CommunityResult { nodes := g.AllNodes() edges := g.AllEdges() diff --git a/internal/artifacts/artifacts.go b/internal/artifacts/artifacts.go index 07de87d7..46ef489f 100644 --- a/internal/artifacts/artifacts.go +++ b/internal/artifacts/artifacts.go @@ -56,7 +56,7 @@ type Artifact struct { // repoPrefix scopes node IDs / paths in a multi-repo graph; pass "" // for a single-repo graph. Best-effort — missing or unreadable files // are skipped rather than failing the whole pass. -func Materialize(g *graph.Graph, root string, entries []config.ArtifactEntry, repoPrefix string) []Artifact { +func Materialize(g graph.Store, root string, entries []config.ArtifactEntry, repoPrefix string) []Artifact { if g == nil || root == "" || len(entries) == 0 { return nil } @@ -81,7 +81,7 @@ func Materialize(g *graph.Graph, root string, entries []config.ArtifactEntry, re } // materializeOne reads one artifact file and projects it onto the graph. -func materializeOne(g *graph.Graph, root, rel string, entry config.ArtifactEntry, repoPrefix string, nameIndex map[string][]string) (Artifact, bool) { +func materializeOne(g graph.Store, root, rel string, entry config.ArtifactEntry, repoPrefix string, nameIndex map[string][]string) (Artifact, bool) { data, err := os.ReadFile(filepath.Join(root, rel)) if err != nil { return Artifact{}, false @@ -147,7 +147,7 @@ func materializeOne(g *graph.Graph, root, rel string, entry config.ArtifactEntry // buildSymbolIndex maps every sufficiently-long symbol name to the // node IDs that declare it, scoped to repoPrefix. -func buildSymbolIndex(g *graph.Graph, repoPrefix string) map[string][]string { +func buildSymbolIndex(g graph.Store, repoPrefix string) map[string][]string { index := make(map[string][]string) for _, n := range g.AllNodes() { switch n.Kind { diff --git a/internal/blame/blame.go b/internal/blame/blame.go index 5d2e28a8..1f735a8b 100644 --- a/internal/blame/blame.go +++ b/internal/blame/blame.go @@ -46,13 +46,27 @@ type Author struct { Timestamp time.Time // author-time } -// Run executes `git blame -p` on the file and returns a map from -// 1-based line number to Author. errors include both git invocation -// failures (file not in repo, repo not initialised) and parse -// failures. Callers may treat any error as "skip this file" — the -// enrichment pass is best-effort. +// Run executes `git blame -p` on the file at the current worktree +// (HEAD) and returns a map from 1-based line number to Author. errors +// include both git invocation failures (file not in repo, repo not +// initialised) and parse failures. Callers may treat any error as +// "skip this file" — the enrichment pass is best-effort. func Run(repoRoot, relPath string) (map[int]Author, error) { - cmd := exec.Command("git", "-C", repoRoot, "blame", "-p", "--", relPath) + return RunAt(repoRoot, "", relPath) +} + +// RunAt is Run with an explicit revision (branch / tag / SHA). Pass +// "" for HEAD. Used by enrichments that must blame the default branch +// regardless of the user's current checkout — e.g. the churn enricher +// pinning to `origin/main` so feature-branch work-in-progress doesn't +// pollute the persisted data. +func RunAt(repoRoot, rev, relPath string) (map[int]Author, error) { + args := []string{"-C", repoRoot, "blame", "-p"} + if rev != "" { + args = append(args, rev) + } + args = append(args, "--", relPath) + cmd := exec.Command("git", args...) out, err := cmd.Output() if err != nil { return nil, fmt.Errorf("git blame %s: %w", relPath, err) @@ -189,7 +203,7 @@ func PersonNodeID(email string) string { return "team::" + strings.ToLower(strings.TrimSpace(email)) } -func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { +func EnrichGraph(g graph.Store, repoRoot string) (int, error) { if g == nil || repoRoot == "" { return 0, nil } @@ -212,6 +226,18 @@ func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { } enriched := 0 + // Symbol nodes we stamp meta.last_authored on. They must be + // round-tripped back through the store at the end: on the in-memory + // backend the in-place mutation already persists (n is canonical), + // but on disk backends (Ladybug) n is a per-call AllNodes + // reconstruction, so without the write-back the last_authored stamp + // is silently discarded — leaving stale_code / ownership / + // health_score's recency axis empty on Ladybug even after a + // successful `gortex enrich blame`. (The person nodes and + // EdgeAuthored edges below already persist via AddNode/AddEdge; only + // the symbol-node Meta was being dropped.) Mirrors the reach index, + // coverage, and releases enrichers. + var stamped []*graph.Node // Person nodes are deduplicated within this enrichment pass. // IDs are repo-scoped: in multi-repo mode the same email touching // two repos becomes two distinct KindTeam nodes so per-repo @@ -235,6 +261,7 @@ func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { "email": latest.Email, "timestamp": latest.Timestamp.Unix(), } + stamped = append(stamped, n) enriched++ if latest.Email == "" { @@ -277,6 +304,12 @@ func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { g.AddEdge(edge) } } + // Persist the symbol-node last_authored stamps in one batch (the + // durable write on disk backends; an idempotent re-insert on the + // in-memory backend). + if len(stamped) > 0 { + g.AddBatch(stamped, nil) + } return enriched, nil } diff --git a/internal/churn/churn.go b/internal/churn/churn.go new file mode 100644 index 00000000..a08a757a --- /dev/null +++ b/internal/churn/churn.go @@ -0,0 +1,386 @@ +// Package churn computes per-symbol and per-file commit density from +// the git log of a chosen branch (typically the default branch) and +// persists the result on graph nodes. Once enriched, the MCP tool +// get_churn_rate is a pure graph scan — no `git` subprocess at read +// time. The graph store is the source of truth; the disk-backed +// LadyBug backend keeps the data across daemon restarts, while +// in-memory backends recompute on demand. +// +// Design notes: +// +// - We blame at an explicit rev (the default branch) rather than +// HEAD. Feature-branch work-in-progress doesn't pollute the +// persisted churn signal — the data answers "what's churning on +// main" regardless of where the agent is checked out. +// +// - Per-file blame is invoked once and projected onto every symbol +// in the file. The repo walk inside `git blame` dominates the +// cost; per-symbol invocations would multiply it by the symbol +// count. +// +// - After mutating n.Meta we re-call g.AddNode(n). The in-memory +// store treats this as a no-op (the pointer is already in the +// graph); the LadyBug backend treats it as an UPSERT that +// re-serialises Meta to its on-disk row. This is the only path +// that persists Meta mutations into LadyBug — without it the +// enrichment would be invisible on the next daemon restart. +package churn + +import ( + "bufio" + "bytes" + "context" + "fmt" + "os/exec" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/zzet/gortex/internal/blame" + "github.com/zzet/gortex/internal/graph" +) + +// Options controls how the enricher resolves and persists churn data. +type Options struct { + // Branch is the rev to blame and log. Required — call site is + // expected to resolve the repo's default branch (origin/main, + // origin/master, …) and pass it in. We do not default to HEAD + // because the whole point of pre-computation is to pin the + // signal to a stable branch. + Branch string + // Now lets tests fix the clock for deterministic age_days. When + // zero, time.Now() is used. + Now time.Time +} + +// Result summarises an enrichment pass. +type Result struct { + Files int // file nodes stamped with a churn summary + Symbols int // function/method nodes stamped with per-symbol churn + Branch string // the rev used (echoed back for the CLI) + HeadSHA string // the resolved SHA at enrich time (stored on each file) +} + +// EnrichGraph computes per-symbol and per-file churn and stamps the +// data on graph nodes. Returns counts plus the resolved SHA. Errors +// only when the repo can't be opened or the branch can't be resolved +// at all; per-file failures are best-effort and skip that file. +// +// Persistence: every mutated node is re-upserted via g.AddNode(n). +// On LadyBug-backed stores this round-trips through the Cypher MERGE +// path; on the in-memory store the pointer was already mutated in +// place, but the redundant AddNode call keeps the semantics uniform +// between backends and lets the enricher run against either. +func EnrichGraph(ctx context.Context, g graph.Store, repoRoot string, opts Options) (Result, error) { + if g == nil || repoRoot == "" { + return Result{}, fmt.Errorf("churn: graph and repoRoot are required") + } + if strings.TrimSpace(opts.Branch) == "" { + return Result{}, fmt.Errorf("churn: Options.Branch is required (default-branch resolution belongs to the caller)") + } + now := opts.Now + if now.IsZero() { + now = time.Now() + } + headSHA := runGit(repoRoot, "rev-parse", "--verify", "--quiet", opts.Branch) + if headSHA == "" { + return Result{}, fmt.Errorf("churn: branch %q does not resolve in %s", opts.Branch, repoRoot) + } + + // Group symbols by file path. We deliberately keep file nodes in + // a separate map so we can stamp their summary even when no + // function/method is in scope (some files contain only types or + // constants). + type bucket struct { + file *graph.Node // optional — may be nil + symbols []*graph.Node + } + byPath := map[string]*bucket{} + for _, n := range g.AllNodes() { + if n.FilePath == "" { + continue + } + switch n.Kind { + case graph.KindFile: + b := byPath[n.FilePath] + if b == nil { + b = &bucket{} + byPath[n.FilePath] = b + } + b.file = n + case graph.KindFunction, graph.KindMethod: + if n.StartLine == 0 { + continue + } + b := byPath[n.FilePath] + if b == nil { + b = &bucket{} + byPath[n.FilePath] = b + } + b.symbols = append(b.symbols, n) + } + } + + res := Result{Branch: opts.Branch, HeadSHA: headSHA} + for filePath, b := range byPath { + if err := ctx.Err(); err != nil { + return res, err + } + if len(b.symbols) == 0 && b.file == nil { + continue + } + rel := stripRepoPrefix(filePath, repoRoot) + commits, err := fileCommits(repoRoot, opts.Branch, rel) + if err != nil || len(commits) == 0 { + continue + } + var blameLines map[int]blame.Author + if len(b.symbols) > 0 { + blameLines, _ = blame.RunAt(repoRoot, opts.Branch, rel) + } + + // File summary: aggregate across all commits. + if b.file != nil { + stampFileChurn(b.file, commits, headSHA, opts.Branch, now) + g.AddNode(b.file) + res.Files++ + } + + if len(blameLines) == 0 { + continue + } + // Per-symbol: project blame line range, then look up each + // commit's timestamp/author in the commits map. Falls back + // to blame timestamps when the commit isn't in the log + // (shallow clones, signed-off cherry-picks). + for _, s := range b.symbols { + if stampSymbolChurn(s, blameLines, commits, now) { + g.AddNode(s) + res.Symbols++ + } + } + } + return res, nil +} + +// commitRecord is one row of `git log --format=%H|%ct|%ae`. +type commitRecord struct { + SHA string + When time.Time + Email string +} + +// fileCommits returns the commit history for relPath on branch. +// Ordered newest → oldest. Empty slice when the file has no history +// on that branch (untracked, or the rev predates the file). +func fileCommits(repoRoot, branch, relPath string) ([]commitRecord, error) { + cmd := exec.Command("git", "-C", repoRoot, "log", branch, + "--no-merges", "--follow", "--format=%H|%ct|%ae", "--", relPath) + out, err := cmd.Output() + if err != nil { + return nil, err + } + var records []commitRecord + scanner := bufio.NewScanner(bytes.NewReader(out)) + scanner.Buffer(make([]byte, 64*1024), 8*1024*1024) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + parts := strings.SplitN(line, "|", 3) + if len(parts) != 3 { + continue + } + ts, err := strconv.ParseInt(parts[1], 10, 64) + if err != nil { + continue + } + records = append(records, commitRecord{ + SHA: parts[0], + When: time.Unix(ts, 0), + Email: parts[2], + }) + } + return records, scanner.Err() +} + +// stampFileChurn writes the file-level summary onto n.Meta["churn"] +// and pins enrichment provenance under n.Meta["churn_meta"]. +func stampFileChurn(n *graph.Node, commits []commitRecord, headSHA, branch string, now time.Time) { + if n.Meta == nil { + n.Meta = map[string]any{} + } + commitCount := len(commits) + first := commits[len(commits)-1].When + last := commits[0].When + ageDays := int(now.Sub(first).Hours() / 24) + activeDays := ageDays + if activeDays < 1 { + activeDays = 1 + } + n.Meta["churn"] = map[string]any{ + "commit_count": commitCount, + "age_days": ageDays, + "churn_rate": roundTwo(float64(commitCount) / float64(activeDays)), + "last_author": commits[0].Email, + "last_commit_at": last.UTC().Format(time.RFC3339), + } + n.Meta["churn_meta"] = map[string]any{ + "head_sha": headSHA, + "branch": branch, + "computed_at": now.UTC().Format(time.RFC3339), + } +} + +// stampSymbolChurn projects the file's blame onto the symbol's line +// range and stamps n.Meta["churn"]. Returns true when the symbol's +// range had at least one blamed line — false when blame produced no +// coverage (uncommitted lines or the file is untracked at the rev). +func stampSymbolChurn(n *graph.Node, blameLines map[int]blame.Author, commits []commitRecord, now time.Time) bool { + endLine := n.EndLine + if endLine == 0 { + endLine = n.StartLine + } + commitsSeen := map[string]struct{}{} + var oldest, newest time.Time + latestEmail := "" + for line := n.StartLine; line <= endLine; line++ { + a, ok := blameLines[line] + if !ok { + continue + } + commitsSeen[a.Commit] = struct{}{} + if oldest.IsZero() || a.Timestamp.Before(oldest) { + oldest = a.Timestamp + } + if newest.IsZero() || a.Timestamp.After(newest) { + newest = a.Timestamp + latestEmail = a.Email + } + } + if len(commitsSeen) == 0 { + return false + } + // Prefer the canonical author email from the log over the blame + // author email when both exist — `git log` carries the merged-in + // author identity, while blame may show the original + // pre-rebase author. + if email := latestAuthorFromCommits(commitsSeen, commits); email != "" { + latestEmail = email + } + ageDays := 0 + if !oldest.IsZero() { + ageDays = int(now.Sub(oldest).Hours() / 24) + } + activeDays := ageDays + if activeDays < 1 { + activeDays = 1 + } + if n.Meta == nil { + n.Meta = map[string]any{} + } + n.Meta["churn"] = map[string]any{ + "commit_count": len(commitsSeen), + "age_days": ageDays, + "churn_rate": roundTwo(float64(len(commitsSeen)) / float64(activeDays)), + "last_author": latestEmail, + "last_commit_at": newest.UTC().Format(time.RFC3339), + } + return true +} + +// latestAuthorFromCommits picks the email of the most-recent commit +// that touches the symbol's range, using the per-file log as the +// authority for author identity (blame can lag a rebase / cherry-pick). +func latestAuthorFromCommits(commitsSeen map[string]struct{}, commits []commitRecord) string { + for _, c := range commits { + if _, ok := commitsSeen[c.SHA]; ok { + return c.Email + } + } + return "" +} + +// roundTwo rounds to two decimals so the JSON output stays compact +// — single-digit precision swallows the difference between 0.03 and +// 0.04 churn-per-day, which matters for ranking. +func roundTwo(v float64) float64 { + return float64(int64(v*100+0.5)) / 100 +} + +// stripRepoPrefix removes a leading repo segment from multi-repo +// indexer paths so the path we hand to git is repo-relative. Mirrors +// the helper in internal/blame; duplicated rather than exported +// because the blame copy is unexported by design. +func stripRepoPrefix(filePath, repoRoot string) string { + if !strings.Contains(filePath, "/") { + return filePath + } + if _, err := exec.LookPath("git"); err != nil { + return filePath + } + abs := filepath.Join(repoRoot, filePath) + if fileExists(abs) { + return filePath + } + if idx := strings.Index(filePath, "/"); idx >= 0 { + trimmed := filePath[idx+1:] + if fileExists(filepath.Join(repoRoot, trimmed)) { + return trimmed + } + } + return filePath +} + +var fileExists = func(path string) bool { + cmd := exec.Command("test", "-f", path) + return cmd.Run() == nil +} + +// runGit shells out and returns trimmed stdout, or "" on error. Used +// only for the one-shot rev-parse; full enrichment calls go through +// fileCommits / blame.RunAt directly. +func runGit(repoRoot string, args ...string) string { + cmd := exec.Command("git", append([]string{"-C", repoRoot}, args...)...) + out, err := cmd.Output() + if err != nil { + return "" + } + return strings.TrimSpace(string(out)) +} + +// DefaultBranch returns the repository's default branch as a +// rev-parseable reference (preferring "origin/" when an upstream +// is configured, falling back to a local branch when not). Returns "" +// when none of the candidates resolve — the caller is then expected +// to surface a clear error rather than silently picking the current +// branch (feature branches must not pollute the persisted data). +// +// Exposed so MCP-side enrich handlers can resolve the same branch +// the CLI does without duplicating the probe order across packages. +func DefaultBranch(repoRoot string) string { + probe := func(args ...string) (string, bool) { + cmd := exec.Command("git", append([]string{"-C", repoRoot}, args...)...) + out, err := cmd.Output() + if err != nil { + return "", false + } + return strings.TrimSpace(string(out)), true + } + if ref, ok := probe("symbolic-ref", "--short", "refs/remotes/origin/HEAD"); ok && ref != "" { + return ref + } + for _, candidate := range []string{"origin/main", "origin/master", "origin/trunk"} { + if _, ok := probe("rev-parse", "--verify", "--quiet", candidate); ok { + return candidate + } + } + for _, candidate := range []string{"main", "master", "trunk"} { + if _, ok := probe("rev-parse", "--verify", "--quiet", candidate); ok { + return candidate + } + } + return "" +} diff --git a/internal/churn/churn_test.go b/internal/churn/churn_test.go new file mode 100644 index 00000000..5302c0dd --- /dev/null +++ b/internal/churn/churn_test.go @@ -0,0 +1,200 @@ +package churn + +import ( + "context" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/zzet/gortex/internal/graph" +) + +func TestEnrichGraph_StampsSymbolAndFile(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + repoDir := initRepo(t) + writeAndCommit(t, repoDir, "main.go", "package main\n\nfunc Hello() {}\n", "initial") + // Touch the file twice more so churn_rate is non-trivial. + writeAndCommit(t, repoDir, "main.go", "package main\n\nfunc Hello() { _ = 1 }\n", "second") + writeAndCommit(t, repoDir, "main.go", "package main\n\nfunc Hello() { _ = 2 }\n", "third") + + g := graph.New() + g.AddNode(&graph.Node{ + ID: "main.go", Kind: graph.KindFile, Name: "main.go", FilePath: "main.go", + }) + g.AddNode(&graph.Node{ + ID: "main.go::Hello", + Kind: graph.KindFunction, + Name: "Hello", + FilePath: "main.go", + StartLine: 3, EndLine: 3, + }) + + res, err := EnrichGraph(context.Background(), g, repoDir, Options{ + Branch: currentBranch(t, repoDir), + Now: time.Now(), + }) + if err != nil { + t.Fatalf("enrich: %v", err) + } + if res.Files != 1 || res.Symbols != 1 { + t.Errorf("res = %+v, want Files=1 Symbols=1", res) + } + if res.HeadSHA == "" { + t.Error("HeadSHA should be set") + } + + // File summary present. + fileNode := g.GetNode("main.go") + fileChurn, ok := fileNode.Meta["churn"].(map[string]any) + if !ok { + t.Fatalf("file Meta[churn] missing: %+v", fileNode.Meta) + } + if cc, _ := fileChurn["commit_count"].(int); cc != 3 { + t.Errorf("file commit_count = %v, want 3", fileChurn["commit_count"]) + } + if _, ok := fileChurn["churn_rate"].(float64); !ok { + t.Errorf("file churn_rate missing or not float: %T %v", fileChurn["churn_rate"], fileChurn["churn_rate"]) + } + // Provenance present. + if _, ok := fileNode.Meta["churn_meta"].(map[string]any); !ok { + t.Errorf("file churn_meta missing: %+v", fileNode.Meta) + } + + // Per-symbol churn. + sym := g.GetNode("main.go::Hello") + symChurn, ok := sym.Meta["churn"].(map[string]any) + if !ok { + t.Fatalf("symbol Meta[churn] missing: %+v", sym.Meta) + } + if cc, _ := symChurn["commit_count"].(int); cc < 1 { + t.Errorf("symbol commit_count = %v, want >= 1", symChurn["commit_count"]) + } + if _, ok := symChurn["last_author"].(string); !ok { + t.Errorf("symbol last_author missing: %+v", symChurn) + } +} + +func TestEnrichGraph_SkipsFilesWithNoHistory(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + repoDir := initRepo(t) + writeAndCommit(t, repoDir, "main.go", "package main\n\nfunc Hello() {}\n", "initial") + + g := graph.New() + // Refer to a file that exists on disk but isn't tracked by git. + if err := os.WriteFile(filepath.Join(repoDir, "untracked.go"), []byte("package main\n"), 0o644); err != nil { + t.Fatal(err) + } + g.AddNode(&graph.Node{ID: "untracked.go", Kind: graph.KindFile, FilePath: "untracked.go"}) + + res, err := EnrichGraph(context.Background(), g, repoDir, Options{ + Branch: currentBranch(t, repoDir), + }) + if err != nil { + t.Fatalf("enrich: %v", err) + } + if res.Files != 0 || res.Symbols != 0 { + t.Errorf("untracked file should yield no stamps, got %+v", res) + } +} + +func TestEnrichGraph_RequiresBranch(t *testing.T) { + g := graph.New() + _, err := EnrichGraph(context.Background(), g, "/tmp/anywhere", Options{}) + if err == nil { + t.Fatal("expected error when Branch is empty") + } + if !strings.Contains(err.Error(), "Branch is required") { + t.Errorf("unexpected error: %v", err) + } +} + +func TestEnrichGraph_RejectsUnresolvableBranch(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + repoDir := initRepo(t) + writeAndCommit(t, repoDir, "main.go", "package main\n", "initial") + + g := graph.New() + _, err := EnrichGraph(context.Background(), g, repoDir, Options{Branch: "does-not-exist"}) + if err == nil { + t.Fatal("expected error when branch does not resolve") + } +} + +func TestRoundTwo(t *testing.T) { + cases := []struct { + in float64 + want float64 + }{ + {0.0, 0.0}, + {0.125, 0.13}, + {1.0 / 3.0, 0.33}, + {99.999, 100.0}, + } + for _, c := range cases { + if got := roundTwo(c.in); got != c.want { + t.Errorf("roundTwo(%v) = %v, want %v", c.in, got, c.want) + } + } +} + +// --- helpers --- + +func initRepo(t *testing.T) string { + t.Helper() + dir := t.TempDir() + for _, args := range [][]string{ + {"init", "-q", "-b", "main"}, + {"config", "user.email", "test@example.com"}, + {"config", "user.name", "Tester"}, + {"config", "commit.gpgsign", "false"}, + } { + cmd := exec.Command("git", args...) + cmd.Dir = dir + if out, err := cmd.CombinedOutput(); err != nil { + t.Fatalf("git %v: %v\n%s", args, err, out) + } + } + return dir +} + +func writeAndCommit(t *testing.T, dir, rel, body, msg string) { + t.Helper() + if err := os.WriteFile(filepath.Join(dir, rel), []byte(body), 0o644); err != nil { + t.Fatal(err) + } + add := exec.Command("git", "add", rel) + add.Dir = dir + if out, err := add.CombinedOutput(); err != nil { + t.Fatalf("git add: %v\n%s", err, out) + } + commit := exec.Command("git", "commit", "-q", "-m", msg) + commit.Dir = dir + commit.Env = append(commit.Environ(), + "GIT_AUTHOR_NAME=Tester", "GIT_AUTHOR_EMAIL=test@example.com", + "GIT_COMMITTER_NAME=Tester", "GIT_COMMITTER_EMAIL=test@example.com") + if out, err := commit.CombinedOutput(); err != nil { + t.Fatalf("git commit: %v\n%s", err, out) + } +} + +func currentBranch(t *testing.T, dir string) string { + t.Helper() + cmd := exec.Command("git", "rev-parse", "--abbrev-ref", "HEAD") + cmd.Dir = dir + out, err := cmd.Output() + if err != nil { + t.Fatalf("rev-parse: %v", err) + } + return strings.TrimSpace(string(out)) +} diff --git a/internal/cochange/cochange.go b/internal/cochange/cochange.go index 0fb53dc4..2c8b4e2a 100644 --- a/internal/cochange/cochange.go +++ b/internal/cochange/cochange.go @@ -196,12 +196,12 @@ func orderedPair(a, b string) [2]string { // // Best-effort: returns (0, nil) when root is not a git repository. // Idempotent — graph.AddEdge dedupes, so repeated runs converge. -func EnrichGraph(g *graph.Graph, root, repoPrefix string) (int, error) { +func EnrichGraph(g graph.Store, root, repoPrefix string) (int, error) { return EnrichGraphWith(g, root, repoPrefix, Options{}) } // EnrichGraphWith is EnrichGraph with explicit scan tuning. -func EnrichGraphWith(g *graph.Graph, root, repoPrefix string, opts Options) (int, error) { +func EnrichGraphWith(g graph.Store, root, repoPrefix string, opts Options) (int, error) { if g == nil || root == "" { return 0, nil } @@ -217,7 +217,7 @@ func EnrichGraphWith(g *graph.Graph, root, repoPrefix string, opts Options) (int // carrying that RepoPrefix are matched, against the prefix-stripped // node path (the pairs hold git-relative paths). Pass "" for a // single-repo graph. Idempotent — graph.AddEdge dedupes. -func AddEdges(g *graph.Graph, pairs []Pair, repoPrefix string) int { +func AddEdges(g graph.Store, pairs []Pair, repoPrefix string) int { if g == nil || len(pairs) == 0 { return 0 } diff --git a/internal/codeowners/parser.go b/internal/codeowners/parser.go index 5d449071..a014aa86 100644 --- a/internal/codeowners/parser.go +++ b/internal/codeowners/parser.go @@ -28,14 +28,18 @@ type Rule struct { matcher *gitignore.GitIgnore } -// matchPattern compiles the rule's pattern as a single-line gitignore -// matcher. We compile lazily so the rule list is cheap to construct -// for repos that never call MatchFile. +// matchPattern returns the rule's gitignore matcher. Parse precompiles +// it, so for any Parse-built Rule the field is non-nil and MatchFile's +// concurrent hot path only reads it — no data race on a shared rule list +// (applyCoverageDomains matches files across goroutines against one +// list). For a Rule hand-constructed outside Parse the field is nil; +// compile a throwaway matcher rather than caching into r.matcher, so +// concurrent callers still can't race on the field. func (r *Rule) matchPattern() *gitignore.GitIgnore { - if r.matcher == nil { - r.matcher = gitignore.CompileIgnoreLines(r.Pattern) + if r.matcher != nil { + return r.matcher } - return r.matcher + return gitignore.CompileIgnoreLines(r.Pattern) } // Parse reads a CODEOWNERS file's bytes and returns the rule list in @@ -67,6 +71,9 @@ func Parse(source []byte) []Rule { continue } rule := Rule{Pattern: fields[0]} + // Precompile the matcher in this single-goroutine parse so the + // concurrent MatchFile hot path only reads rule.matcher. + rule.matcher = gitignore.CompileIgnoreLines(rule.Pattern) if len(fields) > 1 { rule.Owners = append(rule.Owners, fields[1:]...) } diff --git a/internal/codeowners/parser_race_test.go b/internal/codeowners/parser_race_test.go new file mode 100644 index 00000000..6ec5c6ea --- /dev/null +++ b/internal/codeowners/parser_race_test.go @@ -0,0 +1,34 @@ +package codeowners_test + +import ( + "sync" + "testing" + + "github.com/zzet/gortex/internal/codeowners" +) + +// TestMatchFile_ConcurrentNoRace exercises MatchFile from many goroutines over +// a single shared rule list — the way the indexer's per-file coverage +// goroutines (applyCoverageDomains) call it. Pre-fix, matchPattern lazily +// compiled and cached r.matcher without synchronisation, so concurrent first +// calls raced on the shared *Rule (and on the half-published GitIgnore). Run +// under -race; it must be clean. +func TestMatchFile_ConcurrentNoRace(t *testing.T) { + rules := codeowners.Parse([]byte( + "*.go @gophers\n" + + "/docs/ @writers\n" + + "src/**/*.ts @frontend @core\n" + + "*.md @docs\n", + )) + paths := []string{"main.go", "docs/readme.md", "src/a/b/c.ts", "x/y/z.py", "pkg/foo.go", "README.md"} + + var wg sync.WaitGroup + for range 64 { + wg.Go(func() { + for i := range 200 { + _ = codeowners.MatchFile(paths[i%len(paths)], rules) + } + }) + } + wg.Wait() +} diff --git a/internal/contracts/bind.go b/internal/contracts/bind.go index bfa2e483..d6e43cd2 100644 --- a/internal/contracts/bind.go +++ b/internal/contracts/bind.go @@ -31,7 +31,7 @@ import ( // 4. Tiebreak: prefer candidates in files that mention a registration // call like `pb.Register{Service}Server(` or `r.{HTTPVerb}(`. // 5. Uniquely bind or skip (never guess among multiple). -func BindProviderSymbols(reg *Registry, g *graph.Graph) int { +func BindProviderSymbols(reg *Registry, g graph.Store) int { if reg == nil || g == nil { return 0 } @@ -83,7 +83,7 @@ func BindProviderSymbols(reg *Registry, g *graph.Graph) int { // `Register{Service}Server(` call. // 4. Same method name, any receiver — only if there's exactly one // candidate in the repo. -func bindGRPCProvider(c Contract, g *graph.Graph) string { +func bindGRPCProvider(c Contract, g graph.Store) string { method, _ := c.Meta["method"].(string) service, _ := c.Meta["service"].(string) if method == "" || service == "" { @@ -123,7 +123,7 @@ func bindGRPCProvider(c Contract, g *graph.Graph) string { // widely, this is lower-confidence than gRPC binding; a stricter // implementation would also check the Gin/Echo route registration // file, but v1 just name-matches. Returns "" if no unambiguous bind. -func bindOpenAPIProvider(c Contract, g *graph.Graph) string { +func bindOpenAPIProvider(c Contract, g graph.Store) string { op, _ := c.Meta["operationId"].(string) if op == "" { // Fall back to the last path segment; OpenAPI specs diff --git a/internal/contracts/bind_test.go b/internal/contracts/bind_test.go index 84b62fb7..5435b41c 100644 --- a/internal/contracts/bind_test.go +++ b/internal/contracts/bind_test.go @@ -11,7 +11,7 @@ import ( // bindGRPCProvider. func newBindTestGraph(repoPrefix string, methods []struct { id, name, recv string -}) *graph.Graph { +}) graph.Store { g := graph.New() for _, m := range methods { n := &graph.Node{ diff --git a/internal/contracts/load_from_graph.go b/internal/contracts/load_from_graph.go new file mode 100644 index 00000000..e5ee7d20 --- /dev/null +++ b/internal/contracts/load_from_graph.go @@ -0,0 +1,82 @@ +package contracts + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// LoadRegistryFromGraph rebuilds a Registry by scanning every +// KindContract node under repoPrefix and reconstructing the Contract +// struct from Node.Meta. The reverse of the AddNode stamping the +// indexer's commitContracts (and contracts/wrapper.go's +// commitInlinedContractToGraph) do — both write the full record onto +// Meta so a daemon restart can rehydrate without replaying the gob +// snapshot. +// +// Empty repoPrefix loads every contract — useful for ad-hoc probes, +// not a path the daemon normally takes (the warmup rehydrates the +// per-repo registries one prefix at a time so a stale repo's +// contracts don't bleed into a fresh sibling). Returns nil when no +// contracts are recorded for the prefix. +func LoadRegistryFromGraph(g graph.Store, repoPrefix string) *Registry { + if g == nil { + return nil + } + all := g.GetRepoNodes(repoPrefix) + if len(all) == 0 { + return nil + } + reg := NewRegistry() + for _, n := range all { + if n == nil || n.Kind != graph.KindContract { + continue + } + c := contractFromNode(n) + if c.ID == "" { + continue + } + reg.Add(c) + } + if len(reg.All()) == 0 { + return nil + } + return reg +} + +// contractFromNode decodes a Contract from a KindContract graph node's +// Meta payload. Inverse of the AddNode stamping the indexer does. +// Missing fields are left at their zero value — preserves forward +// compatibility if the indexer adds new Meta keys before this loader +// learns about them. +func contractFromNode(n *graph.Node) Contract { + c := Contract{ + ID: n.ID, + FilePath: n.FilePath, + RepoPrefix: n.RepoPrefix, + } + if n.Meta == nil { + return c + } + if v, ok := n.Meta["type"].(string); ok { + c.Type = ContractType(v) + } + if v, ok := n.Meta["role"].(string); ok { + c.Role = Role(v) + } + if v, ok := n.Meta["symbol_id"].(string); ok { + c.SymbolID = v + } + if v, ok := n.Meta["line"].(int); ok { + c.Line = v + } else if v, ok := n.Meta["line"].(int64); ok { + c.Line = int(v) + } + if v, ok := n.Meta["confidence"].(float64); ok { + c.Confidence = v + } + c.WorkspaceID = n.WorkspaceID + c.ProjectID = n.ProjectID + if v, ok := n.Meta["contract_meta"].(map[string]any); ok && len(v) > 0 { + c.Meta = v + } + return c +} diff --git a/internal/contracts/wrapper.go b/internal/contracts/wrapper.go index af38080c..97068f10 100644 --- a/internal/contracts/wrapper.go +++ b/internal/contracts/wrapper.go @@ -38,7 +38,7 @@ type SourceReader func(n *graph.Node) ([]byte, bool) // their per-repo registries — the transient merged registry MultiIndexer // hands in is rebuilt on every ReconcileContractEdges call, so mutations // to it don't survive between invocations). -func InlineWrappers(reg *Registry, g *graph.Graph, read SourceReader) []Contract { +func InlineWrappers(reg *Registry, g graph.Store, read SourceReader) []Contract { if reg == nil || g == nil || read == nil { return nil } @@ -145,7 +145,7 @@ type wrapperInfo struct { // matching a regex pattern: lines + fileNodes + lang + tree feed // EnrichHTTPContractWithTree, which dispatches to the per-language // schema_enrich_*.go detectors and (for Go) the AST overlay. -func enrichInlinedWrapperContract(c *Contract, g *graph.Graph, caller *graph.Node, src []byte) { +func enrichInlinedWrapperContract(c *Contract, g graph.Store, caller *graph.Node, src []byte) { if c == nil || caller == nil || len(src) == 0 { return } @@ -195,19 +195,28 @@ func isWrapperPath(path string) bool { // contracts list output and in the matcher's graph view. Idempotency // matters because ReconcileContractEdges runs on every repo change — // without it each track/index would duplicate edges. -func commitInlinedContractToGraph(g *graph.Graph, c Contract) { +func commitInlinedContractToGraph(g graph.Store, c Contract) { if g == nil { return } if g.GetNode(c.ID) == nil { g.AddNode(&graph.Node{ - ID: c.ID, - Kind: graph.KindContract, - Name: c.ID, - FilePath: c.FilePath, - Language: "contract", - RepoPrefix: c.RepoPrefix, - Meta: map[string]any{"type": string(c.Type), "role": string(c.Role)}, + ID: c.ID, + Kind: graph.KindContract, + Name: c.ID, + FilePath: c.FilePath, + Language: "contract", + RepoPrefix: c.RepoPrefix, + WorkspaceID: c.EffectiveWorkspace(), + ProjectID: c.EffectiveProject(), + Meta: map[string]any{ + "type": string(c.Type), + "role": string(c.Role), + "symbol_id": c.SymbolID, + "line": c.Line, + "confidence": c.Confidence, + "contract_meta": c.Meta, + }, }) } if c.SymbolID == "" { diff --git a/internal/coverage/coverage.go b/internal/coverage/coverage.go index 82c4f8f1..26af9cf8 100644 --- a/internal/coverage/coverage.go +++ b/internal/coverage/coverage.go @@ -168,7 +168,7 @@ func (s CoverageStats) Percent() float64 { // file paths are repo-relative (`pkg/file.go`). Pass "" to skip // the prefix-strip, useful when the profile was generated against // raw paths. -func EnrichGraph(g *graph.Graph, segments []Segment, modulePath string) int { +func EnrichGraph(g graph.Store, segments []Segment, modulePath string) int { if g == nil || len(segments) == 0 { return 0 } @@ -182,6 +182,16 @@ func EnrichGraph(g *graph.Graph, segments []Segment, modulePath string) int { } enriched := 0 + // Collect every node whose Meta we stamp so we can round-trip it + // back through the store at the end. On the in-memory backend the + // in-place mutation already persists (n is the canonical node); on + // disk backends (Ladybug) n is a per-call GetNode/AllNodes + // reconstruction, so without the write-back the coverage_pct stamp + // is silently discarded the moment AllNodes' slice goes out of + // scope — leaving analyze:coverage_gaps / health_score's coverage + // axis empty on Ladybug. Mirrors releases.EnrichGraph and the reach + // index, which already round-trip Meta through AddNode/AddBatch. + var stamped []*graph.Node for _, n := range g.AllNodes() { if !shouldEnrichCoverage(n.Kind) { continue @@ -206,6 +216,7 @@ func EnrichGraph(g *graph.Graph, segments []Segment, modulePath string) int { "num_stmt": stats.NumStmt, "hit": stats.Hit, } + stamped = append(stamped, n) enriched++ // EdgeCoveredBy: invert each EdgeTests pointing at this @@ -240,6 +251,13 @@ func EnrichGraph(g *graph.Graph, segments []Segment, modulePath string) int { }) } } + // Persist the stamped node Meta back through the store in one batch + // (a no-op-ish re-insert on the in-memory backend, the durable write + // on disk backends). Without this the coverage_pct stamps never + // survive on Ladybug. + if len(stamped) > 0 { + g.AddBatch(stamped, nil) + } return enriched } diff --git a/internal/daemon/paths.go b/internal/daemon/paths.go index c32c6557..4484738d 100644 --- a/internal/daemon/paths.go +++ b/internal/daemon/paths.go @@ -93,8 +93,12 @@ func LogFilePath() string { return filepath.Join(os.TempDir(), "gortex-daemon.log") } -// SnapshotPath returns the path the daemon saves graph snapshots to on -// periodic saves and clean shutdown. Loaded on startup for fast cold starts. +// SnapshotPath returns the legacy backend-agnostic snapshot path — +// `daemon.gob.gz` under the state dir. Kept for callers that haven't +// moved to backend-tagged storage yet (cloud indexer worker, ad-hoc +// `gortex index --snapshot` runs). The daemon itself routes through +// BackendSnapshotPath so a memory ↔ ladybug switch can't read the +// other backend's snapshot — see that function's doc. func SnapshotPath() string { if override := os.Getenv("GORTEX_DAEMON_SNAPSHOT"); override != "" { return override @@ -105,6 +109,51 @@ func SnapshotPath() string { return filepath.Join(os.TempDir(), "gortex-daemon.gob.gz") } +// BackendSnapshotPath returns a backend-tagged snapshot path so the +// memory and ladybug backends use distinct files. The memory backend +// snapshot is a full gob+gzip of the in-memory graph; the ladybug +// backend snapshot is metadata-only (FileMtimes, contracts, vector +// index) because the graph itself lives in `store.lbug`. Loading the +// memory backend's snapshot into a ladybug daemon (or vice versa) +// silently produced wrong state — empty graph after ladybug→memory +// switch, decode-and-discard nodes after memory→ladybug — so a fresh +// daemon now picks the right file by backend tag. +// +// Empty backend tag falls back to SnapshotPath() so embedded callers +// that don't know the backend (the cloud indexer worker) keep working. +// +// GORTEX_DAEMON_SNAPSHOT overrides every backend tag — the override +// is an explicit "use exactly this path" signal. +func BackendSnapshotPath(backend string) string { + if override := os.Getenv("GORTEX_DAEMON_SNAPSHOT"); override != "" { + return override + } + tag := normalizeBackendTag(backend) + if tag == "" { + return SnapshotPath() + } + filename := "daemon-" + tag + ".gob.gz" + if dir, ok := stateDir(); ok { + return filepath.Join(dir, filename) + } + return filepath.Join(os.TempDir(), "gortex-"+filename) +} + +// normalizeBackendTag canonicalizes a backend identifier into the +// short tag used in the snapshot filename — "memory" / "ladybug" / +// etc. Empty / unknown input returns the empty string so the caller +// can fall back to the legacy unsuffixed path. +func normalizeBackendTag(backend string) string { + switch backend { + case "memory", "mem", "in-memory": + return "memory" + case "ladybug", "lbug": + return "ladybug" + default: + return "" + } +} + // EnsureParentDir creates the parent directory of path with permissions // 0o700 (user only). Daemon state files live under the user's cache dir // and should not be world-readable. The mode is advisory on Windows, diff --git a/internal/daemon/pidfile_test.go b/internal/daemon/pidfile_test.go new file mode 100644 index 00000000..9182ecb2 --- /dev/null +++ b/internal/daemon/pidfile_test.go @@ -0,0 +1,67 @@ +package daemon + +import ( + "os" + "path/filepath" + "strconv" + "testing" +) + +// TestRunningPID covers the four states RunningPID must distinguish: no PID +// file, a live owner, a stale owner (process gone), and a corrupt file. The +// stale case is the load-bearing one — misreading a crashed daemon's leftover +// PID file as "running" would block every subsequent start. +func TestRunningPID(t *testing.T) { + pidPath := filepath.Join(t.TempDir(), "daemon.pid") + t.Setenv("GORTEX_DAEMON_PIDFILE", pidPath) + + t.Run("no pid file", func(t *testing.T) { + if pid, ok := RunningPID(); ok { + t.Fatalf("want (0,false), got (%d,%v)", pid, ok) + } + }) + + t.Run("live owner", func(t *testing.T) { + writePID(t, pidPath, os.Getpid()) + pid, ok := RunningPID() + if !ok || pid != os.Getpid() { + t.Fatalf("want (%d,true), got (%d,%v)", os.Getpid(), pid, ok) + } + }) + + t.Run("live owner with trailing newline", func(t *testing.T) { + // A pidfile written by `echo`/a process manager ends in "\n". The + // guard must still detect the live owner — otherwise a restart + // silently races the store lock again. + if err := os.WriteFile(pidPath, []byte(strconv.Itoa(os.Getpid())+"\n"), 0o600); err != nil { + t.Fatal(err) + } + if pid, ok := RunningPID(); !ok || pid != os.Getpid() { + t.Fatalf("want (%d,true), got (%d,%v)", os.Getpid(), pid, ok) + } + }) + + t.Run("stale owner", func(t *testing.T) { + // A PID well above any platform's pid_max — guaranteed not live. + writePID(t, pidPath, 1<<30) + if pid, ok := RunningPID(); ok { + t.Fatalf("stale pid must read as not running, got (%d,%v)", pid, ok) + } + }) + + t.Run("corrupt file", func(t *testing.T) { + if err := os.WriteFile(pidPath, []byte("not-a-pid"), 0o600); err != nil { + t.Fatal(err) + } + if pid, ok := RunningPID(); ok { + t.Fatalf("corrupt pid file must read as not running, got (%d,%v)", pid, ok) + } + }) +} + +func writePID(t *testing.T, path string, pid int) { + t.Helper() + if err := os.WriteFile(path, []byte(strconv.Itoa(pid)), 0o600); err != nil { + t.Fatal(err) + } +} diff --git a/internal/daemon/proto.go b/internal/daemon/proto.go index 5a7d4db8..3161352d 100644 --- a/internal/daemon/proto.go +++ b/internal/daemon/proto.go @@ -91,6 +91,16 @@ const ( ControlStatus = "status" ControlShutdown = "shutdown" ControlSearchSymbols = "search_symbols" + // ControlEnrichChurn dispatches to Controller.EnrichChurn — the daemon + // runs the churn enricher against its in-process graph so the CLI + // (and the post-commit / post-merge git hooks) don't have to fight + // the LadyBug write lock the daemon holds. + ControlEnrichChurn = "enrich_churn" + // ControlEnrichReleases dispatches to Controller.EnrichReleases. + // Same routing rationale as ControlEnrichChurn — the CLI hands the + // enrichment to the daemon when one is up so the write lock stays + // uncontested. + ControlEnrichReleases = "enrich_releases" ) // TrackParams is the payload for ControlTrack. @@ -239,6 +249,51 @@ type SearchSymbolsResult struct { Hits []SymbolHit `json:"hits"` } +// EnrichChurnParams is the payload for ControlEnrichChurn. +// +// Path scopes the enrichment to a single tracked repo (matched by +// prefix, abs path, or "" for "every tracked repo"). Branch overrides +// the default-branch resolution — pass "origin/main" / "main" / a tag +// / a SHA. Empty Branch means the daemon picks the default branch +// from each repo's working tree. +type EnrichChurnParams struct { + Path string `json:"path,omitempty"` + Branch string `json:"branch,omitempty"` +} + +// EnrichChurnResult is the payload returned under Result for a +// successful ControlEnrichChurn call. Counts are summed across every +// repo that participated (typically one). +type EnrichChurnResult struct { + Files int `json:"files"` + Symbols int `json:"symbols"` + Branch string `json:"branch"` + HeadSHA string `json:"head_sha"` + DurationMS int64 `json:"duration_ms"` +} + +// EnrichReleasesParams is the payload for ControlEnrichReleases. +// +// Path scopes the enrichment to a single tracked repo (prefix or +// absolute root, "" for "every tracked repo"). Branch restricts the +// considered tags to those reachable from that branch; empty Branch +// means "every tag in the repo" — matches the legacy `analyze +// kind=releases` behaviour. +type EnrichReleasesParams struct { + Path string `json:"path,omitempty"` + Branch string `json:"branch,omitempty"` +} + +// EnrichReleasesResult is the payload returned under Result for a +// successful ControlEnrichReleases call. Files is the count of file +// nodes stamped with meta.added_in across every repo that +// participated. +type EnrichReleasesResult struct { + Files int `json:"files"` + Branch string `json:"branch,omitempty"` + DurationMS int64 `json:"duration_ms"` +} + // TrackedRepoStatus is one row in StatusResponse.TrackedRepos. type TrackedRepoStatus struct { Prefix string `json:"prefix"` diff --git a/internal/daemon/server.go b/internal/daemon/server.go index 6a19e483..686c5cb8 100644 --- a/internal/daemon/server.go +++ b/internal/daemon/server.go @@ -13,6 +13,7 @@ import ( "os/signal" "runtime" "strconv" + "strings" "sync" "time" @@ -97,6 +98,15 @@ type Controller interface { // (Claude Code's Grep-redirect hook) that need a single short answer // without setting up a full MCP session. SearchSymbols(ctx context.Context, params SearchSymbolsParams) (SearchSymbolsResult, error) + // EnrichChurn runs the per-symbol / per-file churn enricher against + // the daemon's in-process graph. Exposed over the control surface so + // CLI invocations (and the post-commit / post-merge git hook) can + // trigger it without taking the LadyBug write lock the daemon owns. + EnrichChurn(ctx context.Context, params EnrichChurnParams) (EnrichChurnResult, error) + // EnrichReleases runs the per-file release enricher against the + // daemon's in-process graph. Same routing rationale as + // EnrichChurn — keeps the LadyBug write lock with the daemon. + EnrichReleases(ctx context.Context, params EnrichReleasesParams) (EnrichReleasesResult, error) // Shutdown is invoked via the control surface and should return // quickly; the daemon's actual shutdown work happens after the // response is written. @@ -517,6 +527,36 @@ func (s *Server) handleControl(_ *Session, req ControlRequest) ControlResponse { return controlErr(ErrInternal, err.Error()) } return ControlResponse{OK: true} + + case ControlEnrichChurn: + var p EnrichChurnParams + if err := unmarshalParams(req.Params, &p); err != nil { + return controlErr(ErrInternal, err.Error()) + } + result, err := s.Controller.EnrichChurn(ctx, p) + if err != nil { + return controlErr(ErrInternal, err.Error()) + } + buf, err := json.Marshal(result) + if err != nil { + return controlErr(ErrInternal, "marshal enrich_churn result: "+err.Error()) + } + return ControlResponse{OK: true, Result: buf} + + case ControlEnrichReleases: + var p EnrichReleasesParams + if err := unmarshalParams(req.Params, &p); err != nil { + return controlErr(ErrInternal, err.Error()) + } + result, err := s.Controller.EnrichReleases(ctx, p) + if err != nil { + return controlErr(ErrInternal, err.Error()) + } + buf, err := json.Marshal(result) + if err != nil { + return controlErr(ErrInternal, "marshal enrich_releases result: "+err.Error()) + } + return ControlResponse{OK: true, Result: buf} } return controlErr(ErrInternal, "unknown control kind: "+req.Kind) } @@ -573,6 +613,39 @@ func (s *Server) writePIDFile() error { return os.WriteFile(path, []byte(strconv.Itoa(os.Getpid())), 0o600) } +// RunningPID reports the PID of a live daemon recorded in the PID file, or +// (0, false) when none is. Unlike IsRunning — which only probes the control +// socket — this still reports a daemon that is *mid-shutdown*: the +// ControlShutdown handler tears the listener down ~100ms after acking, but +// the process stays alive while it flushes and closes the store, and it +// holds the store's on-disk lock until it exits. That window is exactly what +// turned a quick restart into a "failed to open database" lock conflict, so +// callers that must not start a second daemon over the top of a dying one — +// or that need to wait for it to exit — consult this, not the socket. +// +// A PID file whose process is dead is stale (the owner crashed without +// cleanup) and reported as not-running, mirroring writePIDFile's own +// staleness handling. +func RunningPID() (int, bool) { + b, err := os.ReadFile(PIDFilePath()) + if err != nil { + return 0, false + } + // TrimSpace so a PID file written with a trailing newline — by a shell + // `echo`, a process manager, or a hand edit — still parses. The daemon + // writes it without one, but tolerating both is free and the silent + // failure mode (guard never fires, restart races the lock again) is + // exactly the bug this helper exists to prevent. + pid, err := strconv.Atoi(strings.TrimSpace(string(b))) + if err != nil || pid <= 0 { + return 0, false + } + if !platform.ProcessAlive(pid) { + return 0, false + } + return pid, true +} + func (s *Server) trackConn(c net.Conn) { s.connsMu.Lock() s.conns[c] = struct{}{} diff --git a/internal/daemon/server_test.go b/internal/daemon/server_test.go index cf8dfdf3..b0b6db1b 100644 --- a/internal/daemon/server_test.go +++ b/internal/daemon/server_test.go @@ -84,6 +84,14 @@ func (f *fakeController) SearchSymbols(_ context.Context, p SearchSymbolsParams) return SearchSymbolsResult{Hits: f.searchHits}, nil } +func (f *fakeController) EnrichChurn(_ context.Context, _ EnrichChurnParams) (EnrichChurnResult, error) { + return EnrichChurnResult{}, nil +} + +func (f *fakeController) EnrichReleases(_ context.Context, _ EnrichReleasesParams) (EnrichReleasesResult, error) { + return EnrichReleasesResult{}, nil +} + // newDaemon spins up a Server on a short socket path + Fake controller. // macOS limits Unix socket paths to ~104 chars (sizeof(sun_path)), and // Go's t.TempDir() path can exceed that for long test names, so we mint diff --git a/internal/dataflow/dataflow.go b/internal/dataflow/dataflow.go index 390c29cc..932ec699 100644 --- a/internal/dataflow/dataflow.go +++ b/internal/dataflow/dataflow.go @@ -79,13 +79,13 @@ func (p Path) Length() int { return len(p.Edges) } // Engine is the dataflow query backend. It holds a reference to // the graph and exposes the two MCP-ready primitives. Concurrency- -// safe by virtue of relying only on graph.Graph's read methods. +// safe by virtue of relying only on graph.Store's read methods. type Engine struct { - g *graph.Graph + g graph.Store } // New returns an engine backed by the given graph. -func New(g *graph.Graph) *Engine { return &Engine{g: g} } +func New(g graph.Store) *Engine { return &Engine{g: g} } // IsDataflowKind returns true for the three edge kinds the BFS // traverses. @@ -372,6 +372,17 @@ func (p TaintPattern) matches(n *graph.Node) bool { // distinct symbol IDs whose nodes match the pattern. Returns the // caller-friendly nodes themselves so MCP responses can include // names + paths without a second lookup. +// +// The seed set is bounded by taintEligibleKinds — the fixed 8-kind +// allowlist (function/method/param/field/variable/constant/type/ +// interface) that taintEligible enforces. Iterating the per-kind +// NodesByKind bucket of each lets the backend stream only those +// kinds instead of materialising the full node table over cgo; +// on Ladybug AllNodes() pulled ~70k rows per request just to land +// at a handful of taint candidates. Pattern post-filters (name / +// path / pattern-supplied kind) still run Go-side — they compose +// AND, can't be projected onto the bucket index efficiently, and +// the per-bucket population is already small. func (e *Engine) ResolveCandidates(p TaintPattern, limit int) []*graph.Node { if e == nil || e.g == nil || p.Empty() { return nil @@ -380,37 +391,37 @@ func (e *Engine) ResolveCandidates(p TaintPattern, limit int) []*graph.Node { limit = 100 } out := make([]*graph.Node, 0, 16) - for _, n := range e.g.AllNodes() { - if !taintEligible(n) { - continue - } - if !p.matches(n) { - continue - } - out = append(out, n) + for _, k := range taintEligibleKinds { if len(out) >= limit { break } + for n := range e.g.NodesByKind(k) { + if n == nil { + continue + } + if !p.matches(n) { + continue + } + out = append(out, n) + if len(out) >= limit { + break + } + } } sort.SliceStable(out, func(i, j int) bool { return out[i].ID < out[j].ID }) return out } -// taintEligible filters the node universe to symbols that could -// plausibly be a dataflow source or sink. Files / imports / pkg +// taintEligibleKinds is the seed-bucket allowlist of node kinds that +// could plausibly be a dataflow source or sink. Files / imports / pkg // markers don't carry value semantics, so excluding them up front -// keeps the candidate set focused. -func taintEligible(n *graph.Node) bool { - if n == nil { - return false - } - switch n.Kind { - case graph.KindFunction, graph.KindMethod, graph.KindParam, - graph.KindField, graph.KindVariable, graph.KindConstant, - graph.KindType, graph.KindInterface: - return true - } - return false +// keeps the candidate set focused. Kept as a slice (not a set) so +// callers can iterate the NodesByKind bucket of each kind in a stable +// order. +var taintEligibleKinds = []graph.NodeKind{ + graph.KindFunction, graph.KindMethod, graph.KindParam, + graph.KindField, graph.KindVariable, graph.KindConstant, + graph.KindType, graph.KindInterface, } // TaintFinding is one (source, sink) hit produced by TaintPaths. diff --git a/internal/docs/docs.go b/internal/docs/docs.go index a5a8876e..cc333797 100644 --- a/internal/docs/docs.go +++ b/internal/docs/docs.go @@ -105,7 +105,7 @@ type BlameSummary struct { // Deps bundles the runtime dependencies injected by the MCP/CLI layer. type Deps struct { - Graph *graph.Graph + Graph graph.Store History HistoryProvider Blame BlameRunner } @@ -189,7 +189,7 @@ func Generate(deps Deps, opts Options) (*Bundle, error) { // walkNodes does a single pass over symbol nodes and emits the // ownership and stale-code tables in a single pass. -func walkNodes(g *graph.Graph, opts Options, now time.Time) ([]OwnershipRow, []StaleCodeRow) { +func walkNodes(g graph.Store, opts Options, now time.Time) ([]OwnershipRow, []StaleCodeRow) { type ownerStats struct { row OwnershipRow fileSet map[string]struct{} diff --git a/internal/exporter/cypher.go b/internal/exporter/cypher.go index b278818b..34985c52 100644 --- a/internal/exporter/cypher.go +++ b/internal/exporter/cypher.go @@ -25,7 +25,7 @@ import ( // // CREATE INDEX ON :GortexNode(id); // Memgraph // CREATE INDEX FOR (n:GortexNode) ON (n.id); // Neo4j 5.x -func WriteCypher(w io.Writer, g *graph.Graph, opts Options) (Stats, error) { +func WriteCypher(w io.Writer, g graph.Store, opts Options) (Stats, error) { cw := &countingWriter{w: w} nodes, edges, _ := snapshot(g, opts) diff --git a/internal/exporter/exporter.go b/internal/exporter/exporter.go index 61ac6a60..8a53b91a 100644 --- a/internal/exporter/exporter.go +++ b/internal/exporter/exporter.go @@ -1,6 +1,6 @@ // Package exporter writes the in-memory graph to portable formats so users // can load it into external visualization and query tools (Neo4j, Memgraph, -// Kuzu via Cypher; yEd, Gephi, Cytoscape via GraphML). +// Ladybug via Cypher; yEd, Gephi, Cytoscape via GraphML). // // The exporter is read-only and operates on a snapshot — it never mutates // the graph. Filters (repo, kinds) are applied during emission. @@ -69,7 +69,7 @@ func (o *Options) nodeFilter(n *graph.Node) bool { // When opts.DropSynthetic is false (default), edges pointing at IDs that are // not real graph nodes (`unresolved::*`, `external::*`, `annotation::*`) get // synthesized stub nodes added to the result so the call topology is preserved. -func snapshot(g *graph.Graph, opts Options) ([]*graph.Node, []*graph.Edge, map[string]bool) { +func snapshot(g graph.Store, opts Options) ([]*graph.Node, []*graph.Edge, map[string]bool) { allNodes := g.AllNodes() allEdges := g.AllEdges() diff --git a/internal/exporter/graphml.go b/internal/exporter/graphml.go index 913fabf0..a265d601 100644 --- a/internal/exporter/graphml.go +++ b/internal/exporter/graphml.go @@ -15,7 +15,7 @@ import ( // All Gortex node properties are projected to GraphML attributes. // Free-form Meta is JSON-encoded into a single `meta_json` attribute so no // information is lost — viewers that don't care about it ignore it. -func WriteGraphML(w io.Writer, g *graph.Graph, opts Options) (Stats, error) { +func WriteGraphML(w io.Writer, g graph.Store, opts Options) (Stats, error) { cw := &countingWriter{w: w} nodes, edges, _ := snapshot(g, opts) diff --git a/internal/exporter/mermaid.go b/internal/exporter/mermaid.go index fbb8f13d..c68072ca 100644 --- a/internal/exporter/mermaid.go +++ b/internal/exporter/mermaid.go @@ -44,7 +44,7 @@ func (o MermaidOpts) withDefaults() MermaidOpts { // WriteMermaid emits a single Mermaid diagram for the chosen scope. // Use this when the caller asks for one file. For multi-file output // the CLI calls WriteMermaid once per scope into separate files. -func WriteMermaid(w io.Writer, g *graph.Graph, opts MermaidOpts) (Stats, error) { +func WriteMermaid(w io.Writer, g graph.Store, opts MermaidOpts) (Stats, error) { opts = opts.withDefaults() cw := &countingWriter{w: w} @@ -66,7 +66,7 @@ func WriteMermaid(w io.Writer, g *graph.Graph, opts MermaidOpts) (Stats, error) // renderForScope dispatches the Scope to the right diagram builder and // returns the rendered Mermaid plus a (nodes, edges) count that the // caller surfaces in Stats. -func renderForScope(g *graph.Graph, opts MermaidOpts) (body string, nodes, edges int, err error) { +func renderForScope(g graph.Store, opts MermaidOpts) (body string, nodes, edges int, err error) { switch strings.ToLower(opts.Scope) { case "architecture": body, nodes, edges = renderArchitecture(g, opts) @@ -101,7 +101,7 @@ func renderForScope(g *graph.Graph, opts MermaidOpts) (body string, nodes, edges // renderArchitecture builds a top-level community map with hub // annotations. Mirrors the layout used by the wiki page. -func renderArchitecture(g *graph.Graph, opts MermaidOpts) (string, int, int) { +func renderArchitecture(g graph.Store, opts MermaidOpts) (string, int, int) { comms := analysis.DetectCommunities(g) var sb strings.Builder sb.WriteString("graph TB\n") @@ -147,7 +147,7 @@ func renderArchitecture(g *graph.Graph, opts MermaidOpts) (string, int, int) { // renderCommunities is identical to architecture today but exposes // `graph LR` for a wider canvas. Caller picks via Scope. -func renderCommunities(g *graph.Graph, opts MermaidOpts) (string, int, int) { +func renderCommunities(g graph.Store, opts MermaidOpts) (string, int, int) { comms := analysis.DetectCommunities(g) var sb strings.Builder sb.WriteString("graph LR\n") @@ -187,7 +187,7 @@ func renderCommunities(g *graph.Graph, opts MermaidOpts) (string, int, int) { // renderProcesses lists every process as a small flowchart of // caller→callee pairs, capped to keep the rendering responsive. -func renderProcesses(g *graph.Graph, _ MermaidOpts) (string, int, int) { +func renderProcesses(g graph.Store, _ MermaidOpts) (string, int, int) { procs := analysis.DiscoverProcesses(g) var sb strings.Builder sb.WriteString("graph LR\n") @@ -244,7 +244,7 @@ func renderProcesses(g *graph.Graph, _ MermaidOpts) (string, int, int) { // emitCrossCommEdges writes EdgeCalls between communities (filtered // to the kept set) and returns the edge count. -func emitCrossCommEdges(sb *strings.Builder, g *graph.Graph, comms *analysis.CommunityResult, keep map[string]bool) int { +func emitCrossCommEdges(sb *strings.Builder, g graph.Store, comms *analysis.CommunityResult, keep map[string]bool) int { type edge struct { from, to string count int diff --git a/internal/githooks/install.go b/internal/githooks/install.go index bc32776a..ce02cb5b 100644 --- a/internal/githooks/install.go +++ b/internal/githooks/install.go @@ -18,11 +18,33 @@ import ( // Begin and end markers wrap the gortex-managed block inside a hook // file. The MARKER_BEGIN / MARKER_END convention is checked by every // install/uninstall pass and never re-written verbatim by the user. +// +// These exported constants preserve the post-commit form for callers +// that pre-date multi-hook support; new code goes through markerBegin +// / markerEnd which derive the strings from the hook name (so +// post-merge gets its own pair). const ( MarkerBegin = "# gortex-managed:post-commit:begin" MarkerEnd = "# gortex-managed:post-commit:end" ) +// SupportedHooks enumerates the hook names that InstallHook accepts. +// Anything else returns an error so we don't silently scatter our +// markers into hooks we haven't audited. +var SupportedHooks = []string{"post-commit", "post-merge"} + +func isSupportedHook(name string) bool { + for _, h := range SupportedHooks { + if h == name { + return true + } + } + return false +} + +func markerBegin(hook string) string { return "# gortex-managed:" + hook + ":begin" } +func markerEnd(hook string) string { return "# gortex-managed:" + hook + ":end" } + // InstallOpts controls what the installed hook runs. type InstallOpts struct { // Binary is the gortex executable path. Defaults to "gortex" @@ -42,6 +64,23 @@ type InstallOpts struct { // DocsOutPath is the docs bundle output path. Defaults to // "CHANGELOG_AUTO.md". DocsOutPath string + // RegenChurn toggles a `gortex enrich churn` run. The companion + // MCP tool get_churn_rate reads the data this enrich pass writes, + // so wiring this into post-commit / post-merge keeps the signal + // fresh without the agent paying the recompute cost at read time. + RegenChurn bool + // ChurnBranch overrides the branch the enricher pins to. Empty + // means "let `gortex enrich churn` resolve the default branch + // at run time" — the right default for shared repos where the + // branch name varies per checkout. + ChurnBranch string + // RegenReleases toggles a `gortex enrich releases` run. Same + // motivation as RegenChurn: keeps `analyze kind=releases` answers + // fresh without paying the per-call tag walk. + RegenReleases bool + // ReleasesBranch is the rev whose reachable tags bound the + // timeline. Empty means "resolve at hook run time". + ReleasesBranch string } func (o InstallOpts) withDefaults() InstallOpts { @@ -62,12 +101,11 @@ func (o InstallOpts) withDefaults() InstallOpts { // hookCommands builds the body the installer writes inside the // marker block. The body is a `#!/bin/sh` snippet that runs every -// enabled action and tolerates failures so the commit still -// completes when gortex isn't on PATH. -func hookCommands(opts InstallOpts) []string { +// enabled action and tolerates failures so the hook always completes. +func hookCommands(hook string, opts InstallOpts) []string { var cmds []string - cmds = append(cmds, "# Auto-regenerate gortex artefacts after each commit.") - cmds = append(cmds, "# Failures are tolerated so the commit always completes.") + cmds = append(cmds, fmt.Sprintf("# Auto-regenerate gortex artefacts on %s.", hook)) + cmds = append(cmds, "# Failures are tolerated so the hook always completes.") if opts.RegenMermaid { cmds = append(cmds, fmt.Sprintf("(%s export --format mermaid --scope all --out-dir %q --on-commit) >/dev/null 2>&1 || true", opts.Binary, opts.MermaidOutDir)) @@ -80,6 +118,22 @@ func hookCommands(opts InstallOpts) []string { cmds = append(cmds, fmt.Sprintf("(%s docs . --out %q) >/dev/null 2>&1 || true", opts.Binary, opts.DocsOutPath)) } + if opts.RegenChurn { + if strings.TrimSpace(opts.ChurnBranch) == "" { + cmds = append(cmds, fmt.Sprintf("(%s enrich churn) >/dev/null 2>&1 || true", opts.Binary)) + } else { + cmds = append(cmds, fmt.Sprintf("(%s enrich churn --branch=%q) >/dev/null 2>&1 || true", + opts.Binary, opts.ChurnBranch)) + } + } + if opts.RegenReleases { + if strings.TrimSpace(opts.ReleasesBranch) == "" { + cmds = append(cmds, fmt.Sprintf("(%s enrich releases) >/dev/null 2>&1 || true", opts.Binary)) + } else { + cmds = append(cmds, fmt.Sprintf("(%s enrich releases --branch=%q) >/dev/null 2>&1 || true", + opts.Binary, opts.ReleasesBranch)) + } + } if len(cmds) == 2 { // No actions selected — note it explicitly. cmds = append(cmds, "# (no regeneration actions enabled)") @@ -89,10 +143,22 @@ func hookCommands(opts InstallOpts) []string { // HookPath resolves the absolute path of the post-commit hook for the // repository rooted at repoRoot. Honours core.hooksPath when set. +// Thin wrapper over HookPathFor — preserved for backwards compatibility. func HookPath(repoRoot string) (string, error) { + return HookPathFor(repoRoot, "post-commit") +} + +// HookPathFor resolves the absolute path of the named hook file in +// the repository rooted at repoRoot. Honours core.hooksPath when set. +// hook is a bare hook name from SupportedHooks ("post-commit", +// "post-merge", …). +func HookPathFor(repoRoot, hook string) (string, error) { if repoRoot == "" { return "", fmt.Errorf("githooks: repoRoot is empty") } + if !isSupportedHook(hook) { + return "", fmt.Errorf("githooks: unsupported hook %q (supported: %s)", hook, strings.Join(SupportedHooks, ", ")) + } gitDir, err := runGit(repoRoot, "rev-parse", "--git-dir") if err != nil { return "", fmt.Errorf("githooks: not a git repository at %q: %w", repoRoot, err) @@ -112,15 +178,15 @@ func HookPath(repoRoot string) (string, error) { if err := os.MkdirAll(hooksDir, 0o755); err != nil { return "", fmt.Errorf("githooks: create hooks dir %q: %w", hooksDir, err) } - return filepath.Join(hooksDir, "post-commit"), nil + return filepath.Join(hooksDir, hook), nil } // StatusReport describes the current state of the post-commit hook. type StatusReport struct { - HookPath string `json:"hook_path"` - Exists bool `json:"exists"` - Managed bool `json:"managed"` // true iff our marker block is present - Body string `json:"body,omitempty"` + HookPath string `json:"hook_path"` + Exists bool `json:"exists"` + Managed bool `json:"managed"` // true iff our marker block is present + Body string `json:"body,omitempty"` } // Status reports the current state of the post-commit hook. Never @@ -148,36 +214,45 @@ func Status(repoRoot string) (StatusReport, error) { return rep, nil } -// InstallPostCommit writes a post-commit hook with the configured -// commands inside our marker block. Idempotent: re-running replaces +// InstallPostCommit is a backwards-compatible wrapper over InstallHook +// that installs the post-commit hook. New callers should reach for +// InstallHook directly so they can install post-merge too. +func InstallPostCommit(repoRoot string, opts InstallOpts) (string, error) { + return InstallHook(repoRoot, "post-commit", opts) +} + +// InstallHook writes the named hook with the configured commands +// inside a hook-specific marker block. Idempotent: re-running replaces // just the gortex block, leaving any other content intact. // // Returns the absolute path of the hook so callers can show it to the -// user. -func InstallPostCommit(repoRoot string, opts InstallOpts) (string, error) { +// user. `hook` must be one of SupportedHooks. +func InstallHook(repoRoot, hook string, opts InstallOpts) (string, error) { opts = opts.withDefaults() - hookPath, err := HookPath(repoRoot) + hookPath, err := HookPathFor(repoRoot, hook) if err != nil { return "", err } - cmds := hookCommands(opts) + cmds := hookCommands(hook, opts) + mBegin := markerBegin(hook) + mEnd := markerEnd(hook) var newBlock bytes.Buffer - newBlock.WriteString(MarkerBegin) + newBlock.WriteString(mBegin) newBlock.WriteString("\n") for _, line := range cmds { newBlock.WriteString(line) newBlock.WriteString("\n") } - newBlock.WriteString(MarkerEnd) + newBlock.WriteString(mEnd) newBlock.WriteString("\n") existing, _ := os.ReadFile(hookPath) // nil bytes when file doesn't exist var out bytes.Buffer if len(existing) == 0 { out.WriteString("#!/bin/sh\n") - out.WriteString("# Installed by `gortex githook install post-commit`.\n") + fmt.Fprintf(&out, "# Installed by `gortex githook install %s`.\n", hook) out.WriteString("# Marker block below is regenerated on each install/uninstall;\n") out.WriteString("# add your own commands outside the markers and they will be preserved.\n\n") out.Write(newBlock.Bytes()) @@ -187,10 +262,10 @@ func InstallPostCommit(repoRoot string, opts InstallOpts) (string, error) { if !strings.HasPrefix(body, "#!") { out.WriteString("#!/bin/sh\n") } - if strings.Contains(body, MarkerBegin) && strings.Contains(body, MarkerEnd) { + if strings.Contains(body, mBegin) && strings.Contains(body, mEnd) { // Replace existing block. - before, rest, _ := strings.Cut(body, MarkerBegin) - _, after, _ := strings.Cut(rest, MarkerEnd) + before, rest, _ := strings.Cut(body, mBegin) + _, after, _ := strings.Cut(rest, mEnd) after = strings.TrimLeft(after, "\n") out.WriteString(before) out.Write(newBlock.Bytes()) @@ -214,18 +289,25 @@ func InstallPostCommit(repoRoot string, opts InstallOpts) (string, error) { return hookPath, nil } -// UninstallPostCommit removes the gortex-managed block. If the file -// then contains nothing but the shebang and our installer comment, -// the file is deleted entirely. Otherwise we leave the residual -// (user-authored) content in place. +// UninstallPostCommit is a backwards-compatible wrapper. +func UninstallPostCommit(repoRoot string) (string, bool, error) { + return UninstallHook(repoRoot, "post-commit") +} + +// UninstallHook removes the gortex-managed block from the named hook. +// If the file then contains nothing but the shebang and our installer +// comment, the file is deleted entirely. Otherwise we leave the +// residual (user-authored) content in place. // // Returns the path of the hook (whether it now exists or was deleted) // and a bool indicating "block was found and removed". -func UninstallPostCommit(repoRoot string) (string, bool, error) { - hookPath, err := HookPath(repoRoot) +func UninstallHook(repoRoot, hook string) (string, bool, error) { + hookPath, err := HookPathFor(repoRoot, hook) if err != nil { return "", false, err } + mBegin := markerBegin(hook) + mEnd := markerEnd(hook) body, err := os.ReadFile(hookPath) if err != nil { if os.IsNotExist(err) { @@ -234,11 +316,11 @@ func UninstallPostCommit(repoRoot string) (string, bool, error) { return "", false, err } b := string(body) - if !strings.Contains(b, MarkerBegin) || !strings.Contains(b, MarkerEnd) { + if !strings.Contains(b, mBegin) || !strings.Contains(b, mEnd) { return hookPath, false, nil } - before, rest, _ := strings.Cut(b, MarkerBegin) - _, after, _ := strings.Cut(rest, MarkerEnd) + before, rest, _ := strings.Cut(b, mBegin) + _, after, _ := strings.Cut(rest, mEnd) after = strings.TrimLeft(after, "\n") cleaned := strings.TrimRight(before, "\n") + "\n" + after cleaned = strings.TrimSpace(cleaned) diff --git a/internal/githooks/install_test.go b/internal/githooks/install_test.go index 8a61810d..0de5217f 100644 --- a/internal/githooks/install_test.go +++ b/internal/githooks/install_test.go @@ -192,6 +192,78 @@ func TestStatus_NewRepo(t *testing.T) { } } +func TestInstallHook_PostMergeAndChurn(t *testing.T) { + repo := initRepo(t) + path, err := InstallHook(repo, "post-merge", InstallOpts{RegenChurn: true, ChurnBranch: "origin/main"}) + if err != nil { + t.Fatalf("InstallHook post-merge: %v", err) + } + if filepath.Base(path) != "post-merge" { + t.Errorf("expected post-merge hook file, got %s", path) + } + body, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read hook: %v", err) + } + got := string(body) + for _, want := range []string{ + "# gortex-managed:post-merge:begin", + "# gortex-managed:post-merge:end", + "gortex enrich churn", + `--branch="origin/main"`, + } { + if !strings.Contains(got, want) { + t.Errorf("hook missing %q. Body:\n%s", want, got) + } + } + // Post-commit and post-merge should be independently managed. + if _, err := InstallHook(repo, "post-commit", InstallOpts{RegenChurn: true}); err != nil { + t.Fatalf("InstallHook post-commit: %v", err) + } + if _, removed, err := UninstallHook(repo, "post-merge"); err != nil || !removed { + t.Fatalf("UninstallHook post-merge removed=%v err=%v", removed, err) + } + // Post-commit hook should still exist after we uninstalled post-merge. + postCommitPath, err := HookPathFor(repo, "post-commit") + if err != nil { + t.Fatalf("HookPathFor: %v", err) + } + if _, err := os.Stat(postCommitPath); err != nil { + t.Errorf("post-commit hook should survive post-merge uninstall: %v", err) + } +} + +func TestInstallHook_RegenReleases(t *testing.T) { + repo := initRepo(t) + path, err := InstallHook(repo, "post-merge", InstallOpts{ + RegenReleases: true, + ReleasesBranch: "origin/main", + }) + if err != nil { + t.Fatalf("InstallHook post-merge: %v", err) + } + body, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read hook: %v", err) + } + got := string(body) + for _, want := range []string{ + "gortex enrich releases", + `--branch="origin/main"`, + } { + if !strings.Contains(got, want) { + t.Errorf("hook missing %q. Body:\n%s", want, got) + } + } +} + +func TestInstallHook_RejectsUnsupportedHook(t *testing.T) { + repo := initRepo(t) + if _, err := InstallHook(repo, "pre-push", InstallOpts{RegenMermaid: true}); err == nil { + t.Fatal("expected error for unsupported hook pre-push") + } +} + func TestHookPath_HonoursCoreHooksPath(t *testing.T) { repo := initRepo(t) customHooks := filepath.Join(repo, "custom-hooks") diff --git a/internal/graph/edge.go b/internal/graph/edge.go index 50046b0f..e2bdd5cb 100644 --- a/internal/graph/edge.go +++ b/internal/graph/edge.go @@ -3,8 +3,19 @@ package graph type EdgeKind string const ( - EdgeImports EdgeKind = "imports" - EdgeDefines EdgeKind = "defines" + EdgeImports EdgeKind = "imports" + // EdgeContains links a file node to its non-symbol children — import + // nodes today, and a natural home for future side-band kinds + // (todos, fixtures) that "belong to" a file without being defined + // by it. EdgeDefines is the wrong fit for these because the file + // does not semantically *define* an import; it *contains* the + // import statement. Splitting the kinds lets walkers that want + // "real definitions" follow EdgeDefines and walkers that want the + // full file neighbourhood union both. The Ladybug-backed + // GetFileSubGraph relies on this union to fetch every file + // neighbour via the rel-table FROM index in one pass. + EdgeContains EdgeKind = "contains" + EdgeDefines EdgeKind = "defines" EdgeCalls EdgeKind = "calls" EdgeInstantiates EdgeKind = "instantiates" EdgeImplements EdgeKind = "implements" @@ -228,9 +239,17 @@ const ( // dataflow without materialising a graph node per local variable, // edges target a synthetic ID of the form: // - // #local:@ + // #local:@+ // - // where ownerID is the enclosing function/method/closure node. + // where ownerID is the enclosing function/method/closure node + // and the offset is the local's 1-based line minus the owner's + // declaration line (leading `+` flags the value as a relative + // offset). The offset-based ID keeps locals stable across edits + // that shift the function as a whole — only edits inside the + // function above a binding shift that binding's ID. Each ID is + // also materialised as a KindLocal node linked to the owner + // via EdgeMemberOf; the search index excludes KindLocal so + // these per-binding nodes don't pollute name lookups. // These IDs are valid edge endpoints — BFS traverses them — but // no graph node is created, keeping search results free of // every transient binding in every function body. @@ -472,6 +491,15 @@ func BaseKindForCrossRepo(cr EdgeKind) (EdgeKind, bool) { return "", false } +// BaseKindsForCrossRepo returns the set of base edge kinds that have a +// parallel cross_repo_* variant. The slice is the single source of +// truth for callers (DetectCrossRepoEdges, the CrossRepoCandidates +// storage capability) that need the kind list without iterating +// CrossRepoKindFor over every edge. +func BaseKindsForCrossRepo() []EdgeKind { + return []EdgeKind{EdgeCalls, EdgeImplements, EdgeExtends} +} + type Edge struct { From string `json:"from"` To string `json:"to"` @@ -605,7 +633,7 @@ func DefaultOriginFor(kind EdgeKind, confidence float64, semanticSource string) } // Structural AST edges are unambiguous by construction. switch kind { - case EdgeDefines, EdgeImports, EdgeExtends, EdgeMemberOf, + case EdgeDefines, EdgeImports, EdgeContains, EdgeExtends, EdgeMemberOf, EdgeImplements, EdgeProvides, EdgeConsumes, EdgeMatches, // Coverage structural edges: the extractor produces an // unambiguous source→target binding for each, so they share @@ -656,7 +684,7 @@ func DefaultOriginFor(kind EdgeKind, confidence float64, semanticSource string) func ConfidenceLabelFor(kind EdgeKind, confidence float64) string { // Structural edges from AST are always extracted. switch kind { - case EdgeDefines, EdgeImports, EdgeExtends, EdgeMemberOf, EdgeImplements, + case EdgeDefines, EdgeImports, EdgeContains, EdgeExtends, EdgeMemberOf, EdgeImplements, EdgeProvides, EdgeConsumes, EdgeMatches, EdgeParamOf, EdgeAliases, EdgeComposes, EdgeOverrides, EdgeLicensedAs, EdgeOwns, EdgeAuthored, EdgeGeneratedBy, EdgeDependsOnModule, diff --git a/internal/graph/extraction_gap.go b/internal/graph/extraction_gap.go index a8d69162..b2f12ced 100644 --- a/internal/graph/extraction_gap.go +++ b/internal/graph/extraction_gap.go @@ -61,6 +61,24 @@ var usageEdgeKinds = map[EdgeKind]bool{ EdgeTests: true, } +// UsageInboundEdgeKinds returns the canonical list of incoming edge +// kinds that classify a symbol as "used" by ClassifyZeroEdge. Exposed +// for capability callers (NodeDegreeAggregator) that need to mirror +// the in-graph usage filter server-side. Order is stable so the slice +// is safe to pass directly to a Cypher parameter binding. +func UsageInboundEdgeKinds() []EdgeKind { + return []EdgeKind{ + EdgeCalls, + EdgeReferences, + EdgeInstantiates, + EdgeImplements, + EdgeExtends, + EdgeReads, + EdgeWrites, + EdgeTests, + } +} + // ClassifyZeroEdge inspects a symbol's incoming and outgoing edges and // returns how an empty usage/caller/impact query for it should be read. // @@ -75,7 +93,7 @@ var usageEdgeKinds = map[EdgeKind]bool{ // An unknown symbol ID is reported as an extraction gap: a query whose // target is not even in the graph is exactly as untrustworthy as one // whose target was never wired up. -func ClassifyZeroEdge(g *Graph, symbolID string) ZeroEdgeClass { +func ClassifyZeroEdge(g Store, symbolID string) ZeroEdgeClass { if g == nil || symbolID == "" { return ZeroEdgePossibleExtractionGap } @@ -113,7 +131,7 @@ var zeroEdgeMessages = map[ZeroEdgeClass]string{ // query result on symbolID. It returns nil when the symbol has // incoming usage edges (ZeroEdgeNone) — a non-empty result carries no // caveat — so callers can attach the return value unconditionally. -func CaveatForZeroEdge(g *Graph, symbolID string) *ZeroEdgeCaveat { +func CaveatForZeroEdge(g Store, symbolID string) *ZeroEdgeCaveat { class := ClassifyZeroEdge(g, symbolID) if class == ZeroEdgeNone { return nil diff --git a/internal/graph/graph.go b/internal/graph/graph.go index 849aef5e..10726061 100644 --- a/internal/graph/graph.go +++ b/internal/graph/graph.go @@ -1,6 +1,9 @@ package graph import ( + "iter" + "slices" + "strings" "sync" "sync/atomic" ) @@ -484,6 +487,784 @@ func (g *Graph) ResolveMutex() *sync.Mutex { return &g.resolveMu } +// ReindexEdges is the batched sibling of ReindexEdge. The in-memory +// store has no per-call commit overhead so the implementation is a +// straight loop; the value of the batch API lives in the disk +// backends, where it collapses N transaction commits into one. +func (g *Graph) ReindexEdges(batch []EdgeReindex) { + for _, r := range batch { + if r.Edge == nil { + continue + } + g.ReindexEdge(r.Edge, r.OldTo) + } +} + +// EdgesByKind yields every edge whose Kind matches. In-memory +// implementation iterates the materialised AllEdges() slice and +// filters; the algorithmic cost is identical to a hand-written +// "for _, e := range g.AllEdges() { if e.Kind == kind }" loop, which +// is what most call sites used before the predicate API existed. +// Disk backends override this with an index-backed scan. +func (g *Graph) EdgesByKind(kind EdgeKind) iter.Seq[*Edge] { + return func(yield func(*Edge) bool) { + for _, e := range g.AllEdges() { + if e == nil || e.Kind != kind { + continue + } + if !yield(e) { + return + } + } + } +} + +// EdgesByKinds is the in-memory reference implementation of +// EdgesByKindsScanner. Single pass over AllEdges with a small +// pre-built kind set — same algorithmic cost as the legacy `for _, e +// := range g.AllEdges() { if e.Kind == X || e.Kind == Y }` loop the +// edge-driven analyzers used before this capability existed. Disk +// backends override with a single `WHERE kind IN $kinds` query so the +// edge-driven analyzers stop firing one EdgesByKind per kind (or +// worse, scanning AllEdges and filtering Go-side). +// +// Empty kinds yields nothing — matches the disk contract. +func (g *Graph) EdgesByKinds(kinds []EdgeKind) iter.Seq[*Edge] { + if len(kinds) == 0 { + return func(yield func(*Edge) bool) {} + } + set := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + set[k] = struct{}{} + } + if len(set) == 0 { + return func(yield func(*Edge) bool) {} + } + return func(yield func(*Edge) bool) { + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := set[e.Kind]; !ok { + continue + } + if !yield(e) { + return + } + } + } +} + +// NodesByKind yields every node whose Kind matches. Same semantics +// and same in-memory cost story as EdgesByKind. +func (g *Graph) NodesByKind(kind NodeKind) iter.Seq[*Node] { + return func(yield func(*Node) bool) { + for _, n := range g.AllNodes() { + if n == nil || n.Kind != kind { + continue + } + if !yield(n) { + return + } + } + } +} + +// GetNodesByIDs returns a map id→*Node for every input ID that +// exists in the store. The in-memory implementation loops the +// existing GetNode — algorithmic cost identical to a hand-written +// loop in the caller, no concurrency win here. The value of the +// batched API lives in the disk backends, where it collapses N +// per-id SQL/bolt queries into one. +func (g *Graph) GetNodesByIDs(ids []string) map[string]*Node { + if len(ids) == 0 { + return nil + } + out := make(map[string]*Node, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, ok := out[id]; ok { + continue + } + if n := g.GetNode(id); n != nil { + out[id] = n + } + } + return out +} + +// FindNodesByNames is the batched sibling of FindNodesByName. +func (g *Graph) FindNodesByNames(names []string) map[string][]*Node { + if len(names) == 0 { + return nil + } + out := make(map[string][]*Node, len(names)) + for _, name := range names { + if name == "" { + continue + } + if _, ok := out[name]; ok { + continue + } + matches := g.FindNodesByName(name) + if len(matches) > 0 { + out[name] = matches + } + } + return out +} + +// EdgesWithUnresolvedTarget yields every edge whose To has the +// "unresolved::" prefix — the resolver's main pending-edge filter. +// In-memory iterates all edges and prefix-checks; disk backends back +// it with a range scan on a to-keyed index. +func (g *Graph) EdgesWithUnresolvedTarget() iter.Seq[*Edge] { + return func(yield func(*Edge) bool) { + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if !strings.HasPrefix(e.To, "unresolved::") { + continue + } + if !yield(e) { + return + } + } + } +} + +// DeadCodeCandidates is the in-memory reference implementation of +// DeadCodeCandidator. Iterates the requested node kinds and filters +// out anything whose incoming-edge bucket contains an allowlist match +// — same algorithm the analysis.FindDeadCode loop runs, just exposed +// as a single capability the disk backends can short-circuit with +// one Cypher per kind. Pure map / slice walks here; the win lives +// in disk backends where the equivalent path materialises the full +// in-edge map over cgo. +func (g *Graph) DeadCodeCandidates(allowedNodeKinds []NodeKind, allowedInEdgeKinds map[NodeKind][]EdgeKind) []*Node { + if len(allowedNodeKinds) == 0 { + return nil + } + // Build a per-kind set so the inner loop can match against a map + // instead of re-scanning the allowlist slice for every edge. + allowedSet := make(map[NodeKind]map[EdgeKind]struct{}, len(allowedNodeKinds)) + for _, k := range allowedNodeKinds { + set := make(map[EdgeKind]struct{}, len(allowedInEdgeKinds[k])) + for _, ek := range allowedInEdgeKinds[k] { + set[ek] = struct{}{} + } + allowedSet[k] = set + } + + var out []*Node + for _, k := range allowedNodeKinds { + allowed, hasAllow := allowedSet[k] + anyKindCounts := !hasAllow || len(allowed) == 0 + for n := range g.NodesByKind(k) { + if n == nil { + continue + } + incoming := g.GetInEdges(n.ID) + dead := true + for _, e := range incoming { + if e == nil { + continue + } + if anyKindCounts { + dead = false + break + } + if _, ok := allowed[e.Kind]; ok { + dead = false + break + } + } + if dead { + out = append(out, n) + } + } + } + return out +} + +// IfaceImplementsRows is the in-memory reference implementation of +// IfaceImplementsScanner. Joins KindInterface nodes carrying +// Meta["methods"] with their EdgeImplements predecessors and returns +// one row per (typeID, ifaceID, ifaceMeta) tuple. +func (g *Graph) IfaceImplementsRows() []IfaceImplementsRow { + // Index interfaces with methods by ID so the edge walk is O(edges) + // rather than O(edges × interfaces). + ifaceMeta := make(map[string]map[string]any) + for n := range g.NodesByKind(KindInterface) { + if n == nil || n.Meta == nil { + continue + } + if _, ok := n.Meta["methods"]; !ok { + continue + } + ifaceMeta[n.ID] = n.Meta + } + if len(ifaceMeta) == 0 { + return nil + } + var out []IfaceImplementsRow + for e := range g.EdgesByKind(EdgeImplements) { + if e == nil { + continue + } + meta, ok := ifaceMeta[e.To] + if !ok { + continue + } + out = append(out, IfaceImplementsRow{ + TypeID: e.From, + IfaceID: e.To, + IfaceMeta: meta, + }) + } + return out +} + +// NodeDegreeCounts is the in-memory reference implementation of +// NodeDegreeAggregator. Walks the per-node in/out edge buckets the +// in-memory backend already maintains — same cost as the per-node +// loop GraphConnectivity ran before this capability landed, just +// folded into one method call so the analyzer can pick the disk +// backend's bulk implementation transparently. Missing ids are +// elided from the result (matching the disk contract). +func (g *Graph) NodeDegreeCounts(ids []string, usageKinds []EdgeKind) []NodeDegreeRow { + if len(ids) == 0 { + return nil + } + usage := make(map[EdgeKind]struct{}, len(usageKinds)) + for _, k := range usageKinds { + usage[k] = struct{}{} + } + seen := make(map[string]struct{}, len(ids)) + out := make([]NodeDegreeRow, 0, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, dup := seen[id]; dup { + continue + } + seen[id] = struct{}{} + // Skip unknown ids — the disk backend's WHERE n.id IN $ids + // clause naturally drops them; mirror that here so both + // backends return the same row count. + if g.GetNode(id) == nil { + continue + } + in := g.GetInEdges(id) + row := NodeDegreeRow{ + NodeID: id, + InCount: len(in), + OutCount: len(g.GetOutEdges(id)), + } + if len(usage) > 0 { + for _, e := range in { + if e == nil { + continue + } + if _, ok := usage[e.Kind]; ok { + row.UsageInCount++ + } + } + } + out = append(out, row) + } + return out +} + +// FileImporters is the in-memory reference implementation of the +// FileImporters capability. Iterates EdgeImports via the byKind +// bucket — same cost as the legacy AllEdges()+filter loop in +// handleCheckReferences, but exposes the predicate as a single call +// the disk backends can short-circuit with one Cypher. +// +// Matches edges whose To node satisfies filePath == n.FilePath OR +// filePath == n.ID. The dual match keeps parity with the indexer's +// two import shapes: file-targeted imports point at the file node +// (n.ID == filePath), while symbol-targeted imports land on a symbol +// whose FilePath equals filePath. +func (g *Graph) FileImporters(filePath string) []FileImporterRow { + if filePath == "" { + return nil + } + var out []FileImporterRow + for e := range g.EdgesByKind(EdgeImports) { + if e == nil { + continue + } + to := g.GetNode(e.To) + if to == nil { + continue + } + if to.FilePath != filePath && to.ID != filePath { + continue + } + from := g.GetNode(e.From) + if from == nil { + continue + } + out = append(out, FileImporterRow{ + FromFile: from.FilePath, + FromID: from.ID, + FromName: from.Name, + FromKind: from.Kind, + }) + } + return out +} + +// NodeFanCounts is the in-memory reference implementation of +// NodeFanAggregator. Two passes over the per-node in/out edge buckets +// the in-memory backend already maintains, filtered by the caller's +// kind sets. Disk backends override with one Cypher per direction +// to drop the AllEdges() materialisation FindHotspots / health_score +// were running every call. +func (g *Graph) NodeFanCounts(ids []string, fanInKinds []EdgeKind, fanOutKinds []EdgeKind) []NodeFanRow { + if len(ids) == 0 { + return nil + } + inSet := make(map[EdgeKind]struct{}, len(fanInKinds)) + for _, k := range fanInKinds { + inSet[k] = struct{}{} + } + outSet := make(map[EdgeKind]struct{}, len(fanOutKinds)) + for _, k := range fanOutKinds { + outSet[k] = struct{}{} + } + seen := make(map[string]struct{}, len(ids)) + out := make([]NodeFanRow, 0, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, dup := seen[id]; dup { + continue + } + seen[id] = struct{}{} + if g.GetNode(id) == nil { + continue + } + row := NodeFanRow{NodeID: id} + if len(inSet) > 0 { + for _, e := range g.GetInEdges(id) { + if e == nil { + continue + } + if _, ok := inSet[e.Kind]; ok { + row.FanIn++ + } + } + } + if len(outSet) > 0 { + for _, e := range g.GetOutEdges(id) { + if e == nil { + continue + } + if _, ok := outSet[e.Kind]; ok { + row.FanOut++ + } + } + } + out = append(out, row) + } + return out +} + +// InEdgeCountsByKind is the in-memory reference implementation of +// the InEdgeCounter capability. Walks each requested EdgeKind via +// the byKind bucket and increments a per-To counter. Same algorithm +// the AllEdges-bucketing fallback in handleGetUntestedSymbols runs; +// the win lives in disk backends where AllEdges() materialises every +// edge over cgo just to bucket by target. +// +// Dedupes the kind set up front so a sloppy caller passing the same +// kind twice doesn't double-count — matches the Cypher backend's +// IN-list dedup. +func (g *Graph) InEdgeCountsByKind(kinds []EdgeKind) map[string]int { + if len(kinds) == 0 { + return nil + } + seen := make(map[EdgeKind]struct{}, len(kinds)) + out := make(map[string]int) + for _, k := range kinds { + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + for e := range g.EdgesByKind(k) { + if e == nil { + continue + } + out[e.To]++ + } + } + return out +} + +// NodesInFilesByKind is the in-memory reference implementation of +// the NodesInFilesByKindFinder capability. Filters NodesByKind for +// each requested kind down to the file set. Same algorithm as the +// Go-side loop in find_declaration's buildDeclFileIndex; the win +// lives in disk backends where AllNodes() over cgo dwarfs the few +// hundred surviving rows. +func (g *Graph) NodesInFilesByKind(files []string, kinds []NodeKind) []*Node { + if len(files) == 0 || len(kinds) == 0 { + return nil + } + wanted := make(map[string]struct{}, len(files)) + for _, f := range files { + if f == "" { + continue + } + wanted[f] = struct{}{} + } + if len(wanted) == 0 { + return nil + } + // Dedup the kinds so a sloppy caller doesn't double-scan. + seenKind := make(map[NodeKind]struct{}, len(kinds)) + var out []*Node + for _, k := range kinds { + if _, ok := seenKind[k]; ok { + continue + } + seenKind[k] = struct{}{} + for n := range g.NodesByKind(k) { + if n == nil { + continue + } + if _, ok := wanted[n.FilePath]; !ok { + continue + } + out = append(out, n) + } + } + return out +} + +// NodesByKinds is the in-memory reference implementation of the +// NodesByKindsScanner capability. Loops the existing NodesByKind +// iterator per requested kind — algorithmic cost identical to the +// hand-written `for _, n := range AllNodes() if n.Kind == K` pattern +// the metadata analyzers used before. The win lives in the disk +// backends, where one IN-list Cypher replaces the AllNodes() pull. +// +// Dedupes the kind set up front so a sloppy caller passing the same +// kind twice doesn't double-yield — matches the Cypher backend's +// IN-list dedup. Empty kinds returns nil without touching the store. +func (g *Graph) NodesByKinds(kinds []NodeKind) []*Node { + if len(kinds) == 0 { + return nil + } + seen := make(map[NodeKind]struct{}, len(kinds)) + var out []*Node + for _, k := range kinds { + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + for n := range g.NodesByKind(k) { + if n == nil { + continue + } + out = append(out, n) + } + } + return out +} + +// EdgeAdjacencyForKinds is the in-memory reference implementation of +// the EdgeAdjacencyForKinds capability. One AllEdges scan that yields +// (from, to) pairs whose Kind is in the supplied edge-kind set AND +// whose endpoints both have a Kind in the node-kind set — identical +// shape to the Cypher join the disk backends fold into a single +// query. +// +// Empty edgeKinds or empty nodeKinds yields nothing — matches the +// disk contract. +func (g *Graph) EdgeAdjacencyForKinds(edgeKinds []EdgeKind, nodeKinds []NodeKind) iter.Seq[[2]string] { + if len(edgeKinds) == 0 || len(nodeKinds) == 0 { + return func(yield func([2]string) bool) {} + } + eset := make(map[EdgeKind]struct{}, len(edgeKinds)) + for _, k := range edgeKinds { + if k == "" { + continue + } + eset[k] = struct{}{} + } + nset := make(map[NodeKind]struct{}, len(nodeKinds)) + for _, k := range nodeKinds { + if k == "" { + continue + } + nset[k] = struct{}{} + } + if len(eset) == 0 || len(nset) == 0 { + return func(yield func([2]string) bool) {} + } + return func(yield func([2]string) bool) { + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := eset[e.Kind]; !ok { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if _, ok := nset[from.Kind]; !ok { + continue + } + if _, ok := nset[to.Kind]; !ok { + continue + } + if !yield([2]string{e.From, e.To}) { + return + } + } + } +} + +// CommunityCrossingsByKind is the in-memory reference implementation +// of the CommunityCrossingsByKind capability. AllEdges scan with the +// kind-set filter, then a Go-side community comparison per edge — +// the exact loop FindHotspots.countCrossings ran before this +// capability existed. +// +// Empty kinds or empty nodeToComm returns nil. Zero-count sources +// never surface (matches the disk contract — callers probe by +// existence). +func (g *Graph) CommunityCrossingsByKind(kinds []EdgeKind, nodeToComm map[string]string) map[string]int { + if len(kinds) == 0 || len(nodeToComm) == 0 { + return nil + } + set := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + set[k] = struct{}{} + } + if len(set) == 0 { + return nil + } + out := make(map[string]int) + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := set[e.Kind]; !ok { + continue + } + from := nodeToComm[e.From] + to := nodeToComm[e.To] + if from == "" || to == "" || from == to { + continue + } + out[e.From]++ + } + if len(out) == 0 { + return nil + } + return out +} + +// NodeIDsByKinds is the in-memory reference implementation of the +// NodeIDsByKinds capability. Single AllNodes pass with a kind-set +// filter, deduped on input — same algorithm as NodesByKinds but +// returns only the ID column. The disk-backend win is the projection +// drop, not the algorithmic shape. +func (g *Graph) NodeIDsByKinds(kinds []NodeKind) []string { + if len(kinds) == 0 { + return nil + } + seen := make(map[NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + seen[k] = struct{}{} + } + if len(seen) == 0 { + return nil + } + var out []string + for _, n := range g.AllNodes() { + if n == nil { + continue + } + if _, ok := seen[n.Kind]; !ok { + continue + } + out = append(out, n.ID) + } + return out +} + +// EdgeKindCounts is the in-memory reference implementation of the +// EdgeKindCounter capability. One AllEdges scan with a per-kind +// tally — the exact loop the get_surprising_connections Go fallback +// already runs today, just exposed as a single method call so the +// disk backends can short-circuit with a Cypher GROUP BY. +// +// Empty graph returns nil so callers can short-circuit a downstream +// "kindCounts != nil" gate. +func (g *Graph) EdgeKindCounts() map[EdgeKind]int { + out := map[EdgeKind]int{} + for _, e := range g.AllEdges() { + if e == nil { + continue + } + out[e.Kind]++ + } + if len(out) == 0 { + return nil + } + return out +} + +// CrossRepoEdgeCounts is the in-memory reference implementation of +// CrossRepoEdgeAggregator. Iterates the four cross_repo_* byKind +// buckets and groups by (kind, fromRepoPrefix, toRepoPrefix). Same +// algorithm as the architecture handler's AllEdges loop but exposes +// it as a single capability so disk backends can fold the join into +// one Cypher. +// +// Returns nil when the graph carries no cross-repo edges (single- +// repo mode) so the caller's empty-list rendering kicks in without +// allocating. +func (g *Graph) CrossRepoEdgeCounts() []CrossRepoEdgeRow { + type key struct { + kind EdgeKind + fromRepo string + toRepo string + } + counts := map[key]int{} + for _, k := range []EdgeKind{ + EdgeCrossRepoCalls, + EdgeCrossRepoImplements, + EdgeCrossRepoExtends, + } { + for e := range g.EdgesByKind(k) { + if e == nil { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + counts[key{kind: e.Kind, fromRepo: from.RepoPrefix, toRepo: to.RepoPrefix}]++ + } + } + if len(counts) == 0 { + return nil + } + out := make([]CrossRepoEdgeRow, 0, len(counts)) + for k, c := range counts { + out = append(out, CrossRepoEdgeRow{ + Kind: k.kind, FromRepo: k.fromRepo, ToRepo: k.toRepo, Count: c, + }) + } + return out +} + +// FileImportCounts is the in-memory reference implementation of +// FileImportAggregator. Iterates the EdgeImports byKind bucket and +// groups by the target file path — coalescing to To-node FilePath +// or, when the indexer pointed the import edge at the file node +// directly, the target ID. Same algorithm as the AllEdges loop in +// mostImportedFiles; the win lives in disk backends where AllEdges +// + per-edge GetNode round-trips over cgo dwarf the few hundred +// surviving rows. +// +// scope, when non-nil, bounds the result to edges whose target ID +// lies in the slice (session-workspace clamp). A nil scope counts +// every imports edge. An empty (non-nil) scope returns nil — never +// a whole-graph scan. +func (g *Graph) FileImportCounts(scope []string) []FileImportCountRow { + if scope != nil && len(scope) == 0 { + return nil + } + var allowed map[string]struct{} + if scope != nil { + allowed = make(map[string]struct{}, len(scope)) + for _, id := range scope { + if id == "" { + continue + } + allowed[id] = struct{}{} + } + if len(allowed) == 0 { + return nil + } + } + counts := map[string]int{} + for e := range g.EdgesByKind(EdgeImports) { + if e == nil { + continue + } + target := g.GetNode(e.To) + if target == nil { + continue + } + if allowed != nil { + if _, ok := allowed[target.ID]; !ok { + continue + } + } + path := target.FilePath + if path == "" { + path = target.ID + } + if path == "" { + continue + } + counts[path]++ + } + if len(counts) == 0 { + return nil + } + out := make([]FileImportCountRow, 0, len(counts)) + for p, c := range counts { + out = append(out, FileImportCountRow{FilePath: p, Count: c}) + } + return out +} + +// SetEdgeProvenanceBatch is the batched sibling of SetEdgeProvenance. +// Same story as ReindexEdges: per-call in memory, one transaction in +// the disk backends. Returns the number of edges whose Origin +// actually changed (matches the sum of per-edge SetEdgeProvenance +// boolean returns). +func (g *Graph) SetEdgeProvenanceBatch(batch []EdgeProvenanceUpdate) int { + changed := 0 + for _, u := range batch { + if u.Edge == nil { + continue + } + if g.SetEdgeProvenance(u.Edge, u.NewOrigin) { + changed++ + } + } + return changed +} + // shardIdx picks the shard index for an ID using FNV-1a. Inlined to // avoid the per-call hash-object allocation that the stdlib's // fnv.New32a() incurs — shardIdx is on the hottest path in the graph @@ -708,7 +1489,7 @@ func (g *Graph) AddBatch(nodes []*Node, edges []*Edge) { inEdgesByShard[shardIdx(e.To)] = append(inEdgesByShard[shardIdx(e.To)], e) } - for i := 0; i < numShards; i++ { + for i := range numShards { if len(nodesByShard[i]) == 0 && len(outEdgesByShard[i]) == 0 && len(inEdgesByShard[i]) == 0 { continue } @@ -1036,6 +1817,40 @@ func (g *Graph) FindNodesByNameInRepo(name, repoPrefix string) []*Node { return out } +// FindNodesByNameContaining returns nodes whose Name (case-insensitive) +// contains substr. The in-memory backend has no name-substring index, +// so this is a single pass over the byName buckets (which already group +// nodes by exact name — the same allocation we'd pay for one FindNodesByName +// call per distinct name). limit caps the slice; 0 means "no limit". +// +// Stable order is the caller's responsibility — bucket iteration is +// deterministic per shard but cross-shard order isn't fixed. +func (g *Graph) FindNodesByNameContaining(substr string, limit int) []*Node { + if substr == "" { + return nil + } + needle := strings.ToLower(substr) + var out []*Node + for _, s := range g.shards { + s.mu.RLock() + for name, bucket := range s.byName { + if !strings.Contains(strings.ToLower(name), needle) { + continue + } + out = append(out, bucket...) + if limit > 0 && len(out) >= limit { + s.mu.RUnlock() + return out[:limit] + } + } + s.mu.RUnlock() + } + if limit > 0 && len(out) > limit { + out = out[:limit] + } + return out +} + // GetFileNodes returns all nodes defined in the given file. func (g *Graph) GetFileNodes(filePath string) []*Node { var out []*Node @@ -1071,6 +1886,48 @@ func (g *Graph) GetInEdges(nodeID string) []*Edge { return out } +// GetOutEdgesByNodeIDs returns a map id→outgoing edges for every input +// id. The in-memory backend loops the existing GetOutEdges — cost +// matches a hand-written loop in the caller. The value of the batched +// API lives in disk backends, where it collapses N point lookups into +// one bulk Cypher query. Empty input returns nil; duplicate ids are +// deduped naturally. Missing ids are absent from the returned map. +func (g *Graph) GetOutEdgesByNodeIDs(ids []string) map[string][]*Edge { + if len(ids) == 0 { + return nil + } + out := make(map[string][]*Edge, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, ok := out[id]; ok { + continue + } + out[id] = g.GetOutEdges(id) + } + return out +} + +// GetInEdgesByNodeIDs is the inbound sibling of GetOutEdgesByNodeIDs. +// See that doc-comment for the contract. +func (g *Graph) GetInEdgesByNodeIDs(ids []string) map[string][]*Edge { + if len(ids) == 0 { + return nil + } + out := make(map[string][]*Edge, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, ok := out[id]; ok { + continue + } + out[id] = g.GetInEdges(id) + } + return out +} + // EvictFile removes all nodes and edges belonging to the given file // path. Nodes for one file can span many shards (different IDs hash // differently), so we lock all shards for this multi-shard operation. @@ -1316,6 +2173,81 @@ func (g *Graph) AllEdges() []*Edge { return out } +// DrainNodes yields every node and FREES the graph's internal node +// storage shard-by-shard as it goes. After Drain finishes the graph +// holds zero nodes. Intended for the one-shot persist path where the +// shadow is about to be discarded: AllNodes would pin the full 11 GB +// graph for the entire persist phase; Drain releases each shard's +// node map (and the per-name / per-file / per-repo indexes) as soon +// as that shard's iteration completes, so GC can reclaim ~700 MB at +// a time on a Linux-scale graph instead of waiting for the indexer's +// defer to return. +// +// The graph remains structurally consistent during Drain — edges and +// other indexes are untouched, only the node maps are emptied. If +// you also need DrainEdges, call them in either order; both are +// destructive and idempotent (a second call yields nothing). +func (g *Graph) DrainNodes() iter.Seq[*Node] { + return func(yield func(*Node) bool) { + for _, s := range g.shards { + s.mu.Lock() + nodes := s.nodes + // Replace with an empty map so the shard's read methods + // keep working (return zero) instead of nil-panicking. + s.nodes = map[string]*Node{} + s.byFile = map[string][]*Node{} + s.byName = map[string][]*Node{} + s.byQual = map[string]*Node{} + s.byRepo = map[string][]*Node{} + s.byFileIdx = map[string]map[string]int{} + s.byNameIdx = map[string]map[string]int{} + s.byRepoIdx = map[string]map[string]int{} + s.mu.Unlock() + for _, n := range nodes { + if !yield(n) { + return + } + } + // nodes goes out of scope here — the shard's old map plus + // every *Node it referenced is now GC-eligible (assuming + // the caller has dropped any remaining reference). + } + } +} + +// DrainEdges yields every edge and FREES the graph's internal edge +// storage shard-by-shard. Same semantics as DrainNodes — meant for +// the persist hand-off, not for general queries. +func (g *Graph) DrainEdges() iter.Seq[*Edge] { + // Invalidate the AllEdges cache so any subsequent caller doesn't + // see drained-shard zombies. The cache holds direct *Edge slice + // references that DrainEdges is about to start freeing. + g.allEdgesCacheMu.Lock() + g.allEdgesCache = nil + g.allEdgesCacheGen = 0 + g.allEdgesCacheMu.Unlock() + return func(yield func(*Edge) bool) { + for _, s := range g.shards { + s.mu.Lock() + outEdges := s.outEdges + s.outEdges = map[string][]*Edge{} + s.inEdges = map[string][]*Edge{} + s.outEdgeIdx = map[string]map[edgeHash]int{} + s.inEdgeIdx = map[string]map[edgeHash]int{} + s.outEdgeKeys = map[string][]edgeHash{} + s.inEdgeKeys = map[string][]edgeHash{} + s.mu.Unlock() + for _, edges := range outEdges { + for _, e := range edges { + if !yield(e) { + return + } + } + } + } + } +} + // Stats returns summary counts by kind and language. func (g *Graph) Stats() GraphStats { g.lockAllRead() @@ -1364,6 +2296,32 @@ func (g *Graph) GetRepoNodes(repoPrefix string) []*Node { return out } +// GetRepoEdges returns every edge whose source node has the given +// RepoPrefix — the in-memory reference implementation of the +// Store-interface method. Walks each shard's byRepo bucket and +// concatenates that node's outEdges in place (no per-node +// GetOutEdges call, so no per-call slice copy). Equivalent in +// observable behaviour to the GetRepoNodes(r) × GetOutEdges loop +// callers used before this method existed; meant to give disk +// backends a single-query hook without changing in-memory cost. +// Empty repoPrefix returns nil (callers use AllEdges() instead). +func (g *Graph) GetRepoEdges(repoPrefix string) []*Edge { + if repoPrefix == "" { + return nil + } + var out []*Edge + for _, s := range g.shards { + s.mu.RLock() + for _, n := range s.byRepo[repoPrefix] { + if src := s.outEdges[n.ID]; len(src) > 0 { + out = append(out, src...) + } + } + s.mu.RUnlock() + } + return out +} + // EvictRepo removes all nodes with matching RepoPrefix and all edges // referencing those nodes. Returns counts of removed nodes and edges. func (g *Graph) EvictRepo(repoPrefix string) (nodesRemoved, edgesRemoved int) { @@ -1604,3 +2562,668 @@ func (g *Graph) RepoPrefixes() []string { } return prefixes } + +// InDegreeForNodes is the in-memory reference implementation of the +// InDegreeForNodes capability. Walks the per-target in-edge buckets +// directly — the same arithmetic the disk backends push into a single +// Cypher COUNT. +func (g *Graph) InDegreeForNodes(ids []string) map[string]int { + if len(ids) == 0 { + return nil + } + out := make(map[string]int, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + c := len(g.GetInEdges(id)) + if c == 0 { + continue + } + out[id] = c + } + return out +} + +// ReachableForwardByKinds is the in-memory reference implementation +// of the ReachableForwardByKinds capability. Layer-by-layer BFS from +// the seed frontier, following only edges whose Kind is in the +// supplied set. Pure map / slice walks here — the win is the disk +// backends fold the BFS into one variable-length Cypher match. +func (g *Graph) ReachableForwardByKinds(seeds []string, kinds []EdgeKind) map[string]bool { + if len(seeds) == 0 { + return nil + } + covered := make(map[string]bool, len(seeds)) + frontier := make([]string, 0, len(seeds)) + for _, id := range seeds { + if id == "" || covered[id] { + continue + } + covered[id] = true + frontier = append(frontier, id) + } + if len(kinds) == 0 { + return covered + } + allowed := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + allowed[k] = struct{}{} + } + for len(frontier) > 0 { + next := frontier[:0:0] + for _, id := range frontier { + for _, e := range g.GetOutEdges(id) { + if e == nil { + continue + } + if _, ok := allowed[e.Kind]; !ok { + continue + } + if !covered[e.To] { + covered[e.To] = true + next = append(next, e.To) + } + } + } + frontier = next + } + return covered +} + +// ThrowerErrorSurface is the in-memory reference implementation of +// the ThrowerErrorSurfacer capability. Walks EdgeThrows once for the +// per-thrower target dedup, then walks each thrower's out-edges for +// the EdgeEmits → KindString(context=error_msg) attachment. The disk +// backends collapse both passes into two Cypher GROUP BYs. +func (g *Graph) ThrowerErrorSurface(pathPrefix string) []ThrowerErrorRow { + byThrower := map[string]*ThrowerErrorRow{} + addUnique := func(set []string, v string) []string { + if slices.Contains(set, v) { + return set + } + return append(set, v) + } + for e := range g.EdgesByKind(EdgeThrows) { + if e == nil { + continue + } + if pathPrefix != "" && !strings.HasPrefix(e.FilePath, pathPrefix) { + continue + } + row, ok := byThrower[e.From] + if !ok { + file := e.FilePath + line := e.Line + n := g.GetNode(e.From) + if n != nil { + if file == "" { + file = n.FilePath + } + if line == 0 { + line = n.StartLine + } + } + row = &ThrowerErrorRow{ThrowerID: e.From, FilePath: file, Line: line} + byThrower[e.From] = row + } + row.Throws++ + row.ErrorTargets = addUnique(row.ErrorTargets, e.To) + } + for thrower, row := range byThrower { + for _, e := range g.GetOutEdges(thrower) { + if e == nil || e.Kind != EdgeEmits { + continue + } + n := g.GetNode(e.To) + if n == nil || n.Kind != KindString { + continue + } + ctxLabel, _ := n.Meta["context"].(string) + if ctxLabel != "error_msg" { + continue + } + row.ErrorMsgs = addUnique(row.ErrorMsgs, n.Name) + } + } + out := make([]ThrowerErrorRow, 0, len(byThrower)) + for _, r := range byThrower { + out = append(out, *r) + } + return out +} + +// MemberMethodsByType is the in-memory reference implementation of the +// MemberMethodsByType capability. One EdgesByKind(EdgeMemberOf) walk +// joined with the in-memory node table to filter Kind == KindMethod +// and project the four columns the resolver consumes — the exact +// loop the resolver runs today, just exposed as a single method call +// so disk backends can fold the join into one Cypher. +// +// Empty graph returns nil. Per-type method lists are deduplicated by +// MethodID so a method that appears twice in the EdgeMemberOf bucket +// (defensive against double-insertion) yields a single row. +func (g *Graph) MemberMethodsByType() map[string][]MemberMethodInfo { + out := map[string][]MemberMethodInfo{} + seen := map[string]map[string]struct{}{} + for e := range g.EdgesByKind(EdgeMemberOf) { + if e == nil { + continue + } + m := g.GetNode(e.From) + if m == nil || m.Kind != KindMethod { + continue + } + typeID := e.To + dedup := seen[typeID] + if dedup == nil { + dedup = make(map[string]struct{}) + seen[typeID] = dedup + } + if _, ok := dedup[m.ID]; ok { + continue + } + dedup[m.ID] = struct{}{} + out[typeID] = append(out[typeID], MemberMethodInfo{ + MethodID: m.ID, + Name: m.Name, + FilePath: m.FilePath, + StartLine: m.StartLine, + RepoPrefix: m.RepoPrefix, + }) + } + if len(out) == 0 { + return nil + } + return out +} + +// StructuralParentEdges is the in-memory reference implementation of +// the StructuralParentEdges capability. Single AllEdges scan with the +// (Extends | Implements | Composes) kind gate and the +// (Type | Interface) endpoint-kind gate applied per edge. +// +// Empty graph or no matching edges returns nil. +func (g *Graph) StructuralParentEdges() []StructuralParentEdgeRow { + var out []StructuralParentEdgeRow + for _, e := range g.AllEdges() { + if e == nil { + continue + } + switch e.Kind { + case EdgeExtends, EdgeImplements, EdgeComposes: + default: + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.Kind != KindType && from.Kind != KindInterface { + continue + } + if to.Kind != KindType && to.Kind != KindInterface { + continue + } + out = append(out, StructuralParentEdgeRow{ + FromID: from.ID, + ToID: to.ID, + FromKind: from.Kind, + ToKind: to.Kind, + Origin: e.Origin, + }) + } + return out +} + +// CrossRepoCandidates is the in-memory reference implementation of the +// CrossRepoCandidates capability. Single AllEdges scan with the +// edge-kind gate + the (non-empty, distinct) repo-prefix gate. Returns +// one row per surviving edge carrying the underlying Edge pointer plus +// the two RepoPrefix values projected from the endpoints. +// +// Empty baseKinds returns nil — matches the disk-backend contract. +// Single-repo graphs (or graphs whose nodes carry no RepoPrefix) +// return no rows because the prefix gate filters them out. +func (g *Graph) CrossRepoCandidates(baseKinds []EdgeKind) []CrossRepoCandidateRow { + if len(baseKinds) == 0 { + return nil + } + kset := make(map[EdgeKind]struct{}, len(baseKinds)) + for _, k := range baseKinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + if len(kset) == 0 { + return nil + } + var out []CrossRepoCandidateRow + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.RepoPrefix == "" || to.RepoPrefix == "" { + continue + } + if from.RepoPrefix == to.RepoPrefix { + continue + } + out = append(out, CrossRepoCandidateRow{ + Edge: e, + FromRepo: from.RepoPrefix, + ToRepo: to.RepoPrefix, + }) + } + return out +} + +// ExtractCandidates is the in-memory reference implementation of +// ExtractCandidatesScanner. Walks NodesByKind for function + method, +// applies the threshold gates locally, and counts distinct in-edge +// From / out-edge To values restricted to the requested edge kinds. +func (g *Graph) ExtractCandidates( + kinds []EdgeKind, + minLines, minCallers, minFanOut int, + pathPrefix string, +) []ExtractCandidateRow { + if len(kinds) == 0 { + return nil + } + kset := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + if len(kset) == 0 { + return nil + } + var out []ExtractCandidateRow + for _, n := range g.NodesByKinds([]NodeKind{KindFunction, KindMethod}) { + if n == nil { + continue + } + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue + } + if n.StartLine == 0 || n.EndLine == 0 { + continue + } + lineCount := n.EndLine - n.StartLine + 1 + if lineCount < minLines { + continue + } + callerSet := make(map[string]struct{}) + for _, e := range g.GetInEdges(n.ID) { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + callerSet[e.From] = struct{}{} + } + if len(callerSet) < minCallers { + continue + } + calleeSet := make(map[string]struct{}) + for _, e := range g.GetOutEdges(n.ID) { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + calleeSet[e.To] = struct{}{} + } + if len(calleeSet) < minFanOut { + continue + } + out = append(out, ExtractCandidateRow{ + NodeID: n.ID, + Name: n.Name, + FilePath: n.FilePath, + StartLine: n.StartLine, + EndLine: n.EndLine, + LineCount: lineCount, + CallerCount: len(callerSet), + FanOut: len(calleeSet), + }) + } + return out +} + +// FileSymbolNamesByPaths is the in-memory reference implementation of +// the FileSymbolNamesByPaths capability. Walks GetFileNodes for every +// input path, keeps the requested kinds, and emits one row per +// (path, name) pair. Duplicates within a file collapse to a single +// row (a method declared once per file emits once regardless of how +// many times the indexer touched it). +func (g *Graph) FileSymbolNamesByPaths(paths []string, kinds []NodeKind) []FileSymbolNameRow { + if len(paths) == 0 { + return nil + } + kset := make(map[NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + seen := make(map[string]struct{}) + dedupKey := func(p, name string) string { return p + "\x00" + name } + var out []FileSymbolNameRow + for _, p := range paths { + if p == "" { + continue + } + for _, n := range g.GetFileNodes(p) { + if n == nil || n.Name == "" { + continue + } + if len(kset) > 0 { + if _, ok := kset[n.Kind]; !ok { + continue + } + } + k := dedupKey(p, n.Name) + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + out = append(out, FileSymbolNameRow{FilePath: p, Name: n.Name}) + } + } + return out +} + +// ClassHierarchyTraverse is the in-memory reference implementation of +// ClassHierarchyTraverser. Performs the same BFS as +// query.ClassHierarchy, but stops at the kind/depth gates and returns +// the full Path + EdgeKinds for each terminal node reached so the +// disk backend's Cypher variable-length match can be a drop-in +// replacement. Direction "up" follows out-edges; "down" follows +// in-edges. +func (g *Graph) ClassHierarchyTraverse( + seedID string, + direction string, + kinds []EdgeKind, + depth int, +) []ClassHierarchyRow { + if seedID == "" || depth <= 0 || len(kinds) == 0 { + return nil + } + kset := make(map[EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + if len(kset) == 0 { + return nil + } + if g.GetNode(seedID) == nil { + return nil + } + walkUp := direction == "up" + walkDown := direction == "down" + if !walkUp && !walkDown { + return nil + } + type queued struct { + id string + path []string + edgeKinds []EdgeKind + hops int + } + visited := map[string]struct{}{seedID: {}} + queue := []queued{{id: seedID, path: nil, edgeKinds: nil, hops: 0}} + var out []ClassHierarchyRow + for len(queue) > 0 { + cur := queue[0] + queue = queue[1:] + if cur.hops >= depth { + continue + } + var edges []*Edge + if walkUp { + edges = g.GetOutEdges(cur.id) + } else { + edges = g.GetInEdges(cur.id) + } + for _, e := range edges { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + var nb string + if walkUp { + nb = e.To + } else { + nb = e.From + } + if nb == "" { + continue + } + if _, ok := visited[nb]; ok { + continue + } + visited[nb] = struct{}{} + newPath := append([]string(nil), cur.path...) + newPath = append(newPath, nb) + newKinds := append([]EdgeKind(nil), cur.edgeKinds...) + newKinds = append(newKinds, e.Kind) + out = append(out, ClassHierarchyRow{ + Path: newPath, + EdgeKinds: newKinds, + }) + queue = append(queue, queued{id: nb, path: newPath, edgeKinds: newKinds, hops: cur.hops + 1}) + } + } + return out +} + +// FileEditingContext is the in-memory reference implementation of the +// FileEditingContext capability. Performs the equivalent of +// GetFileSymbols + per-function GetCallers/GetCallChain but bounded +// to the call/method node set, so the disk backend's batched query +// returns the same projection. The kinds parameter is the set of +// kinds treated as call targets (function + method). +func (g *Graph) FileEditingContext(filePath string, kinds []NodeKind) *FileEditingContextResult { + if filePath == "" { + return nil + } + nodes := g.GetFileNodes(filePath) + if len(nodes) == 0 { + return nil + } + kset := make(map[NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + res := &FileEditingContextResult{} + var fileNodeID string + var defNodeIDs []string + for _, n := range nodes { + if n == nil { + continue + } + if n.Kind == KindFile { + res.FileNode = n + fileNodeID = n.ID + continue + } + res.Defines = append(res.Defines, n) + if _, ok := kset[n.Kind]; ok { + defNodeIDs = append(defNodeIDs, n.ID) + } + } + if fileNodeID != "" { + for _, e := range g.GetOutEdges(fileNodeID) { + if e == nil { + continue + } + if e.Kind == EdgeImports { + res.Imports = append(res.Imports, e) + } + } + } + if len(defNodeIDs) == 0 { + return res + } + inEdges := g.GetInEdgesByNodeIDs(defNodeIDs) + outEdges := g.GetOutEdgesByNodeIDs(defNodeIDs) + callerIDSet := make(map[string]struct{}) + calleeIDSet := make(map[string]struct{}) + for _, id := range defNodeIDs { + for _, e := range inEdges[id] { + if e == nil || e.Kind != EdgeCalls { + continue + } + if e.From == "" { + continue + } + callerIDSet[e.From] = struct{}{} + } + for _, e := range outEdges[id] { + if e == nil || e.Kind != EdgeCalls { + continue + } + if e.To == "" { + continue + } + calleeIDSet[e.To] = struct{}{} + } + } + callerIDs := make([]string, 0, len(callerIDSet)) + for id := range callerIDSet { + callerIDs = append(callerIDs, id) + } + calleeIDs := make([]string, 0, len(calleeIDSet)) + for id := range calleeIDSet { + calleeIDs = append(calleeIDs, id) + } + callerNodes := g.GetNodesByIDs(callerIDs) + calleeNodes := g.GetNodesByIDs(calleeIDs) + for _, id := range callerIDs { + n := callerNodes[id] + if n == nil || n.FilePath == filePath { + continue + } + res.CalledBy = append(res.CalledBy, n) + } + for _, id := range calleeIDs { + n := calleeNodes[id] + if n == nil || n.FilePath == filePath { + continue + } + res.Calls = append(res.Calls, n) + } + return res +} + +// GetFileSubGraph is the in-memory reference implementation of the +// FileSubGraphReader capability. Iterates the existing per-file +// byFile bucket and the per-node outEdges / inEdges shards — the +// same lookups Engine.GetFileSymbols' fallback path already runs, +// just collapsed behind one method so the disk backends can push the +// whole walk into a single Cypher pattern match. +func (g *Graph) GetFileSubGraph(filePath string) ([]*Node, []*Edge) { + if filePath == "" { + return nil, nil + } + nodes := g.GetFileNodes(filePath) + if len(nodes) == 0 { + return nil, nil + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil && n.ID != "" { + ids = append(ids, n.ID) + } + } + outByID := g.GetOutEdgesByNodeIDs(ids) + inByID := g.GetInEdgesByNodeIDs(ids) + type edgeKey struct { + from string + to string + kind EdgeKind + } + seen := make(map[edgeKey]struct{}, 2*len(ids)) + edges := make([]*Edge, 0, 2*len(ids)) + add := func(e *Edge) { + if e == nil { + return + } + k := edgeKey{from: e.From, to: e.To, kind: e.Kind} + if _, ok := seen[k]; ok { + return + } + seen[k] = struct{}{} + edges = append(edges, e) + } + for _, id := range ids { + for _, e := range outByID[id] { + add(e) + } + for _, e := range inByID[id] { + add(e) + } + } + return nodes, edges +} + +// GetFileSubGraphCounts is the in-memory reference implementation of +// FileSubGraphCountReader. The per-node bucket reads are already +// O(1) so it just walks GetFileSubGraph and reports len(edges); the +// row-materialisation win belongs to disk backends. +func (g *Graph) GetFileSubGraphCounts(filePath string) ([]*Node, int) { + nodes, edges := g.GetFileSubGraph(filePath) + return nodes, len(edges) +} + +// NodeDegreeByKinds is the in-memory reference implementation of the +// NodeDegreeByKinds capability. Walks NodesByKinds and reads each +// node's in/out edge buckets — the disk backend overrides with one +// kind-filtered aggregation per direction so the IN-list of node IDs +// the legacy NodeDegreeCounts path needed is avoided altogether. +func (g *Graph) NodeDegreeByKinds(kinds []NodeKind, pathPrefix string) []NodeDegreeRow { + if len(kinds) == 0 { + return nil + } + pool := g.NodesByKinds(kinds) + out := make([]NodeDegreeRow, 0, len(pool)) + for _, n := range pool { + if n == nil { + continue + } + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue + } + out = append(out, NodeDegreeRow{ + NodeID: n.ID, + InCount: len(g.GetInEdges(n.ID)), + OutCount: len(g.GetOutEdges(n.ID)), + }) + } + return out +} + diff --git a/internal/graph/node.go b/internal/graph/node.go index d2c9c00e..18c3aa38 100644 --- a/internal/graph/node.go +++ b/internal/graph/node.go @@ -40,6 +40,35 @@ const ( // node, not its enclosing function. EdgeMemberOf links to the // enclosing function. EdgeCaptures lists outer bindings closed over. KindClosure NodeKind = "closure" + // KindLocal represents an intra-function binding — a variable + // declared inside a function body via `x := …` / `var x = …` / a + // range clause / a type-switch / a for-init clause. ID convention: + // `#local:@+` (the + // leading `+` flags the value as a relative offset so the IDs + // stay stable when the enclosing function moves as a whole). + // EdgeMemberOf links each binding to its enclosing function or + // method. KindLocal is excluded from the BM25 search index by + // shouldIndexForSearch — surfacing `err` / `data` / `n` / `i` + // from every function would flood every name lookup. The data- + // flow analysis (flow_between, taint_paths, ...) traverses the + // EdgeValueFlow / EdgeArgOf / EdgeReturnsTo edges that target + // these nodes; consumers that want the locals can ask for them + // by kind explicitly. + KindLocal NodeKind = "local" + // KindBuiltin represents a language intrinsic — a function / + // type / constant that's part of the language itself, not + // declared in any indexed source file. ID convention: + // `builtin::::` for functions (`builtin::go::append`, + // `builtin::py::len`) and `builtin::::type::` for + // types (`builtin::go::type::string`). Meta.builtin_kind ∈ + // "func" | "type" | "const". KindBuiltin is excluded from the + // BM25 search index — surfacing `string` / `int` / `append` + // would flood every name lookup. They participate in normal + // graph queries: `find_usages(builtin::go::type::float64)` + // answers "every variable typed as float64 in this codebase", + // which is the load-bearing query for type-drift / dataflow + // analyses. + KindBuiltin NodeKind = "builtin" // KindConstant peels off `const`, `iota`, top-level immutable // bindings, and language-specific constant declarations from // KindVariable. Existing variable-kind nodes are re-classified on diff --git a/internal/graph/node_id_parity_test.go b/internal/graph/node_id_parity_test.go index 35cc2034..8a74a7ad 100644 --- a/internal/graph/node_id_parity_test.go +++ b/internal/graph/node_id_parity_test.go @@ -231,10 +231,18 @@ func indexFixture(t *testing.T, checkoutName string) fixtureResult { for _, n := range g.AllNodes() { // This test is about source-symbol IDs (functions, methods, // types, files) — the things overlay merging keys on. - // Contract-kind nodes (kind=contract) don't currently carry a - // RepoPrefix field; skip them here so the parity gate is - // precise about what it gates. - if n.Kind == graph.KindContract { + // Contract / Module / Builtin nodes are deliberately + // cross-repo singletons (one `dep::foo`, `module::pypi:requests`, + // `builtin::go::len` shared across every repo that uses them) + // and don't carry RepoPrefix; skip them so the parity gate + // stays precise about what it gates. KindFunction nodes + // with meta.external=true are the per-symbol stubs the + // external-call attribution materialises for stdlib/dep + // targets — same rule. + if n.Kind == graph.KindContract || n.Kind == graph.KindModule || n.Kind == graph.KindBuiltin { + continue + } + if ext, _ := n.Meta["external"].(bool); ext { continue } if n.RepoPrefix == "" { diff --git a/internal/graph/overlay.go b/internal/graph/overlay.go index 1518f33a..f53a7bdc 100644 --- a/internal/graph/overlay.go +++ b/internal/graph/overlay.go @@ -331,6 +331,45 @@ func (v *OverlaidView) GetNode(id string) *Node { return v.base.GetNode(id) } +// GetNodesByIDs returns the overlay-aware *Node for each input ID. +// Overlay-owned IDs short-circuit to the per-session layer (and may +// resolve to nil when the overlay deleted the node); the remainder +// fans out as a single batched lookup against the base store. Missing +// IDs are simply absent from the returned map. +func (v *OverlaidView) GetNodesByIDs(ids []string) map[string]*Node { + if len(ids) == 0 { + return nil + } + out := make(map[string]*Node, len(ids)) + baseIDs := ids[:0:0] // fresh backing array — never aliases caller's slice + for _, id := range ids { + if id == "" { + continue + } + if _, dup := out[id]; dup { + continue + } + if v.layer != nil && v.nodeBelongsToOverlay(id) { + if n := v.layer.nodeByID[id]; n != nil { + out[id] = n + } + // Overlay tombstone — ID is hidden, do not fall back to base. + continue + } + // Track for the single base round-trip; reserve a slot in `out` + // only after the batched lookup returns. + baseIDs = append(baseIDs, id) + } + if len(baseIDs) > 0 && v.base != nil { + for id, n := range v.base.GetNodesByIDs(baseIDs) { + if n != nil { + out[id] = n + } + } + } + return out +} + // GetNodeByQualName: overlay first, then base. Base hits are filtered // to drop entries whose file is overlaid (the overlay's view wins). func (v *OverlaidView) GetNodeByQualName(qualName string) *Node { @@ -383,6 +422,60 @@ func (v *OverlaidView) FindNodesByName(name string) []*Node { return out } +// FindNodesByNameContaining merges overlay-touched name hits with the +// base result, then re-applies the per-overlay-file masking the same +// way FindNodesByName does. Order is overlay-first, then base; the +// limit caps the merged total. Empty substr or both layers nil +// returns nil. +func (v *OverlaidView) FindNodesByNameContaining(substr string, limit int) []*Node { + if substr == "" { + return nil + } + needle := strings.ToLower(substr) + var out []*Node + // Overlay-side: walk the layer's nodesByName index — the same + // bucket FindNodesByName reads from — and accept any name whose + // lowercase form contains the needle. + if v.layer != nil { + for name, bucket := range v.layer.nodesByName { + if strings.Contains(strings.ToLower(name), needle) { + out = append(out, bucket...) + if limit > 0 && len(out) >= limit { + return out[:limit] + } + } + } + } + if v.base == nil { + return out + } + // Base-side: fetch with an inflated limit so overlay-mask drops + // don't leave a short page. Then re-apply the same overlaid-file + // + name-removed mask FindNodesByName uses. + fetch := limit + if fetch > 0 { + fetch *= 2 + } + for _, n := range v.base.FindNodesByNameContaining(substr, fetch) { + if v.layer != nil { + if v.layer.HasFile(IDFile(n.ID)) { + continue + } + if v.layer.nameRemoved[n.Name] != nil && v.layer.nameRemoved[n.Name][n.ID] { + continue + } + } + out = append(out, n) + if limit > 0 && len(out) >= limit { + return out[:limit] + } + } + if limit > 0 && len(out) > limit { + out = out[:limit] + } + return out +} + // GetFileNodes: if the path is overlaid, return overlay's nodes // (empty for tombstones). Otherwise pass through to base. func (v *OverlaidView) GetFileNodes(filePath string) []*Node { @@ -486,6 +579,113 @@ func (v *OverlaidView) GetInEdges(nodeID string) []*Edge { return out } +// GetOutEdgesByNodeIDs returns the overlay-aware outgoing-edge map for +// every input id. Overlay-owned ids short-circuit to the per-session +// layer; the remainder fans out as a single batched lookup against +// the base store. Output mirrors GetOutEdges's per-id semantics +// (target-side overlay deletions filtered out), but in one cgo +// round-trip per direction instead of N. +func (v *OverlaidView) GetOutEdgesByNodeIDs(ids []string) map[string][]*Edge { + if len(ids) == 0 { + return nil + } + out := make(map[string][]*Edge, len(ids)) + baseIDs := ids[:0:0] + seen := make(map[string]struct{}, len(ids)) + for _, id := range ids { + if id == "" { + continue + } + if _, dup := seen[id]; dup { + continue + } + seen[id] = struct{}{} + if v.layer != nil && v.nodeBelongsToOverlay(id) { + src := v.layer.outEdges[id] + cp := make([]*Edge, len(src)) + copy(cp, src) + out[id] = cp + continue + } + baseIDs = append(baseIDs, id) + } + if len(baseIDs) > 0 && v.base != nil { + base := v.base.GetOutEdgesByNodeIDs(baseIDs) + for id, edges := range base { + if v.layer == nil { + out[id] = edges + continue + } + filtered := edges[:0:0] + for _, e := range edges { + if v.layer.HasFile(IDFile(e.To)) { + if v.layer.nodeByID[e.To] == nil { + continue // target deleted in overlay + } + } + filtered = append(filtered, e) + } + out[id] = filtered + } + } + return out +} + +// GetInEdgesByNodeIDs is the inbound sibling of GetOutEdgesByNodeIDs. +// Merges base in-edges (filtered to drop edges sourced in overlaid +// files) with overlay-introduced in-edges for each input id, all in a +// single batched base round-trip. +func (v *OverlaidView) GetInEdgesByNodeIDs(ids []string) map[string][]*Edge { + if len(ids) == 0 { + return nil + } + out := make(map[string][]*Edge, len(ids)) + seen := make(map[string]struct{}, len(ids)) + uniq := ids[:0:0] + for _, id := range ids { + if id == "" { + continue + } + if _, dup := seen[id]; dup { + continue + } + seen[id] = struct{}{} + uniq = append(uniq, id) + } + if len(uniq) == 0 { + return out + } + if v.base != nil { + base := v.base.GetInEdgesByNodeIDs(uniq) + for _, id := range uniq { + edges := base[id] + if v.layer == nil { + out[id] = edges + continue + } + filtered := edges[:0:0] + for _, e := range edges { + if v.layer.HasFile(IDFile(e.From)) { + continue // source is overlaid — overlay's version wins + } + if v.layer.HasFile(IDFile(e.To)) && v.layer.nodeByID[e.To] == nil { + continue // target was deleted by overlay + } + filtered = append(filtered, e) + } + out[id] = filtered + } + } + if v.layer != nil { + for _, id := range uniq { + if extras := v.layer.inEdges[id]; len(extras) > 0 { + out[id] = append(out[id], extras...) + } + } + } + return out +} + // AllNodes returns base's nodes minus nodes in overlaid files, plus // every node the overlay introduced. Bulk-read consumers (analyzers, // search reindex, snapshot export) get an overlay-consistent view diff --git a/internal/graph/reader.go b/internal/graph/reader.go index 10936e0c..a86a57be 100644 --- a/internal/graph/reader.go +++ b/internal/graph/reader.go @@ -21,6 +21,23 @@ type Reader interface { GetNode(id string) *Node GetNodeByQualName(qualName string) *Node FindNodesByName(name string) []*Node + // FindNodesByNameContaining returns nodes whose Name (case- + // insensitive) contains substr. The filter is pushed into the + // backend so only matching rows cross cgo on disk backends; + // the search hot path's substring fallback uses this instead of + // the old AllNodes()-then-filter pattern (which materialised the + // whole node set per call and didn't scale). limit caps the + // result; 0 means "no limit". + FindNodesByNameContaining(substr string, limit int) []*Node + + // GetNodesByIDs is the batched sibling of GetNode. Disk-backed + // stores (Ladybug) collapse N individual point lookups into a + // single bulk query — critical on the search hot path where one + // query materialises 60+ candidate IDs. The in-memory backend + // forwards to per-id GetNode, so the cost matches an inline loop + // there. Missing IDs are simply absent from the map (no nil + // values); duplicates dedupe naturally. + GetNodesByIDs(ids []string) map[string]*Node // File / repo scopes. GetFileNodes(filePath string) []*Node @@ -30,6 +47,19 @@ type Reader interface { GetOutEdges(nodeID string) []*Edge GetInEdges(nodeID string) []*Edge + // GetInEdgesByNodeIDs / GetOutEdgesByNodeIDs are the batched + // siblings of GetInEdges / GetOutEdges. Disk-backed stores collapse + // N per-id Cypher queries into one bulk MATCH over `WHERE id IN + // $ids`; the in-memory backend forwards to per-id walks (no + // concurrency win — same algorithmic cost as an inline loop). On + // the rerank hot path this drops ~150 cgo round-trips per + // search_symbols call down to ~4 (prepare collects every + // candidate's ids and fans them out in one inbound + one outbound + // batch). Missing nodes get nil slices in the returned map so + // callers can `for _, e := range m[id]` without an ok-check. + GetInEdgesByNodeIDs(ids []string) map[string][]*Edge + GetOutEdgesByNodeIDs(ids []string) map[string][]*Edge + // Bulk reads — used by analyzers (hotspots, cycles, dead code, // communities, …) and by the embedded query engine's whole-graph // passes. diff --git a/internal/graph/store.go b/internal/graph/store.go new file mode 100644 index 00000000..97523770 --- /dev/null +++ b/internal/graph/store.go @@ -0,0 +1,1568 @@ +package graph + +import ( + "iter" + "sync" +) + +// EdgeReindex is the per-edge payload for ReindexEdges. Edge points +// at the (already mutated) Edge value the caller wants the store to +// re-bind; OldTo is the To target the edge had BEFORE the mutation, +// so the store can drop the stale in-edge index entry for OldTo +// while writing the new one for Edge.To. +type EdgeReindex struct { + Edge *Edge + OldTo string +} + +// EdgeProvenanceUpdate is the per-edge payload for +// SetEdgeProvenanceBatch. Edge points at the stored Edge whose +// origin should be promoted; NewOrigin is the target tier. The store +// only persists the change (and bumps EdgeIdentityRevisions) when +// NewOrigin differs from the currently stored Origin. +type EdgeProvenanceUpdate struct { + Edge *Edge + NewOrigin string +} + +// Store is the persistence-and-query backend the rest of gortex sees +// behind the *Graph type. The only implementation today is the +// in-memory *Graph; future implementations will include an on-disk +// embedded-DB backend (local single-binary) and a remote network +// client. The interface is the seam that lets the rest of the +// codebase be backend-agnostic. +// +// The method set deliberately mirrors *Graph's current public API so +// the codebase compiles unchanged the day this interface lands. A few +// notes on shape: +// +// - Slice-shaped reads (AllNodes / AllEdges / FindNodesByName / …) +// materialise their result in memory — fine for the in-memory +// store, but disk / remote backends will want iterator-shaped +// variants added alongside as those implementations come online. +// +// - Memory-estimate methods (RepoMemoryEstimate / +// AllRepoMemoryEstimates) are inherently in-memory specific; disk +// and remote backends return whatever they can compute and callers +// treat the result as advisory. +// +// - ResolveMutex() returns a backend-owned mutex that resolver +// instances (cross-repo, temporal, external) share to serialise +// their edge-mutation passes against each other and against the +// indexer's incremental rewrites. Every backend needs equivalent +// coordination; the in-memory store uses its existing +// graph-wide resolveMu, disk backends keep a dedicated mutex +// alongside their own write serialisation. The returned pointer +// is owned by the store and must not be Unlocked when not held. +type Store interface { + // --- Writes ----------------------------------------------------- + + AddNode(n *Node) + AddBatch(nodes []*Node, edges []*Edge) + AddEdge(e *Edge) + SetEdgeProvenance(e *Edge, newOrigin string) bool + ReindexEdge(e *Edge, oldTo string) + // Batched siblings of the per-edge mutators. Same semantics, but + // disk backends amortise the per-call transaction overhead by + // committing in implementation-chosen chunks (the in-memory + // backend just loops). The resolver fans out per-edge mutations + // across thousands of edges in a single ResolveAll pass, so the + // per-call form was unusable on disk backends without these. + // Callers MUST first mutate the *Edge fields they want persisted + // (To / Kind / Origin / …) before handing the entry over — these + // methods read the post-mutation Edge state and update the + // backend's indexes accordingly. + ReindexEdges(batch []EdgeReindex) + SetEdgeProvenanceBatch(batch []EdgeProvenanceUpdate) (changed int) + RemoveEdge(from, to string, kind EdgeKind) bool + EvictFile(filePath string) (nodesRemoved, edgesRemoved int) + EvictRepo(repoPrefix string) (nodesRemoved, edgesRemoved int) + + // --- Point lookups --------------------------------------------- + + GetNode(id string) *Node + GetNodeByQualName(qualName string) *Node + + // --- Name + scope queries -------------------------------------- + + FindNodesByName(name string) []*Node + FindNodesByNameInRepo(name, repoPrefix string) []*Node + // FindNodesByNameContaining returns nodes whose Name (case- + // insensitive) contains the given substring. The implementation + // pushes the filter into the backend so only matching rows cross + // the cgo boundary — the old search-substring fallback's + // AllNodes()-then-filter pattern materialised the whole node set + // per query and breaks at Linux-kernel scale (10M+ symbols). + // limit caps the result set so a very common substring can't blow + // up memory; pass 0 for "no limit" (caller's responsibility to + // handle). The order is implementation-defined — callers that + // need deterministic output sort the result. + FindNodesByNameContaining(substr string, limit int) []*Node + GetFileNodes(filePath string) []*Node + GetRepoNodes(repoPrefix string) []*Node + + // --- Edge adjacency -------------------------------------------- + + GetOutEdges(nodeID string) []*Edge + GetInEdges(nodeID string) []*Edge + + // GetInEdgesByNodeIDs / GetOutEdgesByNodeIDs batch the per-node + // edge fan-out into a single backend round-trip. The rerank + // pipeline calls these once per Rerank() to materialise every + // candidate's incoming + outgoing edges in two cgo round-trips + // instead of 6N per-candidate calls. Missing IDs are absent from + // the returned map (callers can index without an ok-check via the + // nil-slice semantics of map[k][]*Edge — range over nil is a no-op). + GetInEdgesByNodeIDs(ids []string) map[string][]*Edge + GetOutEdgesByNodeIDs(ids []string) map[string][]*Edge + + // GetRepoEdges returns every edge whose source node has the given + // RepoPrefix. Equivalent to GetRepoNodes(r) followed by + // GetOutEdges(n.ID) for every n, but executes as a single backend + // query — critical on disk backends (Ladybug, SQLite, DuckDB) + // where the per-node loop is O(repo_nodes) round-trips. The + // in-memory backend forwards to that same nested walk; the disk + // backends push the join into one server-side query. + // + // Empty repoPrefix returns nothing — use AllEdges() for the + // global view. Nodes with an empty RepoPrefix are unreachable + // through this method by design (they don't belong to any repo). + GetRepoEdges(repoPrefix string) []*Edge + + // --- Bulk reads ------------------------------------------------ + + AllNodes() []*Node + AllEdges() []*Edge + + // --- Predicate-shaped reads (push filters into the store) ------ + // + // These methods replace the pre-Store idiom of `for _, e := range + // AllEdges() { if cond { ... } }`. On the in-memory backend they + // iterate the existing internal byKind / byPrefix buckets — same + // algorithmic cost as the inline filter. On disk backends they + // fan out to dedicated indexes (idx_edge_kind / idx_node_kind / + // the to_id LIKE prefix scan, etc.) so the row count actually + // materialised is proportional to the predicate match, not the + // whole table. + // + // The resolver alone calls AllEdges/AllNodes 34× per pass and + // throws away >99% of each scan; using these predicate methods + // instead cut a 503-second disk-backed resolver pass on a 122k-node + // graph down to seconds. + // + // Iterators stop when the consumer's yield returns false. + // Implementations MUST honour early-stop so callers can break + // out of a search. + + // EdgesByKind yields every edge whose Kind matches. + EdgesByKind(kind EdgeKind) iter.Seq[*Edge] + + // NodesByKind yields every node whose Kind matches. + NodesByKind(kind NodeKind) iter.Seq[*Node] + + // EdgesWithUnresolvedTarget yields every edge whose To has the + // "unresolved::" prefix. The resolver's main loop calls this + // once per pass; on disk backends it should range-scan a + // to-keyed index over the single contiguous "unresolved::" slice + // rather than materialise the whole edges table. + EdgesWithUnresolvedTarget() iter.Seq[*Edge] + + // --- Batched point lookups ------------------------------------- + // + // The resolver fires ~3-10 GetNode / FindNodesByName calls per + // unresolved edge across its workers. With 10-30k pending edges + // that's 100k-300k individual queries. On in-memory that's + // fine (map lookups, nanoseconds). On a disk backend each point + // lookup is ~ms — at 100k+ calls the per-pass cost is hundreds + // of seconds, dominating the resolver. The batched variants + // collapse those into one (or chunked) bulk query. + + // GetNodesByIDs returns a map id→*Node for every input ID present + // in the store. IDs not in the store are simply absent from the + // returned map (no nil values). Callers may pass duplicates; the + // returned map dedupes naturally. + GetNodesByIDs(ids []string) map[string]*Node + + // FindNodesByNames returns a map name→[]*Node where each slot + // holds every node whose Name field matches. Names that match no + // node are absent. Used by the resolver to pre-warm its name-only + // fallback lookup across the whole pending-edge slice in one + // batched call instead of one query per edge. + FindNodesByNames(names []string) map[string][]*Node + + // --- Counts and stats ------------------------------------------ + + NodeCount() int + EdgeCount() int + Stats() GraphStats + RepoStats() map[string]GraphStats + RepoPrefixes() []string + + // --- Provenance verification ----------------------------------- + + EdgeIdentityRevisions() int + VerifyEdgeIdentities() error + + // --- Memory estimation (advisory; in-memory-specific) ---------- + + RepoMemoryEstimate(repoPrefix string) RepoMemoryEstimate + AllRepoMemoryEstimates() map[string]RepoMemoryEstimate + + // --- Coordination ---------------------------------------------- + + // ResolveMutex returns a backend-owned mutex resolver instances + // share to serialise edge-mutation passes. See the package doc + // above for the full contract. + ResolveMutex() *sync.Mutex +} + +// Compile-time assertion: *Graph satisfies the Store interface. If a +// *Graph method's signature ever drifts from the interface, the build +// fails fast here instead of at runtime when a different Store +// implementation gets swapped in. +var _ Store = (*Graph)(nil) + +// BackendResolver is an optional interface backends MAY implement to +// drain the bulk-tractable subset of the resolver's work entirely +// inside the backend engine (Cypher MATCH+SET on Ladybug, +// UPDATE...FROM on DuckDB) instead of round-tripping every +// resolution decision back to Go. +// +// Sequencing matters: earlier rules are higher-precision than later +// ones. The orchestrator (ResolveAllBulk) runs them in the order +// listed below so that, e.g., an intra-file call binds to its same- +// file declaration before the unique-name pass would have bound it +// to a same-named symbol elsewhere in the repo. +// +// Each method returns the number of pending edges it drained. +// Unimplemented methods return (0, nil) and the orchestrator skips +// to the next. Errors surface as non-fatal — the orchestrator logs +// and continues with subsequent rules; the Go-side Resolver then +// picks up whatever the bulk pass didn't drain. +type BackendResolver interface { + // ResolveSameFile: unresolved::Name where target is in the + // caller's same source file. Strongest precision — a same-file + // declaration is almost never ambiguous. + ResolveSameFile() (resolved int, err error) + + // ResolveSamePackage: unresolved::Name where target is in the + // caller's same directory (Go package). Repo_prefix must match + // to keep the rule within one source tree. + ResolveSamePackage() (resolved int, err error) + + // ResolveImportAware: caller's file imports F, target is a + // symbol in F. Joins against the EdgeImports adjacency. + ResolveImportAware() (resolved int, err error) + + // ResolveRelativeImports: unresolved::pyrel:: / Dart + // relative-URI stubs rewritten to the matching KindFile node + // (e.g. .py or /__init__.py for Python). + // `lang` selects the dialect; empty string runs all supported + // dialects in turn. + ResolveRelativeImports(lang string) (resolved int, err error) + + // ResolveCrossRepo: unresolved::Name where exactly one + // cross-repo Node carries that name. Lower precision than the + // same-repo rules; sets cross_repo = true on the resulting edge. + ResolveCrossRepo() (resolved int, err error) + + // ResolveUniqueNames: unresolved::Name where exactly one Node + // in the entire graph carries that name. Lowest-precision + // "fallback" — runs after the same-file / same-package / + // import-aware passes have drained anything they could resolve + // more precisely. + ResolveUniqueNames() (resolved int, err error) + + // ResolveExternalCallStubs: ensures every external::* edge + // target has a corresponding Node row (the existing + // SynthesizeExternalCalls pass on the Go side). Promotes + // origin to ast_resolved for edges that now point at a real + // stub. + ResolveExternalCallStubs() (resolved int, err error) + + // ResolveAllBulk runs the bulk-tractable methods in + // precision-descending order and returns the cumulative count + // of edges resolved across all rules. The default backend + // implementation should chain the methods above; callers use + // ResolveAllBulk as the single Resolver-side hook. + ResolveAllBulk() (totalResolved int, err error) +} + +// BulkLoader is an optional interface backends MAY implement to expose +// a high-throughput cold-load fast path that bypasses per-call query +// overhead. The cold-start indexer fires ~2000 small AddBatch calls +// during its parse phase; on backends where every AddBatch round-trips +// through a query parser (Ladybug, DuckDB) that per-call cost +// dominates wall time. BulkLoader lets the indexer bracket the parse +// loop with BeginBulkLoad / FlushBulk: AddBatch calls inside the +// bracket buffer rows in memory, and FlushBulk commits them through +// the backend's native bulk primitive (Ladybug's COPY FROM, +// DuckDB's long-lived Appender). +// +// Contract: +// +// - BeginBulkLoad may be called on a non-empty store. The cold-start +// parse phase calls it on an empty store, but later passes (notably +// the contracts pass, which appends a few hundred contract nodes / +// edges after resolve) re-enter the bracket against a populated +// backend. FlushBulk commits via the backend's native bulk +// primitive in MERGE-on-primary-key mode, so re-appending rows +// that share an ID with existing data does not duplicate them. +// +// - Between BeginBulkLoad and FlushBulk, AddBatch is the only mutator +// the caller may invoke. Reads (GetNode, AllEdges, EdgesByKind, …) +// return whatever the backend can see — typically nothing buffered. +// The resolver MUST NOT run until after FlushBulk. +// +// - FlushBulk commits everything buffered since BeginBulkLoad and +// returns the backend to normal per-call write mode. An error +// leaves the store in an implementation-defined state. +// +// - Calling BeginBulkLoad twice without an intervening FlushBulk, +// or calling FlushBulk without a prior BeginBulkLoad, is a +// programmer error; backends are free to panic. +// +// The in-memory *Graph deliberately does NOT implement BulkLoader — +// it's already optimal at the per-call path. bbolt and SQLite likewise +// skip it: their per-call overhead is already amortised by their own +// internal batching (chunked transactions, prepared statements). The +// interface is intentionally opt-in so the indexer can probe with a +// type assertion and fall through to today's per-batch path uniformly. +type BulkLoader interface { + BeginBulkLoad() + FlushBulk() error +} + +// SymbolHit is a single full-text-search result: the matched node ID +// plus its relevance score from the backend's scorer (BM25 in +// Ladybug's FTS). Higher score = more relevant. +type SymbolHit struct { + NodeID string + Score float64 +} + +// SymbolFTSItem is the payload BulkUpsertSymbolFTS takes per node: +// the node's ID and its pre-tokenised text. Reused so the indexer +// can preallocate one slice and the backend can iterate without +// per-element wrapper allocs. +type SymbolFTSItem struct { + NodeID string + Tokens string +} + +// SymbolSearcher is an optional interface backends MAY implement to +// expose engine-native full-text search over the graph's symbol +// names. When the backing store implements it, the daemon's +// search_symbols path routes through the backend FTS instead of +// building a parallel in-process Bleve/BM25 index — saving ~100MB +// of heap on a vscode-scale repo and putting the search latency in +// the same address space as the rest of the graph. +// +// Contract: +// +// - UpsertSymbolFTS is the per-call write path used by incremental +// reindex. The store decides how to persist the pre-tokenised +// text (a sidecar table, an FTS column, an in-engine index — +// backend choice). Tokens are produced by +// internal/search.Tokenize so camelCase / snake_case / path- +// separator semantics match the existing BM25 corpus contract. +// +// - BulkUpsertSymbolFTS is the cold-start fast path used by the +// indexer's shadow-swap drain. Implementations SHOULD use the +// backend's native bulk primitive (TSV + COPY FROM on Ladybug) +// so a 600k-node repo doesn't pay per-row Cypher parse cost. +// Idempotent on NodeID like UpsertSymbolFTS — re-running with +// an overlapping set replaces in place. +// +// repoPrefix is the per-repo namespace; the store wipes only +// rows owned by that prefix before COPYing the new items, so +// multiple repos sharing one store don't clobber each other's +// FTS corpus. Empty prefix means "single-repo mode" — the +// store wipes everything (the legacy behaviour). +// +// - BuildSymbolIndex finalises the index after the bulk parse +// phase. For backends whose FTS index updates automatically on +// row writes (Ladybug), this is a one-shot cold-start call; +// for backends that need an explicit build pass, it's where +// the work happens. Idempotent — safe to call multiple times. +// +// - SearchSymbols runs a query and returns hits ordered by score +// descending. The query string is the user's raw input; the +// backend is expected to tokenise it the same way it tokenised +// the indexed text (typically by passing it through +// internal/search.Tokenize before invoking the FTS). +// +// - Close is implied by graph.Store.Close — no separate +// teardown method here. +type SymbolSearcher interface { + UpsertSymbolFTS(nodeID, tokens string) error + BulkUpsertSymbolFTS(repoPrefix string, items []SymbolFTSItem) error + BuildSymbolIndex() error + SearchSymbols(query string, limit int) ([]SymbolHit, error) +} + +// SymbolBundle is the rerank-shaped result of one search call: the +// matched node, its BM25 score, AND the in/out edges the rerank +// pipeline reads from. Backends that can compose this in a single +// engine round-trip implement SymbolBundleSearcher; callers can fall +// through to SymbolSearcher + GetNodesByIDs + GetIn/OutEdgesByNodeIDs +// when the backend doesn't. +// +// The same node may appear in successive bundles when a multi-call +// retrieval path (primary + expansion) returns it more than once; the +// caller's dedup-by-ID step keeps the per-call shape simple and the +// engine can merge across calls into a single rerank candidate set +// without paying for the duplicate edge fetch — the second occurrence +// already carries the same edges. +type SymbolBundle struct { + Node *Node + Score float64 + InEdges []*Edge + OutEdges []*Edge +} + +// SymbolBundleSearcher is an optional capability backends MAY +// implement to fold the symbol-search hot path's three +// per-BM25-call cgo round-trips (FTS + GetNodesByIDs + the rerank +// prepare's batched in/out edge fetch) into one bundled +// engine-side call: +// +// - FTS yields (id, score) +// - One batched node materialise + one in-edge fan-in + one +// out-edge fan-out, all keyed on the same id list, return the +// bundle. +// +// Backends that do NOT implement this interface still serve the +// search path through SymbolSearcher; callers fall back to +// SymbolSearcher.SearchSymbols + GetNodesByIDs + +// GetIn/OutEdgesByNodeIDs and pay the per-call cgo cost the +// bundled form avoids. The contract is intentionally read-only — +// writes still go through UpsertSymbolFTS / BulkUpsertSymbolFTS on +// the SymbolSearcher. +// +// Today the Ladybug backend implements this via four cypher calls +// (FTS → IDs, then a node batch + an outgoing-edge batch + an +// inbound-edge batch on those IDs). A single combined Cypher with +// OPTIONAL MATCH + collect() is slower in practice — the +// cross-product Kuzu builds across the two OPTIONAL MATCH + +// collect frames outweighs the cgo saving (probe: 150ms median vs +// the 4-query split's 68ms median on the same id set). +type SymbolBundleSearcher interface { + SearchSymbolBundles(query string, limit int) ([]SymbolBundle, error) +} + +// VectorItem is the payload BulkUpsertEmbeddings takes per node: +// the node's ID and its embedding vector. Length of Vec must +// match the dim the corresponding BuildVectorIndex call declared +// — backends with fixed-width vector columns (Ladybug's +// FLOAT[N]) reject inserts that don't match. +type VectorItem struct { + NodeID string + Vec []float32 +} +// VectorHit is a single ANN search result: the matched node ID +// plus its distance to the query vector under the backend's +// metric (cosine by default in Ladybug). LOWER distance = more +// similar. Callers that need a similarity score in [0,1] should +// translate via `1 - distance` for cosine. +type VectorHit struct { + NodeID string + Distance float64 +} + +// VectorSearcher is an optional interface backends MAY implement to +// expose engine-native HNSW vector indexing over per-symbol +// embedding vectors. When the backing store implements it, the +// daemon's semantic-search path routes through the backend's +// native ANN index instead of holding a parallel in-process +// HNSW — saving roughly `dim × 4 × N` bytes of heap (≈ 1 GB for +// 384-dim × 663k symbols on a Vscode-scale repo). +// +// The bigger win — and the reason Option B exists alongside +// Option C in the storage-engine roadmap — is that vector +// neighbours and graph traversal can be combined in a single +// Cypher round-trip: +// +// CALL QUERY_VECTOR_INDEX('SymbolVec', 'idx_emb', $vec, 50) +// YIELD node AS seed +// MATCH (seed)<-[:calls]-(caller:KindFunction) +// WHERE caller.RepoPrefix = $repo AND NOT caller.id CONTAINS '_test' +// RETURN seed.name, caller.name +// +// Today this query is three round-trips on the in-process HNSW +// path (ANN → IDs → graph fetch → Go-side filter); with +// VectorSearcher it's one engine-vectorised pipeline. +// +// Contract: +// +// - UpsertEmbedding is the per-call write path used by +// incremental reindex when one file's embeddings change. +// +// - BulkUpsertEmbeddings is the cold-start fast path used by +// the indexer's embedding pass. Implementations SHOULD use +// the backend's native bulk primitive (TSV + COPY FROM on +// Ladybug) so a 600k-node corpus doesn't pay per-row Cypher +// parse cost. Idempotent on NodeID — re-running with an +// overlapping set replaces in place. +// +// - BuildVectorIndex finalises the HNSW index after the bulk +// populate. The dim parameter declares the embedding +// width; backends with fixed-width columns lazily create +// the storage schema on the first BuildVectorIndex call. +// Idempotent — safe to call multiple times with the same dim. +// +// - SimilarTo runs an ANN query: given a vector, return the k +// closest stored vectors ordered by ascending distance. +// +// - Close is implied by graph.Store.Close — no separate +// teardown method here. +type VectorSearcher interface { + UpsertEmbedding(nodeID string, vec []float32) error + BulkUpsertEmbeddings(items []VectorItem) error + BuildVectorIndex(dims int) error + SimilarTo(vec []float32, limit int) ([]VectorHit, error) +} + +// PageRankOpts tunes the PageRank computation. Zero values request +// the backend default — only set fields you genuinely want to +// override so backends can pick their own parallel-tuned defaults +// without the caller second-guessing the constants. +// +// NodeKinds / EdgeKinds restrict the projected subgraph the +// algorithm runs over. Empty means "all kinds" — the algo sees the +// full graph. A non-empty filter is rewritten into the projected- +// graph predicate (Ladybug supports per-table predicates of the +// form 'n.kind = "function"'). +type PageRankOpts struct { + NodeKinds []NodeKind + EdgeKinds []EdgeKind + DampingFactor float64 + MaxIterations int + Tolerance float64 + Limit int // 0 = return every ranked node +} + +// PageRankHit is one row of the PageRank output: the node ID plus +// its rank score. Hits come back sorted by rank descending. +type PageRankHit struct { + NodeID string + Rank float64 +} + +// PageRanker is an optional interface backends MAY implement to +// expose engine-native PageRank centrality. When the store +// implements it, the daemon's hotspot / authority-ranking path +// routes through the backend's parallel implementation (Ligra- +// based on Ladybug) instead of computing degree-centrality +// in-process. +// +// Engine-native PageRank is qualitatively different from the +// degree-based hotspot analyzer: random-walk authority weights +// rare-but-influential nodes the degree count would miss +// (a low-fan-in API that's called from every domain layer ranks +// higher than a high-fan-in test helper). +// +// Contract: +// +// - PageRank runs the algorithm against a projected subgraph and +// returns hits sorted by rank descending. The projection is +// declared and torn down per call — callers don't manage +// PROJECT_GRAPH lifecycle directly. +// +// - The score is normalized so the full corpus sums to 1 +// (Ladybug's default). Relative ordering — not the absolute +// value — is what callers should consume. +// +// - Close is implied by graph.Store.Close. +type PageRanker interface { + PageRank(opts PageRankOpts) ([]PageRankHit, error) +} + +// CommunityOpts tunes Louvain community detection over a projected +// subgraph. Zero values request the backend default +// (maxPhases=20, maxIterations=20 on Ladybug). NodeKinds / EdgeKinds +// restrict the projection; an empty filter runs over the full graph. +type CommunityOpts struct { + NodeKinds []NodeKind + EdgeKinds []EdgeKind + MaxPhases int + MaxIterations int +} + +// CommunityHit is one row of the Louvain output: the node ID plus +// the integer community label the algorithm assigned. Two nodes +// with the same CommunityID are in the same community; the actual +// integer is opaque (Ladybug uses internal node offsets and +// promises no stability across runs). +type CommunityHit struct { + NodeID string + CommunityID int64 +} + +// CommunityDetector is an optional interface backends MAY +// implement to expose engine-native Louvain community detection +// (Ladybug uses a parallel Grappolo implementation). When the +// store implements it, the daemon's analysis.DetectCommunitiesLouvain +// path can delegate the partitioning step and keep the existing +// post-processing (label disambiguation, hub detection, cohesion, +// parent assignment). +// +// Contract: +// +// - Louvain runs the algorithm against a projected subgraph and +// returns one hit per node assigning it to a community. The +// projection is declared and torn down per call. +// +// - Ladybug's implementation treats edges as undirected (the +// modularity score is computed on the undirected graph even +// though the projected Edge table is directed). Callers that +// care about directed modularity should consult the in-process +// fallback. +// +// - Close is implied by graph.Store.Close. +type CommunityDetector interface { + Louvain(opts CommunityOpts) ([]CommunityHit, error) +} + +// ComponentOpts tunes connected-component computation over a +// projected subgraph. Zero values request the backend default +// (maxIterations=100 on Ladybug). NodeKinds / EdgeKinds restrict +// the projection. +type ComponentOpts struct { + NodeKinds []NodeKind + EdgeKinds []EdgeKind + MaxIterations int +} + +// ComponentHit is one row of a connected-component output: the +// node ID plus the integer component label the algorithm assigned. +// Two nodes with the same ComponentID are in the same component. +// The integer is opaque (Ladybug uses internal node offsets). +type ComponentHit struct { + NodeID string + ComponentID int64 +} + +// ComponentFinder is an optional interface backends MAY implement +// to expose engine-native weakly- and strongly-connected-component +// algorithms. Two methods because the algorithms answer different +// questions: +// +// - WeaklyConnectedComponents treats edges as undirected — every +// pair of nodes reachable from each other (ignoring direction) +// lands in one component. Useful for "is this symbol part of +// the connected core?" diagnostics. +// +// - StronglyConnectedComponents respects edge direction — only +// nodes mutually reachable end up in the same component. The +// SCC of a call graph is the cycle structure: every non- +// trivial SCC (size > 1) is a mutual-recursion ring. +// +// When the store implements ComponentFinder, the daemon's +// connectivity diagnostics and circular-dependency detection +// (`analyze kind=wcc` / `analyze kind=scc`) route through it; +// otherwise the in-process analysis.ComputeWCC / analysis.ComputeSCC +// fallbacks run. +type ComponentFinder interface { + WeaklyConnectedComponents(opts ComponentOpts) ([]ComponentHit, error) + StronglyConnectedComponents(opts ComponentOpts) ([]ComponentHit, error) +} + +// KCoreOpts tunes k-core decomposition. NodeKinds / EdgeKinds +// restrict the projection. The algorithm itself takes no +// per-call parameters — it always computes the full +// decomposition (every node gets its k-degree). +type KCoreOpts struct { + NodeKinds []NodeKind + EdgeKinds []EdgeKind +} + +// KCoreHit is one row of the k-core output: the node ID plus the +// largest k for which the node remains in the k-core after +// iteratively pruning nodes with degree < k. A node's KDegree is +// its position in the core hierarchy — high values mean the node +// sits inside a densely connected centre. +type KCoreHit struct { + NodeID string + KDegree int64 +} + +// KCorer is an optional interface backends MAY implement to +// expose engine-native k-core decomposition. When the store +// implements it, the daemon's `analyze kind=kcore` path delegates +// to the engine-native implementation; otherwise +// analysis.ComputeKCore runs in-process. +// +// k-core finds the densest subgraph: the k-core of a graph is +// the largest subgraph where every node has at least k +// neighbours. The k-degree of a node is the largest k for which +// it stays in the k-core — useful for "find the hub-of-hubs", or +// "what's the core infrastructure code that everything depends +// on". +type KCorer interface { + KCoreDecomposition(opts KCoreOpts) ([]KCoreHit, error) +} + +// DeadCodeCandidator is an optional capability backends MAY implement +// to compute the dead-code candidate set server-side. The default Go +// path in analysis.FindDeadCode pulls every node + a batched in-edge +// map and filters in Go; on disk backends (Ladybug) that's +// ~1.3M edge rows over cgo per call. A backend that implements +// DeadCodeCandidator runs the equivalent WHERE-NOT-EXISTS filter +// inside the query engine and returns ~hundreds of true candidates, +// skipping the materialise-then-filter loop entirely. +// +// The opts mirror analysis.FindDeadCodeOptions to keep the surface +// in sync — only the fields the backend can act on (kinds + the +// per-kind in-edge allowlist) are honoured. File-path / build-tag +// / well-known-name exclusions stay in Go because they need +// string parsing the backend can't do efficiently. +type DeadCodeCandidator interface { + // DeadCodeCandidates returns nodes matching the allowed node + // kinds that have NO incoming edges of the corresponding + // allowed in-edge kinds. The map keys the in-edge allowlist by + // node kind — backends evaluate the right allowlist per row. + // Empty allowedInEdgeKinds for a kind means "any incoming edge + // counts as usage". + DeadCodeCandidates(allowedNodeKinds []NodeKind, allowedInEdgeKinds map[NodeKind][]EdgeKind) []*Node +} + +// IfaceImplementsRow is the per-row payload returned by +// IfaceImplementsScanner — one tuple per EdgeImplements edge whose +// target is a KindInterface node carrying Meta["methods"]. TypeID +// is the implementing type (the edge's source); IfaceID is the +// interface (the edge's target); IfaceMeta is the interface +// node's decoded Meta map, from which the caller pulls the +// "methods" field. Rows where the interface had no Meta are +// elided server-side. +type IfaceImplementsRow struct { + TypeID string + IfaceID string + IfaceMeta map[string]any +} + +// IfaceImplementsScanner returns the set of (typeID, interfaceID, +// interfaceMeta) tuples for every EdgeImplements edge where the +// target is a KindInterface node carrying Meta["methods"]. Used by +// analysis.FindDeadCode to compute "type implements interface, so +// these methods are alive even if never called directly". The +// server-side join is one Cypher; the Go-side equivalent fetched +// every interface node then every implements edge separately. +// +// Optional capability — analysis.FindDeadCode falls back to the +// Go-side scan when the backend doesn't implement it. +type IfaceImplementsScanner interface { + IfaceImplementsRows() []IfaceImplementsRow +} + +// NodeDegreeRow is one tuple returned by NodeDegreeAggregator. InCount +// counts EVERY incoming edge (any kind); OutCount counts EVERY outgoing +// edge; UsageInCount counts only the subset whose kind is in the +// "usage" set (Calls, References, Instantiates, Implements, Extends, +// Reads, Writes, Tests). The split exists because connectivity_health +// needs the totals (for isolated / leaf classification) AND the +// usage-edge presence (to fold ClassifyZeroEdge's logic in +// server-side); pulling them in one row saves a second cgo trip per +// node. +type NodeDegreeRow struct { + NodeID string + InCount int + OutCount int + UsageInCount int +} + +// NodeDegreeAggregator is an optional capability backends MAY +// implement to return per-node in/out edge counts plus a usage-edge +// count, server-side. Used by analysis.GraphConnectivity to replace +// the per-node g.GetInEdges(id) + g.GetOutEdges(id) + +// graph.ClassifyZeroEdge(id) trio — three cgo round-trips per node +// on Ladybug, three full edge materialisations per node on disk. +// One round-trip returns all three counts and lets the analyzer +// classify isolated / leaf / source-only / sink-only / extraction-gap +// without ever materialising the underlying edge structs. +// +// The usageKinds slice MUST mirror graph.usageEdgeKinds (the set +// ClassifyZeroEdge consults). Empty usageKinds means UsageInCount is +// always 0; an empty input ids slice returns nil. +// +// Optional capability — GraphConnectivity falls back to the per-node +// GetInEdges/GetOutEdges path when the backend doesn't implement it. +type NodeDegreeAggregator interface { + NodeDegreeCounts(ids []string, usageKinds []EdgeKind) []NodeDegreeRow +} + +// NodeFanRow is one tuple returned by NodeFanAggregator. FanIn counts +// incoming edges whose kind is in the fanInKinds set; FanOut counts +// outgoing edges whose kind is in the fanOutKinds set. The two kind +// sets are passed by the caller so the same capability serves both +// FindHotspots (fanIn = Calls+References, fanOut = Calls) and any +// future analyzer with a different kind split. +type NodeFanRow struct { + NodeID string + FanIn int + FanOut int +} + +// NodeFanAggregator is an optional capability backends MAY implement +// to compute per-node fan-in / fan-out counts filtered by edge kind, +// server-side. Used by analysis.FindHotspots and +// handleAnalyzeHealthScore to replace the AllEdges() materialisation +// they both ran every call (~500k edges over cgo on the gortex +// workspace, the bulk of the wall-clock cost on Ladybug). The Go-side +// crossing computation still needs per-edge (from, to) for the +// Calls/References kinds — that runs through EdgesByKind, which +// streams without materialising the full edge set. +// +// Empty ids => nil; empty fanInKinds / fanOutKinds means that side +// is always 0. Output order is unspecified. +// +// Optional capability — both analyzers fall back to the AllEdges scan +// when the backend doesn't implement it. +type NodeFanAggregator interface { + NodeFanCounts(ids []string, fanInKinds []EdgeKind, fanOutKinds []EdgeKind) []NodeFanRow +} + +// FileImporterRow is the per-row payload returned by FileImporters. +// FromFile is the importing file's path (the result the caller cares +// about); FromID / FromName / FromKind describe the node that owns +// the EdgeImports edge, in case the caller needs more than just the +// file list. +type FileImporterRow struct { + FromFile string + FromID string + FromName string + FromKind NodeKind +} + +// FileImporters is an optional capability backends MAY implement to +// answer "which files import filePath?" with a single backend round- +// trip instead of a Go-side AllEdges() scan. The MCP check_references +// tool's importing-files block hammered AllEdges() per call: ~286k +// edges materialised over cgo on the gortex workspace, then a per- +// edge GetNode(e.To) + GetNode(e.From) — multiple thousand cgo round- +// trips for a single check_references call. A backend that implements +// FileImporters runs the equivalent join inside the query engine and +// only surfaces the rows that match. +// +// Match semantics mirror the original handler: an EdgeImports edge +// counts when its To node's FilePath equals filePath OR when the To +// node's ID equals filePath (the file's own node id, used by the +// indexer for file-level import bindings). The same-file dedup the +// caller applies stays in Go — backends just stream the candidate +// rows. +// +// Optional capability — handleCheckReferences falls back to the +// AllEdges-driven loop when the backend doesn't implement it. +type FileImporters interface { + FileImporters(filePath string) []FileImporterRow +} + +// InEdgeCounter is an optional capability backends MAY implement to +// compute incoming-edge fan-in counts per target node for a fixed +// set of edge kinds in one backend round-trip. The fallback iterates +// AllEdges() Go-side; on Ladybug that materialises every edge over +// cgo (~286k rows on the gortex workspace) just to bucket by To. +// The capability instead runs `MATCH ()-[e:Edge]->(n) WHERE e.kind +// IN $kinds RETURN n.id, count(*)` and ships back only the per-target +// counts — a fraction of the rows and zero per-row Go object alloc. +// +// Used by handleGetUntestedSymbols to compute the calls+references +// fan-in ranking. The map keys are node IDs; values are the integer +// count of matching incoming edges. Targets with zero matching in- +// edges are absent from the map (callers index with `m[id]` and rely +// on the zero-value default). +// +// Optional capability — the handler falls back to AllEdges-driven +// bucketing when the backend doesn't implement it. +type InEdgeCounter interface { + InEdgeCountsByKind(kinds []EdgeKind) map[string]int +} + +// NodesInFilesByKindFinder is an optional capability backends MAY +// implement to answer "which nodes of kinds K live in files F?" +// with a single backend round-trip. The fallback iterates AllNodes() +// Go-side; on Ladybug that materialises the full node table over +// cgo per call. The capability instead runs `MATCH (n:Node) WHERE +// n.file_path IN $files AND n.kind IN $kinds RETURN ...` and ships +// only the matching rows. +// +// Used by handleFindDeclaration to build the per-file enclosing- +// symbol index off the small set of trigram-match file paths. The +// Go fallback's AllNodes pull was ~70k rows on the gortex workspace +// to land at ~hundreds of relevant rows. +// +// Empty files / empty kinds returns nil — never a whole-graph scan. +// +// Optional capability — the handler falls back to AllNodes when the +// backend doesn't implement it. +type NodesInFilesByKindFinder interface { + NodesInFilesByKind(files []string, kinds []NodeKind) []*Node +} + +// FileMtimeWriter is an optional capability backends MAY implement to +// persist the per-file modification time the indexer uses for its +// incremental-reindex decisions. Lifting this state off the daemon's +// gob+gzip snapshot makes warm restarts read it through the same +// backend the graph already lives in (no second persistence surface +// to keep coherent). +// +// repoPrefix is the indexer's own prefix tag; mtimes is keyed on the +// repo-relative file path (the same key the in-memory Indexer's +// fileMtimes map uses). Empty input is a no-op; empty repoPrefix is +// allowed for single-repo daemons. +type FileMtimeWriter interface { + BulkSetFileMtimes(repoPrefix string, mtimes map[string]int64) error +} + +// FileMtimeReader is the read side of FileMtimeWriter. Returns the +// recorded mtimes for one repo prefix as a fresh map (nil for "no +// data"). Used by warmup to seed ReconcileRepoCtx with the per-file +// mtimes it would otherwise have read from the gob snapshot. +type FileMtimeReader interface { + LoadFileMtimes(repoPrefix string) map[string]int64 +} + +// EdgesByKindsScanner is an optional capability backends MAY +// implement to stream every edge whose Kind is in the supplied set, +// in a single backend round-trip. The fallback iterates AllEdges() +// Go-side and filters in process — on Ladybug AllEdges materialises +// every edge over cgo (~286k rows on the gortex workspace) for the +// edge-driven analyzers (channel_ops, pubsub, k8s_resources, +// kustomize, error_surface, …) that only care about a handful of +// kinds. The capability runs `MATCH ()-[e:Edge]->() WHERE e.kind IN +// $kinds RETURN ...` and ships back only the matching rows. +// +// The single-kind variant EdgesByKind already exists, but the +// analyzers in question typically need 2-5 kinds in one pass; firing +// EdgesByKind once per kind would issue N independent backend queries +// when the planner can naturally batch them with an IN-list. Calling +// EdgesByKinds with one kind is equivalent to EdgesByKind for that +// kind — backends should still prefer the IN-list path so the call +// site never branches on len(kinds). +// +// Empty kinds yields nothing — never a whole-table scan. Iterators +// stop when the consumer's yield returns false; implementations MUST +// honour early-stop so callers can break out of a search. +// +// Optional capability — analyzers fall back to per-kind EdgesByKind +// iteration when the backend doesn't implement it. +type EdgesByKindsScanner interface { + EdgesByKinds(kinds []EdgeKind) iter.Seq[*Edge] +} + +// NodesByKindsScanner is an optional capability backends MAY implement +// to fetch every node whose Kind is in the supplied set in a single +// backend round-trip. Replaces the AllNodes() + Go-side `if n.Kind != +// allowed` filter used by the metadata-oriented analyze handlers +// (todos, stale_code, stale_flags, ownership, coverage_gaps, +// coverage_summary, cgo_users, wasm_users, orphan_tables, +// unreferenced_tables). Each of those scans the entire node table just +// to keep one or two kinds — on Ladybug that's ~70k rows over cgo on +// the gortex workspace per call. The capability runs +// `MATCH (n:Node) WHERE n.kind IN $kinds RETURN ...` and ships only the +// matching rows. +// +// Why a separate kinds-IN scanner instead of looping the existing +// NodesByKind iterator per kind: on Ladybug NodesByKind is one query +// per call. Looping it for {function, method} doubles the round-trip +// count and rebuilds the row decoder for each pass. One IN-list query +// returns the union directly. The dedup is intentional — duplicated +// kinds in the input never reach the IN-list, matching the in-memory +// reference's behaviour. +// +// Optional capability — handlers fall back to AllNodes-driven scanning +// when the backend doesn't implement it. Empty kinds returns nil +// without touching the backend. +type NodesByKindsScanner interface { + NodesByKinds(kinds []NodeKind) []*Node +} + +// EdgeAdjacencyForKinds is an optional capability backends MAY +// implement to stream (from, to) id pairs for every edge whose Kind +// is in the supplied edge-kind set AND whose endpoints both belong +// to the supplied node-kind set. The shape covers the betweenness / +// centrality adjacency build that today calls EdgesByKinds and +// filters Go-side: on Ladybug the per-edge row carries ~10 string +// columns over cgo, multiplied by ~286k edges on the gortex +// workspace, just for a build that uses only From/To. The +// capability returns a 2-column projection from a single Cypher +// join — every endpoint kind is enforced by the planner, so neither +// the cross-kind edges nor the irrelevant columns ever cross cgo. +// +// Empty edgeKinds or empty nodeKinds yields nothing — never a +// whole-table scan. Iterators stop when the consumer's yield +// returns false; implementations MUST honour early-stop. +// +// Optional capability — analyzers fall back to EdgesByKinds when +// the backend doesn't implement it. +type EdgeAdjacencyForKinds interface { + EdgeAdjacencyForKinds(edgeKinds []EdgeKind, nodeKinds []NodeKind) iter.Seq[[2]string] +} + +// CommunityCrossingsByKind is an optional capability backends MAY +// implement to return per-source crossing counts for edges whose +// Kind is in the supplied set, given a node→community membership +// map. A "crossing" is an edge whose source community differs from +// its target community; the count is keyed by source id. +// +// Replaces the FindHotspots.countCrossings loop that today iterates +// EdgesByKind twice and tallies per-source Go-side: on the gortex +// workspace the two EdgesByKind passes materialised the full call / +// reference bucket over cgo (~286k rows × ~10 columns) just to +// derive a thousand-row aggregate. The capability ships only the +// (from, to) projection — the community comparison runs Go-side +// because the community map isn't a Node column today. +// +// Empty kinds or an empty community map returns nil. The map keys +// in the result MUST be source ids whose count is non-zero — +// implementations MUST drop zero-count rows so callers can probe +// existence without a >0 check. +// +// Optional capability — analyzers fall back to EdgesByKind iteration +// when the backend doesn't implement it. +type CommunityCrossingsByKind interface { + CommunityCrossingsByKind(kinds []EdgeKind, nodeToComm map[string]string) map[string]int +} + +// NodeIDsByKinds is an optional capability backends MAY implement +// to return just the IDs of nodes whose Kind is in the supplied +// set. Replaces NodesByKinds in ranking paths (betweenness, +// hotspots) that only need to iterate ids — the full *Node carries +// ~10 string columns over cgo per row, and the candidate set is +// thousands of function/method rows, so the projection drops the +// per-call cgo allocation count by an order of magnitude. +// +// Empty kinds returns nil without touching the backend. Duplicated +// input kinds must NOT duplicate the output — backends MUST dedup +// the kind set in the IN-list. +// +// Optional capability — callers fall back to NodesByKinds when the +// backend doesn't implement it. +type NodeIDsByKinds interface { + NodeIDsByKinds(kinds []NodeKind) []string +} + +// EdgeKindCounter is an optional capability backends MAY implement +// to return one row per distinct edge kind with its occurrence +// count, server-side. Used by handleGetSurprisingConnections to +// derive the "rare kinds" set (kinds whose share of all edges is at +// or below the rare_kind_pct threshold) without materialising every +// edge over cgo just to bucket by Kind. On the gortex workspace the +// AllEdges() bucket pass was ~286k edges over cgo per call; the +// aggregator returns ~30 rows. +// +// The map's key is the EdgeKind; the value is the integer occurrence +// count. Empty graph returns nil (or an empty map — callers MUST +// treat both as "no rare kinds detected"). +// +// Optional capability — handleGetSurprisingConnections falls back +// to the AllEdges-driven kind bucketing when the backend doesn't +// implement it. +type EdgeKindCounter interface { + EdgeKindCounts() map[EdgeKind]int +} + +// CrossRepoEdgeRow is one tuple returned by CrossRepoEdgeAggregator. +// Kind is the cross_repo_* edge kind verbatim. FromRepo / ToRepo +// are the source / target node's RepoPrefix; Count is the number of +// underlying edges that share the triple. +type CrossRepoEdgeRow struct { + Kind EdgeKind + FromRepo string + ToRepo string + Count int +} + +// CrossRepoEdgeAggregator is an optional capability backends MAY +// implement to return pre-grouped cross-repo edge counts. Used by +// the get_architecture handler's cross_repo rollup, which previously +// scanned AllEdges() + per-edge GetNode(from)+GetNode(to) just to +// emit one row per (kind, from_repo, to_repo). On the gortex +// workspace that meant ~286k edge rows + ~thousands of GetNode +// round-trips over cgo for typically <100 cross-repo rows. The +// aggregator runs one Cypher GROUP BY and ships only the surviving +// per-triple counts. +// +// Cross-repo edges are identified by graph.BaseKindForCrossRepo — +// the disk implementation MUST use the same kind list (so single- +// repo graphs return an empty slice, not a whole-graph scan). +// +// Optional capability — handleGetArchitecture falls back to the +// AllEdges-driven loop when the backend doesn't implement it. +type CrossRepoEdgeAggregator interface { + CrossRepoEdgeCounts() []CrossRepoEdgeRow +} + +// FileImportCountRow is one tuple returned by FileImportAggregator. +// FilePath is the imported file path (the target node's FilePath, or +// the target node's ID when the indexer pointed the import edge at +// the file node directly). Count is the number of distinct EdgeImports +// edges whose To resolves to that path. +type FileImportCountRow struct { + FilePath string + Count int +} + +// FileImportAggregator is an optional capability backends MAY +// implement to return per-target-file incoming-imports counts in +// one backend round-trip. Used by mostImportedFiles (shared between +// get_repo_outline and suggest_queries) which previously scanned +// AllEdges() + per-edge GetNode(to) just to bucket counts by path. +// On the gortex workspace that loop materialised ~286k edges + per- +// edge GetNode round-trips over cgo to produce a top-10 list. The +// aggregator GROUPs server-side and ships the per-file counts only. +// +// scope, when non-nil, bounds the counted edges to those whose target +// node ID lies in the slice (session-workspace clamp). An empty (but +// non-nil) scope returns nil — never a whole-graph scan. A nil scope +// means "no clamp" and counts every imports edge. +// +// Optional capability — mostImportedFiles falls back to the +// AllEdges-driven loop when the backend doesn't implement it. +type FileImportAggregator interface { + FileImportCounts(scope []string) []FileImportCountRow +} + +// InDegreeForNodes is an optional capability backends MAY implement to +// return the per-target incoming-edge count for the given node id set +// in one backend round-trip. Unlike InEdgeCounter (which filters by +// edge kind across the WHOLE graph), this counter is scoped to a +// caller-supplied id set and counts EVERY incoming edge regardless of +// kind. handleGetSurprisingConnections needs both the hub heuristic +// and the per-edge anomaly walk, but the hub check only cares about +// nodes already inside the session-scoped working set; counting every +// edge across the table just to bucket by `To` materialises the entire +// edge column (~286k rows over cgo on Ladybug). +// +// Empty ids returns nil — never a whole-table scan. Targets with zero +// matching in-edges may be absent from the returned map (callers index +// with `m[id]` and treat zero as the default). +// +// Optional capability — handleGetSurprisingConnections falls back to +// the AllEdges-driven bucketing when the backend doesn't implement it. +type InDegreeForNodes interface { + InDegreeForNodes(ids []string) map[string]int +} + +// ReachableForwardByKinds is an optional capability backends MAY +// implement to compute the set of node IDs reachable from the seed +// frontier via outgoing edges whose Kind is in the supplied set, in +// one backend round-trip. The Go fallback runs a layer-by-layer BFS +// firing GetOutEdges per node — on Ladybug that's N+1 cgo round-trips +// where N is the transitive frontier size; on a 100k-symbol repo with +// a few thousand test functions the BFS easily issues tens of +// thousands of edge fetches. +// +// reachableFromTests in handleGetUntestedSymbols is the primary +// caller: seeds are every function/method in a test file, kinds are +// {calls, references}, and the result is the closed set of symbols +// covered transitively by the test surface. The capability runs one +// variable-length match expression and ships the closure back as a +// single id list. +// +// Empty seeds returns nil; an empty kinds set returns the seed set +// unchanged (no edges to traverse). The returned map keys are the +// reachable node IDs (including the seeds); the bool value is always +// true — the shape mirrors the in-memory implementation's covered set +// so the caller's index expression stays identical. +// +// Optional capability — reachableFromTests falls back to the +// per-layer GetOutEdges BFS when the backend doesn't implement it. +type ReachableForwardByKinds interface { + ReachableForwardByKinds(seeds []string, kinds []EdgeKind) map[string]bool +} + +// ThrowerErrorRow is one tuple returned by ThrowerErrorSurfacer. ThrowerID +// is the symbol that originates the EdgeThrows edges; ErrorTargets is the +// distinct set of error-type node IDs the thrower reaches via EdgeThrows; +// ErrorMsgs is the distinct set of literal error-message strings the +// thrower emits (KindString nodes with meta.context = "error_msg", linked +// by EdgeEmits). Throws is the count of underlying EdgeThrows edges (one +// thrower may raise the same target multiple times from different sites). +// FilePath / Line are the row metadata the legacy handler propagated from +// the first edge / falling back to the thrower node — they ride here so +// the analyzer never has to issue a follow-up GetNode lookup. +type ThrowerErrorRow struct { + ThrowerID string + FilePath string + Line int + Throws int + ErrorTargets []string + ErrorMsgs []string +} + +// ThrowerErrorSurfacer is an optional capability backends MAY implement +// to evaluate the analyze(error_surface) rollup entirely inside the +// storage layer. The Go fallback walks EdgeThrows once for the per- +// thrower aggregation, then issues GetOutEdges per surviving thrower +// to attach the literal error-message strings. On Ladybug that's two +// scans of the edge table plus an N+1 cgo loop for the per-thrower +// emit walk; the capability runs two Cypher GROUP BYs and ships the +// pre-shaped rows back. +// +// pathPrefix narrows the EdgeThrows rows by their stored FilePath +// prefix; an empty prefix means "every thrower". Returned rows are +// already deduplicated per (thrower, error_target) and per (thrower, +// error_msg) — callers feed them directly into the analyzer's sort / +// truncate path without further bucketing. +// +// Optional capability — handleAnalyzeErrorSurface falls back to the +// AllEdges-driven loop when the backend doesn't implement it. +type ThrowerErrorSurfacer interface { + ThrowerErrorSurface(pathPrefix string) []ThrowerErrorRow +} + +// MemberMethodInfo is one row of the MemberMethodsByType projection. +// MethodID is the method node's id; Name is its name (the key the +// InferImplements method-set check compares against); FilePath / +// StartLine are the source coordinates InferOverrides stamps on the +// EdgeOverrides edge it emits; RepoPrefix lets consumers +// (ResolveGRPCStubCalls' pickGRPCHandler) tie-break on same-repo +// without a follow-up GetNode. +type MemberMethodInfo struct { + MethodID string + Name string + FilePath string + StartLine int + RepoPrefix string +} + +// MemberMethodsByType is an optional capability backends MAY implement +// to return the typeID → []MemberMethodInfo projection of every +// EdgeMemberOf edge whose source is a KindMethod node, in one backend +// round-trip. Replaces the InferImplements / InferOverrides Pass 1 +// pattern of EdgesByKind(EdgeMemberOf) followed by per-edge +// GetNode(e.From) to filter on Kind == KindMethod and read the +// method's columns. On Ladybug that loop is N+1 cgo: each method +// GetNode pulls ~10 string columns + the Meta blob over cgo just to +// read four scalar fields. The capability runs a single Cypher join, +// server-side, and ships only the four method columns the resolver +// actually consumes. +// +// Empty graph returns nil; types with no method members are absent +// from the result. The returned slice's elements are unique per +// MethodID — duplicated (typeID, methodID) pairs (a method +// member-of'd twice) collapse to one row. +// +// Optional capability — InferImplements / InferOverrides fall back to +// the per-edge GetNode walk when the backend doesn't implement it. +type MemberMethodsByType interface { + MemberMethodsByType() map[string][]MemberMethodInfo +} + +// StructuralParentEdgeRow is one tuple returned by StructuralParentEdges. +// FromID / ToID are the child / parent node IDs verbatim. FromKind / +// ToKind let the consumer apply the (Type | Interface) gate without a +// follow-up GetNode. Origin is the edge's resolution-tier label, which +// drives override-edge origin selection in InferOverrides. +type StructuralParentEdgeRow struct { + FromID string + ToID string + FromKind NodeKind + ToKind NodeKind + Origin string +} + +// StructuralParentEdges is an optional capability backends MAY +// implement to return every EdgeExtends / EdgeImplements / EdgeComposes +// edge whose endpoints are both KindType / KindInterface, projected as +// (FromID, ToID, FromKind, ToKind, Origin) in one backend round-trip. +// Replaces the InferOverrides Pass 2 pattern of g.AllEdges() followed +// by per-edge GetNode(e.From) + GetNode(e.To) to apply the kind gate. +// On Ladybug the AllEdges scan materialises every edge over cgo (~286k +// on the gortex workspace) plus issues two per-edge node lookups; the +// capability runs one Cypher join with kind filters on both sides and +// ships only the surviving rows back (typically a small fraction of +// the edge table). +// +// Empty graph returns nil. Rows from extends/implements/composes edges +// whose endpoints aren't both type/interface are filtered server-side +// — the consumer never has to gate them again. +// +// Optional capability — InferOverrides falls back to the AllEdges + +// per-edge GetNode walk when the backend doesn't implement it. +type StructuralParentEdges interface { + StructuralParentEdges() []StructuralParentEdgeRow +} + +// CrossRepoCandidateRow is one tuple returned by CrossRepoCandidates. +// Edge is the underlying base-kind edge verbatim — the consumer +// rewrites Edge.CrossRepo on it and emits a parallel cross_repo_* edge. +// FromRepo / ToRepo are the (already-distinct) source and target +// RepoPrefix values projected from the endpoint nodes. +type CrossRepoCandidateRow struct { + Edge *Edge + FromRepo string + ToRepo string +} + +// CrossRepoCandidates is an optional capability backends MAY implement +// to return every edge whose Kind has a parallel cross_repo_* kind AND +// whose endpoints carry two different non-empty RepoPrefix values, in +// one backend round-trip. Replaces the DetectCrossRepoEdges pattern of +// g.AllEdges() + per-edge GetNode(e.From) + GetNode(e.To) to extract +// the RepoPrefix pair. On Ladybug the AllEdges scan ships every edge +// in the graph over cgo plus issues two GetNode lookups per surviving +// row; the capability filters by edge kind + the repo-prefix mismatch +// server-side and ships only the surviving rows (typically a small +// fraction of the edge table on a multi-repo workspace). +// +// baseKinds is the set of edge kinds for which a CrossRepoKindFor +// mapping exists — the caller passes the list and the implementation +// MUST use exactly that set in the IN-list, so a single-repo graph +// (or a graph whose nodes carry no RepoPrefix) returns no rows. +// +// Optional capability — DetectCrossRepoEdges falls back to the +// AllEdges + per-edge GetNode loop when the backend doesn't implement +// it. +type CrossRepoCandidates interface { + CrossRepoCandidates(baseKinds []EdgeKind) []CrossRepoCandidateRow +} + +// ExtractCandidateRow is one tuple returned by ExtractCandidatesScanner. +// Caller / FanOut counts are distinct-by-endpoint (one caller counted +// once per (From, kind) pair, one callee counted once per (To, kind) +// pair) restricted to the call-like edge kinds the consumer cares +// about. LineCount is EndLine - StartLine + 1; rows whose StartLine or +// EndLine is zero are filtered server-side. +type ExtractCandidateRow struct { + NodeID string + Name string + FilePath string + StartLine int + EndLine int + LineCount int + CallerCount int + FanOut int +} + +// ExtractCandidatesScanner is an optional capability backends MAY +// implement to compute the get_extraction_candidates ranking in two +// Cypher round-trips (per-node caller-count and fan-out aggregation +// joined to the node table). Replaces the AllNodes() scan + per-node +// GetInEdges / GetOutEdges loop the handler used previously — on the +// gortex workspace that was ~30k node × 2 cgo trips per call, where +// each trip materialised the full edge bucket just to count +// distinct endpoints. The capability instead runs the count +// (DISTINCT-by-endpoint) inside the engine and ships only the rows +// that satisfy the three threshold gates. +// +// Empty kinds yields nothing — the handler always passes a non-empty +// set (EdgeCalls + EdgeCrossRepoCalls). pathPrefix narrows the scan to +// nodes under that file-path prefix; empty matches every path. The +// returned rows mirror the result of the Go-side loop verbatim: +// thresholds applied, line_count = EndLine - StartLine + 1. +// +// Optional capability — handleGetExtractionCandidates falls back to +// the AllNodes scan when the backend doesn't implement it. +type ExtractCandidatesScanner interface { + ExtractCandidates( + kinds []EdgeKind, + minLines, minCallers, minFanOut int, + pathPrefix string, + ) []ExtractCandidateRow +} + +// FileSymbolNameRow is one tuple returned by FileSymbolNamesByPaths. +// FilePath echoes the input slot; Name is one symbol name observed in +// the file (function / method / type / interface kinds only, matching +// symbolNamesInFile's Go-side filter). One file may produce many rows. +type FileSymbolNameRow struct { + FilePath string + Name string +} + +// FileSymbolNamesByPaths is an optional capability backends MAY +// implement to fetch the sorted distinct (file → function/method/type +// names) projection for a slice of file paths in one backend round- +// trip. Replaces the per-file GetFileNodes loop find_co_changing_symbols +// runs after a positive cochange match: 20 result rows × one +// `MATCH (n {file_path: $p})` query each on Ladybug. The capability +// runs a single `WHERE n.file_path IN $paths AND n.kind IN $kinds` +// query and ships one row per (file, name). +// +// Empty paths returns nil — never a whole-table scan. Rows for paths +// with no qualifying symbols are absent from the result; callers +// always index by file path and treat missing keys as "no names". +// +// Optional capability — symbolNamesInFile and its callers fall back to +// the per-file GetFileNodes loop when the backend doesn't implement +// it. +type FileSymbolNamesByPaths interface { + FileSymbolNamesByPaths(paths []string, kinds []NodeKind) []FileSymbolNameRow +} + +// ClassHierarchyRow is one tuple returned by ClassHierarchyTraverser. +// Path carries the node IDs visited from the seed (exclusive of the +// seed) out to the terminal node, in BFS order. EdgeKinds carries the +// per-hop edge kind so the caller can reconstruct the *Edge values. +// For a single hop Path has one element and EdgeKinds has one element; +// for a depth-N walk both slices have length N. +type ClassHierarchyRow struct { + Path []string + EdgeKinds []EdgeKind +} + +// ClassHierarchyTraverser is an optional capability backends MAY +// implement to compute the inheritance subgraph rooted at a seed in +// one (or two — up + down) Cypher variable-length traversals, server- +// side. Replaces the BFS in query.ClassHierarchy: each frontier node +// fired GetNode + GetInEdges or GetOutEdges per visit on Ladybug, so a +// depth-5 walk over an interface with a wide implementer set burned +// hundreds of cgo round-trips just to discover ~50 edges. +// +// kinds is the edge-kind set the walk consumes (EdgeExtends + +// EdgeImplements + EdgeComposes + EdgeOverrides). depth caps the hop +// budget. direction: +// - "up" — follow outgoing edges from each frontier node. +// - "down" — follow incoming edges into each frontier node. +// +// Empty kinds / depth <= 0 / unknown seed returns nil. The returned +// rows are deduplicated by (Path[-1], last EdgeKind) — the consumer +// reconstructs the visited node set and the edge list from them. +// +// Optional capability — query.ClassHierarchy falls back to the BFS +// when the backend doesn't implement it. +type ClassHierarchyTraverser interface { + ClassHierarchyTraverse( + seedID string, + direction string, + kinds []EdgeKind, + depth int, + ) []ClassHierarchyRow +} + +// FileEditingContext is an optional capability backends MAY +// implement to return the get_editing_context payload (defines + +// imports + 1-hop callers + 1-hop callees, all for one file) in a +// small fixed number of Cypher round-trips. Replaces the handler's +// per-symbol GetCallers / GetCallChain loop — for a file with 30 +// functions that fired 60 query-engine entry points on Ladybug. +// +// kinds is the set of node kinds the caller treats as call-targets +// (KindFunction + KindMethod). The capability returns FileNode (the +// file row), Defines (every non-file node anchored to the path, +// signature carried through Meta), Imports (the EdgeImports out-edges +// of the file node), CalledBy (one-hop callers of any defines node, +// filtered to symbols outside the file), and Calls (one-hop callees of +// any defines node, filtered to symbols outside the file). All five +// projections are scoped to the input file in one round-trip each. +// +// Optional capability — handleGetEditingContext falls back to the +// per-symbol loop when the backend doesn't implement it. +type FileEditingContextResult struct { + FileNode *Node + Defines []*Node + Imports []*Edge + CalledBy []*Node + Calls []*Node +} + +type FileEditingContext interface { + FileEditingContext(filePath string, kinds []NodeKind) *FileEditingContextResult +} + +// FileSubGraphReader is an optional capability backends MAY implement +// to return the full file neighbourhood — the file node, every node +// defined in or contained by it, and every adjacent edge — in a +// single backend round-trip. +// +// On the in-memory backend the per-id GetOutEdges / GetInEdges loop +// is already O(1) per node, so the query.Engine.GetFileSymbols +// fallback wraps it. On disk backends the same loop is +// O(file_symbols × cgo) — ~547 symbols on a real file fanned out into +// ~5 000 cgo round-trips just to dedup edges in Go. The capability +// lets Ladybug express the walk as one Cypher pattern match that +// uses the primary-key HASH index on Node.id plus the rel-table's +// FROM index on Edge — both already present without any DDL change. +// +// Returned slices are deduplicated by the implementation. Missing +// file returns (nil, nil); empty file (file node only, no symbols) +// returns ([file], nil). Callers that need the symbols-only view +// strip KindFile + KindImport on top (see +// internal/mcp/tools_core.go::stripFileAndImportNodes). +// +// Optional capability — query.Engine.GetFileSymbols falls back to +// GetFileNodes + GetOut/InEdgesByNodeIDs when the backend doesn't +// implement it. +type FileSubGraphReader interface { + GetFileSubGraph(filePath string) (nodes []*Node, edges []*Edge) +} + +// FrontierHop is one (edge, neighbour) pair from a FrontierExpander: an +// edge adjacent to a queried source node plus the node at its far end, +// with the neighbour's columns populated and Meta left nil (traversal +// callers don't read it). It lets a BFS record the edge and +// scope-check / materialise the neighbour without a GetNode per edge. +type FrontierHop struct { + Edge *Edge + Neighbor *Node +} + +// FrontierExpander is an optional backend capability: given a set of +// source node IDs it returns, in a single round-trip, their adjacent +// edges of the requested kinds plus the neighbour nodes — the +// node-edge-node projection a BFS frontier needs. forward=true follows +// outgoing edges (neighbour = edge target); forward=false follows +// incoming (neighbour = edge source). kinds must be non-empty (the +// directed-traversal contract). limit derives a deterministic per-call +// row cap so a hub node's fan-out can no longer be dragged across the +// boundary in full. +// +// query.Engine.bfs uses it when the reader implements it (the ladybug +// store) and falls back to per-node GetOutEdges/GetInEdges + GetNode +// otherwise — the in-memory graph needs no batching (its reads are O(1)). +type FrontierExpander interface { + ExpandFrontier(ids []string, forward bool, kinds []EdgeKind, limit int) []FrontierHop +} + +// FileSubGraphCountReader is the count-only sibling of +// FileSubGraphReader: returns the file's nodes plus the number of +// distinct edges adjacent to any of them, without materialising the +// edges themselves. +// +// The Ladybug headline cost for get_file_summary on a 500-symbol file +// was the ~4 000-row cgo crossing to ship every adjacent edge back to +// Go. The gcx and compact output paths only emit a total_edges scalar +// in their meta headers — never per-edge rows — so handleGetFileSummary +// routes gcx through this method and skips the row materialisation +// entirely. The json output path keeps the full GetFileSubGraph call +// because it serialises every edge in the body, and the compact path +// keeps it because it summarises edges per confidence label. +// +// On the in-memory backend the per-node edge bucket lookups are +// already O(1), so its implementation just counts via the same path +// GetFileSubGraph walks; the win is on disk backends. +// +// Optional capability — query.Engine.GetFileSymbolsCounts falls back +// to len(GetFileSubGraph().edges) when the backend doesn't implement +// it. +type FileSubGraphCountReader interface { + GetFileSubGraphCounts(filePath string) (nodes []*Node, edgeCount int) +} + +// NodeDegreeByKinds is an optional capability backends MAY implement +// to return per-node total in/out edge counts for every node whose +// kind is in the supplied set, server-side. Replaces the +// get_knowledge_gaps pattern of "give me all functions, then ask for +// their in/out degree" — on Ladybug that fed an IN-list of ~30k node +// IDs to the NodeDegreeCounts query, which has to compare every node +// against the list. The capability instead matches kinds at the +// source and groups by node — one Cypher per direction with a kind +// predicate the planner can index. +// +// pathPrefix narrows the scan to nodes under that file-path prefix; +// empty matches every path. Empty kinds returns nil (never a whole- +// graph scan). +// +// The returned rows mirror NodeDegreeRow's shape but UsageInCount is +// always 0 — knowledge_gaps does not need the usage subset, only the +// total degree. Adding the usage filter back would re-tie the +// capability to ClassifyZeroEdge's notion of "alive" without buying +// any other call site. +// +// Optional capability — handleGetKnowledgeGaps falls back to the +// NodeDegreeCounts IN-list when the backend doesn't implement it. +type NodeDegreeByKinds interface { + NodeDegreeByKinds(kinds []NodeKind, pathPrefix string) []NodeDegreeRow +} diff --git a/internal/graph/store_ladybug/algo.go b/internal/graph/store_ladybug/algo.go new file mode 100644 index 00000000..d4f46ca6 --- /dev/null +++ b/internal/graph/store_ladybug/algo.go @@ -0,0 +1,572 @@ +package store_ladybug + +import ( + "fmt" + "strings" + "sync" + "sync/atomic" + + lbug "github.com/LadybugDB/go-ladybug" + + "github.com/zzet/gortex/internal/graph" +) + +// algoProjectionName is the canonical name of the projected +// subgraph every algo CALL runs against. The projection is built +// once on demand and cached across algo invocations — withProjection +// only rebuilds when the cache key (node/edge filter) changes or +// the underlying graph mutates (Store.writeGen advanced). On +// gortex-scale graphs (313k+ edges) one PROJECT_GRAPH costs 30+s, +// so reusing it across consecutive algo runs is the difference +// between a 1.3 s analyze and a 63 s one. +const algoProjectionName = "GortexAlgo" + +// projectionCacheEntry remembers the last successful PROJECT_GRAPH +// declaration so a repeat algo call with the same filter can skip +// the rebuild. generation is Store.writeGen at the time the +// projection was built; a mismatch with the current writeGen means +// the underlying graph has mutated and the projection is stale. +type projectionCacheEntry struct { + valid bool + key string // canonicalised projectionOpts (nodeKinds + edgeKinds) + name string // active projection name (currently always algoProjectionName) + generation uint64 // Store.writeGen value when projection was built +} + +// algoState tracks the per-store algo-extension lifecycle and +// the cached PROJECT_GRAPH declaration. The extension-load +// sentinel is durable; the projection is rebuilt lazily on the +// first algo call that follows a graph mutation (writeGen change) +// or a different filter shape. +type algoState struct { + extensionLoaded atomic.Bool + projectionMu sync.Mutex // serialises projection-name use + cache mutation + projection projectionCacheEntry +} + +// ensureAlgoExtensionLocked loads the ALGO extension into the +// active connection. Same dance as ensureVectorExtensionLocked / +// ensureFTSExtensionLocked (INSTALL + LOAD EXTENSION); idempotent +// via the sentinel. Held under writeMu by the caller. +// +// INSTALL / LOAD run on the setup conn (the same connection every +// later projection-lifecycle and algo CALL goes through). Routing +// the entire ALGO path to s.conn is required: Ladybug binds +// projected-graph declarations to the *connection* that ran +// PROJECT_GRAPH — a pooled connection sees no projection from +// a sibling pool slot, surfacing as "Projected graph G does not +// exists" the moment the algo CALL lands on a different pool conn. +func (s *Store) ensureAlgoExtensionLocked() error { + if s.algo.extensionLoaded.Load() { + return nil + } + if err := runCypherOnSetupSafe(s, `INSTALL ALGO`); err != nil && + !strings.Contains(err.Error(), "is already installed") { + // Soft-ignore the "already installed" path — re-runs on the + // same on-disk store re-INSTALL and a benign duplicate + // shouldn't abort startup. + _ = err + } + if err := runCypherOnSetupSafe(s, `LOAD EXTENSION ALGO`); err != nil { + return fmt.Errorf("load algo extension: %w", err) + } + s.algo.extensionLoaded.Store(true) + return nil +} + +// projectionPredicate builds the per-table predicate map that +// PROJECT_GRAPH accepts when the caller wants to scope the algo +// to a subset of node kinds / edge kinds. Returns the literal +// predicate string ("'n.kind = "function" OR n.kind = "method"'") +// for substitution into the Cypher; an empty predicate falls +// through to the unfiltered list-of-tables form. +// +// Ladybug rejects predicates that reference more than one table, +// so node and edge predicates are emitted independently. +func projectionPredicates(opts projectionOpts) (nodePred, edgePred string) { + if len(opts.nodeKinds) > 0 { + parts := make([]string, 0, len(opts.nodeKinds)) + for _, k := range opts.nodeKinds { + parts = append(parts, fmt.Sprintf(`n.kind = %q`, string(k))) + } + nodePred = strings.Join(parts, " OR ") + } + if len(opts.edgeKinds) > 0 { + parts := make([]string, 0, len(opts.edgeKinds)) + for _, k := range opts.edgeKinds { + parts = append(parts, fmt.Sprintf(`r.kind = %q`, string(k))) + } + edgePred = strings.Join(parts, " OR ") + } + return nodePred, edgePred +} + +// projectionOpts is the union of every algo's per-call scoping +// knobs that map into PROJECT_GRAPH's filtered form. Each algo +// builds it from its public Opts struct. +type projectionOpts struct { + nodeKinds []graph.NodeKind + edgeKinds []graph.EdgeKind +} + +// cacheKey returns a canonical serialisation of the projection +// shape — two opts with the same node/edge kinds (any order) +// produce the same key, so the cached projection is reused for +// repeat algo calls that differ only in their tuning knobs +// (dampingFactor, maxIterations, …). The key is intentionally +// cheap: a small string concat is dwarfed by the algo CALL itself. +func (o projectionOpts) cacheKey() string { + // Sort for order-independence — callers may pass kinds in any + // order, and the projection itself is order-insensitive. + nodes := make([]string, len(o.nodeKinds)) + for i, k := range o.nodeKinds { + nodes[i] = string(k) + } + edges := make([]string, len(o.edgeKinds)) + for i, k := range o.edgeKinds { + edges[i] = string(k) + } + sortStrings(nodes) + sortStrings(edges) + return strings.Join(nodes, ",") + "|" + strings.Join(edges, ",") +} + +// sortStrings is a tiny insertion sort over a string slice — +// fine for the handful of node/edge kinds an algo opts struct +// ever carries; pulls no stdlib sort import in. +func sortStrings(xs []string) { + for i := 1; i < len(xs); i++ { + j := i + for j > 0 && xs[j-1] > xs[j] { + xs[j-1], xs[j] = xs[j], xs[j-1] + j-- + } + } +} + +// projectGraphLocked declares the named projection. If predicates +// are non-empty, the filtered form (map-of-table-to-predicate) is +// used; otherwise the simple list form. Caller must already hold +// writeMu and the algo.projectionMu (acquired by withProjection). +func (s *Store) projectGraphLocked(name string, opts projectionOpts) error { + nodePred, edgePred := projectionPredicates(opts) + var q string + switch { + case nodePred == "" && edgePred == "": + q = fmt.Sprintf(`CALL PROJECT_GRAPH('%s', ['Node'], ['Edge'])`, name) + default: + nodeArg := `['Node']` + if nodePred != "" { + nodeArg = fmt.Sprintf(`{'Node': '%s'}`, escapeCypherStringLit(nodePred)) + } + edgeArg := `['Edge']` + if edgePred != "" { + edgeArg = fmt.Sprintf(`{'Edge': '%s'}`, escapeCypherStringLit(edgePred)) + } + q = fmt.Sprintf(`CALL PROJECT_GRAPH('%s', %s, %s)`, name, nodeArg, edgeArg) + } + if err := runCypherOnSetupSafe(s, q); err != nil { + return fmt.Errorf("project graph %q: %w", name, err) + } + return nil +} + +// dropProjectionLocked tears down the named projection. Logs but +// does not propagate errors — a stale projection from a crashed +// run shouldn't block the next algo call. Pinned to the setup +// conn (same conn as projectGraphLocked) so the drop targets the +// right per-connection catalog. +func (s *Store) dropProjectionLocked(name string) { + _ = runCypherOnSetupSafe(s, fmt.Sprintf(`CALL DROP_PROJECTED_GRAPH('%s')`, name)) +} + +// withProjection wraps an algo CALL in the project → run lifecycle +// with a projection cache. The first call for a given (nodeKinds, +// edgeKinds) shape declares the projection; subsequent calls with +// the same shape and an unchanged Store.writeGen reuse it — no +// CALL PROJECT_GRAPH, no CALL DROP_PROJECTED_GRAPH. The cache is +// invalidated lazily: a mismatch between the cached generation and +// the live writeGen triggers a drop+rebuild on the next call. +// +// The algo.projectionMu mutex serialises projection-name reuse + +// cache mutation across concurrent algo invocations. writeMu is +// taken inside it so an unrelated write can't slip in between the +// generation read and the projection rebuild (which would race the +// cache into an apparently-fresh-but-actually-stale state). +// +// Why no drop after fn: the algo CALL is a read-only query against +// the projection — leaving the projection live across calls turns +// the second-and-later PageRank / Louvain / WCC / SCC / KCore call +// into a pure algorithm run instead of a full graph rebuild. On +// gortex-scale graphs (313k+ edges) that's the difference between +// ~1 s and ~30 s per call. +func (s *Store) withProjection(opts projectionOpts, fn func(name string) error) error { + s.algo.projectionMu.Lock() + defer s.algo.projectionMu.Unlock() + + s.writeMu.Lock() + defer s.writeMu.Unlock() + + if err := s.ensureAlgoExtensionLocked(); err != nil { + return err + } + + key := opts.cacheKey() + gen := s.writeGen.Load() + + // Fast path: cached projection still matches the requested + // shape AND the graph hasn't mutated since it was built. + if s.algo.projection.valid && + s.algo.projection.key == key && + s.algo.projection.generation == gen { + return fn(s.algo.projection.name) + } + + // Cache miss (different shape, stale generation, or first + // call). Drop the previous projection if one is live, then + // rebuild against the requested opts. The cache stays invalid + // across the rebuild so a PROJECT_GRAPH failure leaves us in + // a clean "no projection" state for the next call to retry. + if s.algo.projection.valid { + s.dropProjectionLocked(s.algo.projection.name) + s.algo.projection.valid = false + } + // Defensive drop for a stale projection from a prior crashed + // run (or a previous Open of the same on-disk store) that + // would otherwise make PROJECT_GRAPH fail with "graph G + // already exists". + s.dropProjectionLocked(algoProjectionName) + + if err := s.projectGraphLocked(algoProjectionName, opts); err != nil { + return err + } + s.algo.projection = projectionCacheEntry{ + valid: true, + key: key, + name: algoProjectionName, + generation: gen, + } + return fn(algoProjectionName) +} + +// dropCachedProjection tears down any cached projection. Called +// from Store.Close so the engine's catalog doesn't carry a +// dangling projection across the connection teardown. +func (s *Store) dropCachedProjection() { + s.algo.projectionMu.Lock() + defer s.algo.projectionMu.Unlock() + if !s.algo.projection.valid { + return + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + s.dropProjectionLocked(s.algo.projection.name) + s.algo.projection.valid = false +} + +// runCypherOnSetupSafe is runCypherSafe but pinned to the setup +// connection (s.conn) instead of round-tripping through the pool. +// The ALGO extension's CALL PROJECT_GRAPH binds the projection to +// the connection that ran it — every later CALL from a +// different pool connection would surface "Projected graph G +// does not exists". Pinning the entire projection lifecycle +// (INSTALL + LOAD + PROJECT_GRAPH + CALL + DROP) to s.conn +// guarantees per-connection consistency. +func runCypherOnSetupSafe(s *Store, query string) (err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(error); ok { + err = e + return + } + err = fmt.Errorf("%v", r) + } + }() + if s.conn == nil { + // Test fixtures may construct a Store{} without Open — fall + // back to the regular pool-aware path. + s.runWriteLocked(query, nil) + return nil + } + res, qerr := s.conn.Query(query) + if qerr != nil { + return qerr + } + res.Close() + return nil +} + +// querySelectOnSetupSafe is querySelectSafe pinned to the setup +// connection — same rationale as runCypherOnSetupSafe. +func querySelectOnSetupSafe(s *Store, query string, args map[string]any) (rows [][]any, err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(error); ok { + err = e + return + } + err = fmt.Errorf("%v", r) + } + }() + if s.conn == nil { + // Test fixtures may construct a Store{} without Open — fall + // back to the regular pool-aware path. + rows = s.querySelectLocked(query, args) + return rows, nil + } + var res *lbug.QueryResult + if len(args) == 0 { + res, err = s.conn.Query(query) + if err != nil { + return nil, err + } + } else { + stmt, perr := s.conn.Prepare(query) + if perr != nil { + return nil, fmt.Errorf("prepare: %w", perr) + } + defer stmt.Close() + res, err = s.conn.Execute(stmt, args) + if err != nil { + return nil, err + } + } + defer res.Close() + for res.HasNext() { + tup, terr := res.Next() + if terr != nil { + return rows, terr + } + vals, verr := tup.GetAsSlice() + if verr != nil { + tup.Close() + return rows, verr + } + rows = append(rows, vals) + tup.Close() + } + return rows, nil +} + +// PageRank computes PageRank centrality over a projected subgraph. +// Returns hits sorted by rank descending; the rank values sum to ~1 +// across the projection (Ladybug normalises initial scores by +// default). +// +// Zero-valued opts map to the backend's default tuning. The +// projection name and lifetime are managed internally — callers +// don't touch CALL PROJECT_GRAPH directly. +func (s *Store) PageRank(opts graph.PageRankOpts) ([]graph.PageRankHit, error) { + projOpts := projectionOpts{nodeKinds: opts.NodeKinds, edgeKinds: opts.EdgeKinds} + + // Build the page_rank CALL with only the overridden tuning + // knobs as named args. Leaving a knob out delegates to + // Ladybug's parallel-tuned defaults (dampingFactor=0.85, + // maxIterations=20, tolerance=1e-7). + var args []string + if opts.DampingFactor > 0 { + args = append(args, fmt.Sprintf("dampingFactor := %g", opts.DampingFactor)) + } + if opts.MaxIterations > 0 { + args = append(args, fmt.Sprintf("maxIterations := %d", opts.MaxIterations)) + } + if opts.Tolerance > 0 { + args = append(args, fmt.Sprintf("tolerance := %g", opts.Tolerance)) + } + knobs := "" + if len(args) > 0 { + knobs = ", " + strings.Join(args, ", ") + } + + limitClause := "" + if opts.Limit > 0 { + limitClause = fmt.Sprintf(" LIMIT %d", opts.Limit) + } + + var hits []graph.PageRankHit + err := s.withProjection(projOpts, func(name string) error { + q := fmt.Sprintf( + `CALL page_rank('%s'%s) RETURN node.id AS id, rank ORDER BY rank DESC%s`, + name, knobs, limitClause, + ) + rows, err := querySelectOnSetupSafe(s, q, nil) + if err != nil { + return fmt.Errorf("page_rank: %w", err) + } + hits = make([]graph.PageRankHit, 0, len(rows)) + for _, row := range rows { + if len(row) < 2 { + continue + } + id, _ := row[0].(string) + if id == "" { + continue + } + rank, _ := row[1].(float64) + hits = append(hits, graph.PageRankHit{NodeID: id, Rank: rank}) + } + return nil + }) + if err != nil { + return nil, err + } + return hits, nil +} + +// Louvain runs community detection over a projected subgraph and +// returns one hit per node with the integer community label the +// algorithm assigned. Ladybug treats edges as undirected when +// computing modularity even though the projected Edge table is +// directed — callers that care about directed modularity should +// run the in-process fallback (analysis.DetectCommunitiesLouvain). +// +// CommunityID values are opaque integers (Ladybug uses internal +// node offsets); two nodes with the same ID are in the same +// community, but the integer itself isn't stable across runs. +func (s *Store) Louvain(opts graph.CommunityOpts) ([]graph.CommunityHit, error) { + projOpts := projectionOpts{nodeKinds: opts.NodeKinds, edgeKinds: opts.EdgeKinds} + + var args []string + if opts.MaxPhases > 0 { + args = append(args, fmt.Sprintf("maxPhases := %d", opts.MaxPhases)) + } + if opts.MaxIterations > 0 { + args = append(args, fmt.Sprintf("maxIterations := %d", opts.MaxIterations)) + } + knobs := "" + if len(args) > 0 { + knobs = ", " + strings.Join(args, ", ") + } + + var hits []graph.CommunityHit + err := s.withProjection(projOpts, func(name string) error { + q := fmt.Sprintf( + `CALL louvain('%s'%s) RETURN node.id AS id, louvain_id`, + name, knobs, + ) + rows, err := querySelectOnSetupSafe(s, q, nil) + if err != nil { + return fmt.Errorf("louvain: %w", err) + } + hits = make([]graph.CommunityHit, 0, len(rows)) + for _, row := range rows { + if len(row) < 2 { + continue + } + id, _ := row[0].(string) + if id == "" { + continue + } + cid := asInt64(row[1]) + hits = append(hits, graph.CommunityHit{NodeID: id, CommunityID: cid}) + } + return nil + }) + if err != nil { + return nil, err + } + return hits, nil +} + +// WeaklyConnectedComponents runs WCC (undirected reachability) +// over a projected subgraph. Returns one hit per node with the +// integer component label; two nodes with the same ComponentID +// are in the same WCC. +func (s *Store) WeaklyConnectedComponents(opts graph.ComponentOpts) ([]graph.ComponentHit, error) { + return s.runComponentAlgo("weakly_connected_components", opts) +} + +// StronglyConnectedComponents runs SCC (directional mutual +// reachability) over a projected subgraph. Two nodes share an +// SCC iff they are mutually reachable along directed edges; SCCs +// of size > 1 are the cycle structure of the directed graph. +// +// Ladybug ships two SCC implementations — a BFS-based default +// (used here) and a Kosaraju DFS variant +// (strongly_connected_components_kosaraju) "recommended for sparse +// graphs or those with high diameter" per the docs. Callers that +// need Kosaraju behaviour can invoke graph_query directly. +func (s *Store) StronglyConnectedComponents(opts graph.ComponentOpts) ([]graph.ComponentHit, error) { + return s.runComponentAlgo("strongly_connected_components", opts) +} + +// KCoreDecomposition runs the k-core decomposition over a +// projected subgraph and returns one hit per node carrying its +// k-degree — the largest k for which the node stays in the +// k-core after iterative degree-< k pruning. +// +// Ladybug's CALL k_core_decomposition takes no tuning knobs +// (the algorithm always computes the full decomposition); the +// only per-call shaping comes from PROJECT_GRAPH's NodeKinds / +// EdgeKinds filter. +func (s *Store) KCoreDecomposition(opts graph.KCoreOpts) ([]graph.KCoreHit, error) { + projOpts := projectionOpts{nodeKinds: opts.NodeKinds, edgeKinds: opts.EdgeKinds} + + var hits []graph.KCoreHit + err := s.withProjection(projOpts, func(name string) error { + q := fmt.Sprintf( + `CALL k_core_decomposition('%s') RETURN node.id AS id, k_degree`, + name, + ) + rows, err := querySelectOnSetupSafe(s, q, nil) + if err != nil { + return fmt.Errorf("k_core_decomposition: %w", err) + } + hits = make([]graph.KCoreHit, 0, len(rows)) + for _, row := range rows { + if len(row) < 2 { + continue + } + id, _ := row[0].(string) + if id == "" { + continue + } + hits = append(hits, graph.KCoreHit{NodeID: id, KDegree: asInt64(row[1])}) + } + return nil + }) + if err != nil { + return nil, err + } + return hits, nil +} + +// runComponentAlgo is the shared shape for the two component +// algos. cypherCall is the algo's CALL name; both algos return +// the same (node, group_id) shape. +func (s *Store) runComponentAlgo(cypherCall string, opts graph.ComponentOpts) ([]graph.ComponentHit, error) { + projOpts := projectionOpts{nodeKinds: opts.NodeKinds, edgeKinds: opts.EdgeKinds} + + knobs := "" + if opts.MaxIterations > 0 { + knobs = fmt.Sprintf(", maxIterations := %d", opts.MaxIterations) + } + + var hits []graph.ComponentHit + err := s.withProjection(projOpts, func(name string) error { + q := fmt.Sprintf( + `CALL %s('%s'%s) RETURN node.id AS id, group_id`, + cypherCall, name, knobs, + ) + rows, err := querySelectOnSetupSafe(s, q, nil) + if err != nil { + return fmt.Errorf("%s: %w", cypherCall, err) + } + hits = make([]graph.ComponentHit, 0, len(rows)) + for _, row := range rows { + if len(row) < 2 { + continue + } + id, _ := row[0].(string) + if id == "" { + continue + } + hits = append(hits, graph.ComponentHit{NodeID: id, ComponentID: asInt64(row[1])}) + } + return nil + }) + if err != nil { + return nil, err + } + return hits, nil +} diff --git a/internal/graph/store_ladybug/algo_probe_test.go b/internal/graph/store_ladybug/algo_probe_test.go new file mode 100644 index 00000000..6914fe53 --- /dev/null +++ b/internal/graph/store_ladybug/algo_probe_test.go @@ -0,0 +1,139 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" + + "github.com/zzet/gortex/internal/graph" +) + +// TestAlgo_Probe walks the ALGO extension's surface: +// +// 1. INSTALL ALGO + LOAD EXTENSION ALGO (mirrors FTS / VECTOR dance) +// 2. CALL PROJECT_GRAPH('G', ['Node'], ['Edge']) — declare a projected +// subgraph the algos run over +// 3. CALL page_rank, louvain, weakly_connected_components, +// strongly_connected_components, k_core_decomposition each in turn +// against the projection +// 4. CALL DROP_PROJECTED_GRAPH('G') to clean up (we want to know if a +// projection is per-call or persistent) +// +// Liberal logging so the probe surfaces what works regardless of where +// the algo extension's surface lands relative to the docs. +func TestAlgo_Probe(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-algo-probe-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + // Step 1: install + load. INSTALL may report "already installed" on + // repeat runs — log and continue either way. + for _, q := range []string{`INSTALL ALGO`, `LOAD EXTENSION ALGO`} { + if err := tryRunCypher(s, q); err != nil { + t.Logf("%s: %v", q, err) + } else { + t.Logf("%s: ok", q) + } + } + + // Step 2: seed a small directed graph with two clear communities + // plus a hub node that ties them together. Layout: + // + // a -> b -> c -> a (triangle 1, SCC + community A) + // d -> e -> f -> d (triangle 2, SCC + community B) + // c -> d (bridge — makes it one WCC but two SCCs) + // hub <- a,b,c,d,e,f (incoming hub → high PageRank) + for _, n := range []*graph.Node{ + {ID: "a", Kind: graph.KindFunction, Name: "a", FilePath: "x.go"}, + {ID: "b", Kind: graph.KindFunction, Name: "b", FilePath: "x.go"}, + {ID: "c", Kind: graph.KindFunction, Name: "c", FilePath: "x.go"}, + {ID: "d", Kind: graph.KindFunction, Name: "d", FilePath: "y.go"}, + {ID: "e", Kind: graph.KindFunction, Name: "e", FilePath: "y.go"}, + {ID: "f", Kind: graph.KindFunction, Name: "f", FilePath: "y.go"}, + {ID: "hub", Kind: graph.KindFunction, Name: "hub", FilePath: "z.go"}, + } { + s.AddNode(n) + } + for _, e := range []*graph.Edge{ + {From: "a", To: "b", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "b", To: "c", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "c", To: "a", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "d", To: "e", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "e", To: "f", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "f", To: "d", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "c", To: "d", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "a", To: "hub", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "b", To: "hub", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "c", To: "hub", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "d", To: "hub", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "e", To: "hub", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "f", To: "hub", Kind: graph.EdgeCalls, FilePath: "y.go"}, + } { + s.AddEdge(e) + } + t.Logf("seeded %d nodes, %d edges", s.NodeCount(), s.EdgeCount()) + + // Step 3: declare the projection. Try the simple form first; fall + // back to alternate spellings if the binder rejects the literal. + for _, q := range []string{ + `CALL PROJECT_GRAPH('G', ['Node'], ['Edge'])`, + `CALL project_graph('G', ['Node'], ['Edge'])`, + } { + if err := tryRunCypher(s, q); err != nil { + t.Logf("%s: %v", q, err) + } else { + t.Logf("%s: ok", q) + break + } + } + + // Step 4: try every algo. Each is logged independently so a single + // missing function doesn't abort the others. + probes := []struct { + name string + q string + }{ + {"page_rank", `CALL page_rank('G') RETURN node.id AS id, rank ORDER BY rank DESC LIMIT 10`}, + {"page_rank_with_opts", `CALL page_rank('G', dampingFactor := 0.85, maxIterations := 20) RETURN node.id AS id, rank ORDER BY rank DESC LIMIT 10`}, + {"louvain", `CALL louvain('G') RETURN node.id AS id, louvain_id ORDER BY louvain_id LIMIT 20`}, + {"weakly_connected_components", `CALL weakly_connected_components('G') RETURN node.id AS id, group_id ORDER BY group_id LIMIT 20`}, + {"strongly_connected_components", `CALL strongly_connected_components('G') RETURN node.id AS id, group_id ORDER BY group_id LIMIT 20`}, + {"strongly_connected_components_kosaraju", `CALL strongly_connected_components_kosaraju('G') RETURN node.id AS id, group_id ORDER BY group_id LIMIT 20`}, + {"k_core_decomposition", `CALL k_core_decomposition('G') RETURN node.id AS id, k_degree ORDER BY k_degree DESC LIMIT 20`}, + } + for _, p := range probes { + rows, qerr := tryQueryCypher(s, p.q, nil) + if qerr != nil { + t.Logf("%s: error: %v", p.name, qerr) + continue + } + t.Logf("%s → %d rows", p.name, len(rows)) + for _, r := range rows { + t.Logf(" %v", r) + } + } + + // Step 5: drop the projection and see whether re-projecting is + // allowed. If not, projections are per-session / per-call. + for _, q := range []string{ + `CALL DROP_PROJECTED_GRAPH('G')`, + `CALL drop_projected_graph('G')`, + } { + if err := tryRunCypher(s, q); err != nil { + t.Logf("%s: %v", q, err) + } else { + t.Logf("%s: ok", q) + break + } + } +} diff --git a/internal/graph/store_ladybug/algo_test.go b/internal/graph/store_ladybug/algo_test.go new file mode 100644 index 00000000..837ca899 --- /dev/null +++ b/internal/graph/store_ladybug/algo_test.go @@ -0,0 +1,373 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// seedAlgoTestGraph builds the same hub-and-spoke graph the probe +// used. Two SCC triangles + a hub that every node points at — gives +// PageRank, SCC, Louvain, and K-Core a predictable answer to test +// against without needing a big real corpus. +func seedAlgoTestGraph(t *testing.T) *Store { + t.Helper() + dir, err := os.MkdirTemp("", "lbug-algo-test-*") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + for _, n := range []*graph.Node{ + {ID: "a", Kind: graph.KindFunction, Name: "a", FilePath: "x.go"}, + {ID: "b", Kind: graph.KindFunction, Name: "b", FilePath: "x.go"}, + {ID: "c", Kind: graph.KindFunction, Name: "c", FilePath: "x.go"}, + {ID: "d", Kind: graph.KindFunction, Name: "d", FilePath: "y.go"}, + {ID: "e", Kind: graph.KindFunction, Name: "e", FilePath: "y.go"}, + {ID: "f", Kind: graph.KindFunction, Name: "f", FilePath: "y.go"}, + {ID: "hub", Kind: graph.KindFunction, Name: "hub", FilePath: "z.go"}, + } { + s.AddNode(n) + } + for _, e := range []*graph.Edge{ + {From: "a", To: "b", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "b", To: "c", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "c", To: "a", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "d", To: "e", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "e", To: "f", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "f", To: "d", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "c", To: "d", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "a", To: "hub", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "b", To: "hub", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "c", To: "hub", Kind: graph.EdgeCalls, FilePath: "x.go"}, + {From: "d", To: "hub", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "e", To: "hub", Kind: graph.EdgeCalls, FilePath: "y.go"}, + {From: "f", To: "hub", Kind: graph.EdgeCalls, FilePath: "y.go"}, + } { + s.AddEdge(e) + } + return s +} + +func TestPageRanker_RanksHubFirst(t *testing.T) { + s := seedAlgoTestGraph(t) + hits, err := s.PageRank(graph.PageRankOpts{}) + require.NoError(t, err) + require.GreaterOrEqual(t, len(hits), 7) + + // Hub has six incoming edges (every other node calls it) while + // triangle nodes only have one or two — PageRank must rank hub + // first by a clear margin. + assert.Equal(t, "hub", hits[0].NodeID, + "hub should rank #1; got %v", hits) + assert.Greater(t, hits[0].Rank, hits[1].Rank*1.5, + "hub rank should dominate next-highest by at least 1.5x; got hits=%v", hits) +} + +func TestPageRanker_RespectsLimit(t *testing.T) { + s := seedAlgoTestGraph(t) + hits, err := s.PageRank(graph.PageRankOpts{Limit: 3}) + require.NoError(t, err) + assert.Len(t, hits, 3, "Limit=3 must cap the result at 3 rows") +} + +func TestPageRanker_RespectsNodeKindFilter(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-algo-filter-*") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + // Two kinds. Only KindFunction should appear when we filter for it. + for _, n := range []*graph.Node{ + {ID: "fn1", Kind: graph.KindFunction, Name: "fn1", FilePath: "x.go"}, + {ID: "fn2", Kind: graph.KindFunction, Name: "fn2", FilePath: "x.go"}, + {ID: "ty1", Kind: graph.KindType, Name: "ty1", FilePath: "x.go"}, + } { + s.AddNode(n) + } + s.AddEdge(&graph.Edge{From: "fn1", To: "fn2", Kind: graph.EdgeCalls, FilePath: "x.go"}) + s.AddEdge(&graph.Edge{From: "fn1", To: "ty1", Kind: graph.EdgeReferences, FilePath: "x.go"}) + + hits, err := s.PageRank(graph.PageRankOpts{ + NodeKinds: []graph.NodeKind{graph.KindFunction}, + }) + require.NoError(t, err) + for _, h := range hits { + assert.NotEqual(t, "ty1", h.NodeID, "type node should be excluded by NodeKinds filter; got %v", hits) + } +} + +func TestPageRanker_RespectsTuningKnobs(t *testing.T) { + s := seedAlgoTestGraph(t) + // A high damping factor with very few iterations should still + // produce hub-first ordering — this just exercises the named-arg + // path so a future binder change can't silently break it. + hits, err := s.PageRank(graph.PageRankOpts{ + DampingFactor: 0.9, + MaxIterations: 5, + Tolerance: 1e-4, + Limit: 3, + }) + require.NoError(t, err) + require.Len(t, hits, 3) + assert.Equal(t, "hub", hits[0].NodeID) +} + +// TestPageRanker_ConsecutiveCallsDoNotLeak validates the project → +// run → drop lifecycle: two back-to-back calls must succeed even +// though they reuse the same projection name. A leaked projection +// from call 1 would make call 2's PROJECT_GRAPH error out. +func TestPageRanker_ConsecutiveCallsDoNotLeak(t *testing.T) { + s := seedAlgoTestGraph(t) + for i := 0; i < 3; i++ { + hits, err := s.PageRank(graph.PageRankOpts{Limit: 1}) + require.NoError(t, err, "consecutive PageRank call %d must succeed", i) + require.Len(t, hits, 1) + assert.Equal(t, "hub", hits[0].NodeID) + } +} + +func TestCommunityDetector_FindsTwoCommunities(t *testing.T) { + s := seedAlgoTestGraph(t) + hits, err := s.Louvain(graph.CommunityOpts{}) + require.NoError(t, err) + require.Len(t, hits, 7) + + // Group hits by community ID. + byComm := map[int64][]string{} + for _, h := range hits { + byComm[h.CommunityID] = append(byComm[h.CommunityID], h.NodeID) + } + assert.GreaterOrEqual(t, len(byComm), 2, + "Louvain should find at least 2 communities for the two-triangle graph; got %v", byComm) + + // Members of the same triangle should land in the same community. + commFor := map[string]int64{} + for _, h := range hits { + commFor[h.NodeID] = h.CommunityID + } + assert.Equal(t, commFor["a"], commFor["b"], + "a + b should be in the same community (triangle 1); got %v", commFor) + assert.Equal(t, commFor["b"], commFor["c"], + "b + c should be in the same community (triangle 1); got %v", commFor) + assert.Equal(t, commFor["d"], commFor["e"], + "d + e should be in the same community (triangle 2); got %v", commFor) + assert.Equal(t, commFor["e"], commFor["f"], + "e + f should be in the same community (triangle 2); got %v", commFor) +} + +func TestCommunityDetector_RespectsTuningKnobs(t *testing.T) { + s := seedAlgoTestGraph(t) + hits, err := s.Louvain(graph.CommunityOpts{ + MaxPhases: 5, + MaxIterations: 5, + }) + require.NoError(t, err) + require.Len(t, hits, 7) +} + +// TestCommunityDetector_ConsecutiveCallsDoNotLeak — identical +// project → run → drop hygiene check as the PageRanker side. +func TestCommunityDetector_ConsecutiveCallsDoNotLeak(t *testing.T) { + s := seedAlgoTestGraph(t) + for i := 0; i < 3; i++ { + hits, err := s.Louvain(graph.CommunityOpts{}) + require.NoError(t, err, "consecutive Louvain call %d must succeed", i) + require.Len(t, hits, 7) + } +} + +// TestAlgo_PageRankThenLouvain — interleaved different-algo calls +// must not stomp on each other's projection. Catches a regression +// where the algoProjectionName collision between two distinct +// algos would surface as a "graph G already exists" binder error. +func TestAlgo_PageRankThenLouvain(t *testing.T) { + s := seedAlgoTestGraph(t) + prHits, err := s.PageRank(graph.PageRankOpts{Limit: 1}) + require.NoError(t, err) + require.Len(t, prHits, 1) + + louvainHits, err := s.Louvain(graph.CommunityOpts{}) + require.NoError(t, err) + require.Len(t, louvainHits, 7) +} + +func TestComponentFinder_WCC_OneComponent(t *testing.T) { + s := seedAlgoTestGraph(t) + hits, err := s.WeaklyConnectedComponents(graph.ComponentOpts{}) + require.NoError(t, err) + require.Len(t, hits, 7) + // Hub + both triangles are one undirected component (the bridge + // c -> d unifies them) — every node must share the same group_id. + first := hits[0].ComponentID + for _, h := range hits { + assert.Equal(t, first, h.ComponentID, + "all 7 nodes should be in one WCC; got %v", hits) + } +} + +func TestComponentFinder_SCC_ThreeComponents(t *testing.T) { + s := seedAlgoTestGraph(t) + hits, err := s.StronglyConnectedComponents(graph.ComponentOpts{}) + require.NoError(t, err) + require.Len(t, hits, 7) + + // Index by node ID. + commFor := map[string]int64{} + for _, h := range hits { + commFor[h.NodeID] = h.ComponentID + } + // Triangle 1 = {a, b, c} must all share one SCC. + assert.Equal(t, commFor["a"], commFor["b"]) + assert.Equal(t, commFor["b"], commFor["c"]) + // Triangle 2 = {d, e, f} must all share one SCC. + assert.Equal(t, commFor["d"], commFor["e"]) + assert.Equal(t, commFor["e"], commFor["f"]) + // Triangle 1 and triangle 2 must be DIFFERENT SCCs (no path + // back from d to c). + assert.NotEqual(t, commFor["a"], commFor["d"], + "the two triangles must be separate SCCs; got %v", commFor) + // Hub is its own SCC (no inbound calls from any node it points at). + assert.NotEqual(t, commFor["hub"], commFor["a"]) + assert.NotEqual(t, commFor["hub"], commFor["d"]) +} + +func TestComponentFinder_SCC_RespectsMaxIterations(t *testing.T) { + s := seedAlgoTestGraph(t) + hits, err := s.StronglyConnectedComponents(graph.ComponentOpts{MaxIterations: 5}) + require.NoError(t, err) + require.Len(t, hits, 7) +} + +func TestKCorer_FindsCore(t *testing.T) { + s := seedAlgoTestGraph(t) + hits, err := s.KCoreDecomposition(graph.KCoreOpts{}) + require.NoError(t, err) + require.Len(t, hits, 7) + // Every node in the hub-and-spoke + two-triangle graph has at + // least 3 neighbours when edges are treated as undirected, so + // k_degree of every node should be exactly 3 (the whole graph + // is its own 3-core). + for _, h := range hits { + assert.Equal(t, int64(3), h.KDegree, + "every node should have k-degree 3; got %v", hits) + } +} + +func TestKCorer_ConsecutiveCallsDoNotLeak(t *testing.T) { + s := seedAlgoTestGraph(t) + for i := 0; i < 3; i++ { + hits, err := s.KCoreDecomposition(graph.KCoreOpts{}) + require.NoError(t, err, "consecutive KCore call %d must succeed", i) + require.Len(t, hits, 7) + } +} + +// TestAlgo_ProjectionCachedAcrossCalls is the proof point for the +// projection-cache fast path: two consecutive PageRank calls with +// identical opts must reuse the same projection. Track via the +// generation field on algo.projection — it is stamped with +// Store.writeGen at the time PROJECT_GRAPH was run, so observing +// the same generation across two calls means PROJECT_GRAPH ran +// exactly once. +// +// On real-scale graphs (Ladybug + gortex's 313k+ edges) a cache +// miss costs 30+s for the rebuild; a hit is ~0 ms. This test +// asserts hit behaviour on the small synthetic graph where both +// paths are fast — what we're really checking is the cache key +// math and the writeGen comparison. +func TestAlgo_ProjectionCachedAcrossCalls(t *testing.T) { + s := seedAlgoTestGraph(t) + + // First PageRank: cache miss, projection is built. + _, err := s.PageRank(graph.PageRankOpts{Limit: 1}) + require.NoError(t, err) + require.True(t, s.algo.projection.valid, "projection should be cached after first call") + firstGen := s.algo.projection.generation + firstKey := s.algo.projection.key + firstName := s.algo.projection.name + + // Second PageRank with identical opts: cache hit, projection + // reused. The cached generation must NOT advance (no writes + // happened between calls) — proves the projection was reused, + // not rebuilt. + _, err = s.PageRank(graph.PageRankOpts{Limit: 1}) + require.NoError(t, err) + require.True(t, s.algo.projection.valid, "projection should still be cached") + assert.Equal(t, firstGen, s.algo.projection.generation, + "generation must not advance between two same-opts calls — proves the cached projection was reused, not rebuilt") + assert.Equal(t, firstKey, s.algo.projection.key) + assert.Equal(t, firstName, s.algo.projection.name) + + // Third call: different algo (Louvain) with the same shape — + // the cache key is shape-only so this must also hit the cache. + _, err = s.Louvain(graph.CommunityOpts{}) + require.NoError(t, err) + assert.Equal(t, firstGen, s.algo.projection.generation, + "different algos with the same projection shape must share the cached projection") +} + +// TestAlgo_ProjectionRebuiltAfterWrite confirms lazy invalidation: +// after a write bumps Store.writeGen, the next algo call must +// detect the mismatch and rebuild the projection. The cached +// generation should advance to the new writeGen value. +func TestAlgo_ProjectionRebuiltAfterWrite(t *testing.T) { + s := seedAlgoTestGraph(t) + + _, err := s.PageRank(graph.PageRankOpts{Limit: 1}) + require.NoError(t, err) + require.True(t, s.algo.projection.valid) + preWriteGen := s.algo.projection.generation + + // Add a new node — bumps writeGen and invalidates the cache. + s.AddNode(&graph.Node{ + ID: "extra", Kind: graph.KindFunction, Name: "extra", FilePath: "z.go", + }) + require.Greater(t, s.writeGen.Load(), preWriteGen, + "AddNode must advance writeGen") + + // Next algo call must rebuild. The cached generation should + // now match the post-write writeGen. + _, err = s.PageRank(graph.PageRankOpts{Limit: 1}) + require.NoError(t, err) + require.True(t, s.algo.projection.valid) + assert.Greater(t, s.algo.projection.generation, preWriteGen, + "projection generation must advance after a write — proves the cache was invalidated and the projection rebuilt") + assert.Equal(t, s.writeGen.Load(), s.algo.projection.generation, + "rebuilt projection's generation must equal current writeGen") +} + +// TestAlgo_ProjectionRebuiltOnShapeChange covers the +// different-opts cache miss: a PageRank with a NodeKinds filter +// must rebuild against the filtered shape after an unfiltered +// PageRank built the broad projection. The cache key changes, so +// the entry must be replaced. +func TestAlgo_ProjectionRebuiltOnShapeChange(t *testing.T) { + s := seedAlgoTestGraph(t) + + _, err := s.PageRank(graph.PageRankOpts{Limit: 1}) + require.NoError(t, err) + require.True(t, s.algo.projection.valid) + broadKey := s.algo.projection.key + + // Different shape — explicit NodeKinds filter. + _, err = s.PageRank(graph.PageRankOpts{ + NodeKinds: []graph.NodeKind{graph.KindFunction}, + Limit: 1, + }) + require.NoError(t, err) + require.True(t, s.algo.projection.valid) + assert.NotEqual(t, broadKey, s.algo.projection.key, + "different opts must produce a different cache key") +} diff --git a/internal/graph/store_ladybug/analysis_adjacency.go b/internal/graph/store_ladybug/analysis_adjacency.go new file mode 100644 index 00000000..21c7f909 --- /dev/null +++ b/internal/graph/store_ladybug/analysis_adjacency.go @@ -0,0 +1,190 @@ +package store_ladybug + +import ( + "iter" + + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the adjacency-shaped +// pushdown capabilities for the betweenness + hotspots wave. A drift +// in any signature fails the build here instead of silently dropping +// to the Go-loop fallback. +var ( + _ graph.EdgeAdjacencyForKinds = (*Store)(nil) + _ graph.CommunityCrossingsByKind = (*Store)(nil) + _ graph.NodeIDsByKinds = (*Store)(nil) +) + +// EdgeAdjacencyForKinds returns (from, to) id pairs for every edge +// whose Kind is in edgeKinds AND whose endpoints both have a Kind in +// nodeKinds. Replaces the EdgesByKinds-then-filter pass the +// betweenness adjacency build used to run — every per-edge row +// carried ~10 string columns over cgo just for the From/To pair, and +// the cross-kind edges (where one endpoint isn't a function/method) +// flowed through cgo too even though the caller discarded them. +// +// The capability returns a 2-column projection from a single Cypher +// join. The IN-list dedup matches the EdgesByKinds contract. +func (s *Store) EdgeAdjacencyForKinds(edgeKinds []graph.EdgeKind, nodeKinds []graph.NodeKind) iter.Seq[[2]string] { + if len(edgeKinds) == 0 || len(nodeKinds) == 0 { + return func(yield func([2]string) bool) {} + } + eKinds := edgeKindSliceToAny(dedupeEdgeKinds(edgeKinds)) + if len(eKinds) == 0 { + return func(yield func([2]string) bool) {} + } + nKinds := nodeKindSliceToAny(dedupeNodeKinds(nodeKinds)) + if len(nKinds) == 0 { + return func(yield func([2]string) bool) {} + } + const q = ` +MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE e.kind IN $ekinds + AND a.kind IN $nkinds + AND b.kind IN $nkinds +RETURN a.id, b.id` + rows := s.querySelect(q, map[string]any{ + "ekinds": eKinds, + "nkinds": nKinds, + }) + if len(rows) >= mallocTrimRowThreshold { + mallocTrim() + } + return func(yield func([2]string) bool) { + for _, r := range rows { + if len(r) < 2 { + continue + } + from, _ := r[0].(string) + to, _ := r[1].(string) + if from == "" || to == "" { + continue + } + if !yield([2]string{from, to}) { + return + } + } + } +} + +// CommunityCrossingsByKind ships only the (from, to) projection of +// edges whose Kind is in the supplied set and lets the Go side do +// the community comparison. Community membership is not a Node +// column — it's computed at runtime by the analyzer — so the +// comparison can't live in Cypher today. The win is the column +// projection: where FindHotspots.countCrossings used to pull the +// full edge row (~10 columns) twice (once per kind) over cgo, this +// single call returns 2 columns from one IN-list join. +// +// Zero-count sources are dropped so callers can probe existence +// without a >0 check. +func (s *Store) CommunityCrossingsByKind(kinds []graph.EdgeKind, nodeToComm map[string]string) map[string]int { + if len(kinds) == 0 || len(nodeToComm) == 0 { + return nil + } + allowed := edgeKindSliceToAny(dedupeEdgeKinds(kinds)) + if len(allowed) == 0 { + return nil + } + const q = ` +MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE e.kind IN $kinds +RETURN a.id, b.id` + rows := s.querySelect(q, map[string]any{"kinds": allowed}) + if len(rows) == 0 { + return nil + } + out := make(map[string]int) + for _, r := range rows { + if len(r) < 2 { + continue + } + from, _ := r[0].(string) + to, _ := r[1].(string) + if from == "" || to == "" { + continue + } + fc := nodeToComm[from] + tc := nodeToComm[to] + if fc == "" || tc == "" || fc == tc { + continue + } + out[from]++ + } + if len(rows) >= mallocTrimRowThreshold { + mallocTrim() + } + if len(out) == 0 { + return nil + } + return out +} + +// NodeIDsByKinds returns the IDs of every node whose Kind is in the +// supplied set. Identical filter shape to NodesByKinds, but ships +// only the id column — one C string per row instead of ~10. On the +// gortex workspace the betweenness/hotspots candidate set is ~4k +// rows; the projection cuts the cgo string-alloc count by an order +// of magnitude per call. +func (s *Store) NodeIDsByKinds(kinds []graph.NodeKind) []string { + if len(kinds) == 0 { + return nil + } + allowed := nodeKindSliceToAny(dedupeNodeKinds(kinds)) + if len(allowed) == 0 { + return nil + } + const q = `MATCH (n:Node) WHERE n.kind IN $kinds RETURN n.id` + rows := s.querySelect(q, map[string]any{"kinds": allowed}) + if len(rows) == 0 { + return nil + } + out := make([]string, 0, len(rows)) + for _, r := range rows { + if len(r) < 1 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + out = append(out, id) + } + return out +} + +// dedupeNodeKinds is the node-kind counterpart of dedupeEdgeKinds — +// the kinds-IN scanners use it to collapse repeats so the Cypher +// IN-list matches the in-memory reference's behaviour. +func dedupeNodeKinds(kinds []graph.NodeKind) []graph.NodeKind { + if len(kinds) == 0 { + return nil + } + seen := make(map[graph.NodeKind]struct{}, len(kinds)) + out := make([]graph.NodeKind, 0, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + out = append(out, k) + } + return out +} + +// nodeKindSliceToAny converts a deduped node-kind slice into the +// []any shape the Cypher binding expects for IN-list parameters. +func nodeKindSliceToAny(kinds []graph.NodeKind) []any { + if len(kinds) == 0 { + return nil + } + out := make([]any, 0, len(kinds)) + for _, k := range kinds { + out = append(out, string(k)) + } + return out +} diff --git a/internal/graph/store_ladybug/analysis_aggregates.go b/internal/graph/store_ladybug/analysis_aggregates.go new file mode 100644 index 00000000..2fd8fbcd --- /dev/null +++ b/internal/graph/store_ladybug/analysis_aggregates.go @@ -0,0 +1,262 @@ +package store_ladybug + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the per-node aggregate +// capabilities so the analyzers pick the server-side path via type +// assertion. A drift in either signature fails the build here instead +// of silently falling back to the Go loop. +var ( + _ graph.NodeDegreeAggregator = (*Store)(nil) + _ graph.NodeFanAggregator = (*Store)(nil) + _ graph.EdgesByKindsScanner = (*Store)(nil) +) + +// NodeDegreeCounts evaluates per-node in/out/usage edge counts +// entirely inside Ladybug. Two Cypher queries: one for in-edges (and +// the usage subset), one for out-edges. The alternative — looping +// GetInEdges/GetOutEdges per node — fires 2N cgo round-trips and +// materialises every edge struct just to len() it. On the gortex +// workspace that loop fed GraphConnectivity ~133k nodes × 2 calls, +// each materialising the full edge bucket → ~95s wall and a sustained +// allocation spike. The aggregated path returns N compact rows in +// two queries. +// +// COUNT { ... } sub-queries return the bucket size without +// materialising the edges, which is what we actually want here. +func (s *Store) NodeDegreeCounts(ids []string, usageKinds []graph.EdgeKind) []graph.NodeDegreeRow { + if len(ids) == 0 { + return nil + } + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + usage := make([]any, 0, len(usageKinds)) + usageSeen := make(map[graph.EdgeKind]struct{}, len(usageKinds)) + for _, k := range usageKinds { + if _, ok := usageSeen[k]; ok { + continue + } + usageSeen[k] = struct{}{} + usage = append(usage, string(k)) + } + + // One pass for in-counts (total + usage subset). Selecting both + // in the same projection halves the cgo round-trips compared with + // running the usage filter separately. + inQuery := ` +MATCH (n:Node) +WHERE n.id IN $ids +RETURN n.id, + COUNT { MATCH (:Node)-[:Edge]->(n) }, + COUNT { MATCH (:Node)-[e:Edge]->(n) WHERE e.kind IN $usage }` + if len(usage) == 0 { + // No usage filter requested — drop the second COUNT to skip + // the empty-IN-list edge case and shave a few µs from the + // planner. + inQuery = ` +MATCH (n:Node) +WHERE n.id IN $ids +RETURN n.id, + COUNT { MATCH (:Node)-[:Edge]->(n) }, + 0` + } + inArgs := map[string]any{"ids": stringSliceToAny(uniq)} + if len(usage) > 0 { + inArgs["usage"] = usage + } + inRows := s.querySelect(inQuery, inArgs) + + const outQuery = ` +MATCH (n:Node) +WHERE n.id IN $ids +RETURN n.id, COUNT { MATCH (n)-[:Edge]->(:Node) }` + outRows := s.querySelect(outQuery, map[string]any{"ids": stringSliceToAny(uniq)}) + + byID := make(map[string]*graph.NodeDegreeRow, len(uniq)) + for _, r := range inRows { + if len(r) < 3 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + byID[id] = &graph.NodeDegreeRow{ + NodeID: id, + InCount: int(asInt64(r[1])), + UsageInCount: int(asInt64(r[2])), + } + } + for _, r := range outRows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + row, ok := byID[id] + if !ok { + // Node had outgoing edges but no incoming (or vice + // versa). Build the row from this pass so neither + // direction is silently dropped. + row = &graph.NodeDegreeRow{NodeID: id} + byID[id] = row + } + row.OutCount = int(asInt64(r[1])) + } + + out := make([]graph.NodeDegreeRow, 0, len(byID)) + for _, id := range uniq { + if row, ok := byID[id]; ok { + out = append(out, *row) + } + } + return out +} + +// NodeFanCounts evaluates per-node fan-in / fan-out counts filtered +// by edge kind entirely inside Ladybug. Two Cypher queries, one per +// direction. Replaces the AllEdges() scan that FindHotspots and +// handleAnalyzeHealthScore both ran every call — on the gortex +// workspace that was ~500k edge rows over cgo just to compute four +// integers per node. +// +// Empty fanInKinds / fanOutKinds short-circuits that direction's +// query — the Cypher planner does not love an empty IN-list and the +// caller already encoded "no fan" by passing nil. +func (s *Store) NodeFanCounts(ids []string, fanInKinds []graph.EdgeKind, fanOutKinds []graph.EdgeKind) []graph.NodeFanRow { + if len(ids) == 0 { + return nil + } + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + + byID := make(map[string]*graph.NodeFanRow, len(uniq)) + ensure := func(id string) *graph.NodeFanRow { + row, ok := byID[id] + if !ok { + row = &graph.NodeFanRow{NodeID: id} + byID[id] = row + } + return row + } + + if inKinds := dedupeEdgeKinds(fanInKinds); len(inKinds) > 0 { + const q = ` +MATCH (n:Node) +WHERE n.id IN $ids +RETURN n.id, COUNT { MATCH (:Node)-[e:Edge]->(n) WHERE e.kind IN $kinds }` + rows := s.querySelect(q, map[string]any{ + "ids": stringSliceToAny(uniq), + "kinds": edgeKindSliceToAny(inKinds), + }) + for _, r := range rows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + ensure(id).FanIn = int(asInt64(r[1])) + } + } + + if outKinds := dedupeEdgeKinds(fanOutKinds); len(outKinds) > 0 { + const q = ` +MATCH (n:Node) +WHERE n.id IN $ids +RETURN n.id, COUNT { MATCH (n)-[e:Edge]->(:Node) WHERE e.kind IN $kinds }` + rows := s.querySelect(q, map[string]any{ + "ids": stringSliceToAny(uniq), + "kinds": edgeKindSliceToAny(outKinds), + }) + for _, r := range rows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + ensure(id).FanOut = int(asInt64(r[1])) + } + } + + // When BOTH directions are filtered out, the caller asked for + // nothing — return an empty row per known id rather than nil, + // matching the in-memory reference's behaviour. + if len(byID) == 0 { + out := make([]graph.NodeFanRow, 0, len(uniq)) + for _, id := range uniq { + out = append(out, graph.NodeFanRow{NodeID: id}) + } + // Honour the contract that unknown ids are elided — when + // neither direction matched ANY id, the result is empty. + // Filter by membership in the node table. + const probe = `MATCH (n:Node) WHERE n.id IN $ids RETURN n.id` + seen := make(map[string]struct{}, len(uniq)) + for _, r := range s.querySelect(probe, map[string]any{"ids": stringSliceToAny(uniq)}) { + if len(r) < 1 { + continue + } + id, _ := r[0].(string) + if id != "" { + seen[id] = struct{}{} + } + } + filtered := out[:0] + for _, row := range out { + if _, ok := seen[row.NodeID]; ok { + filtered = append(filtered, row) + } + } + return filtered + } + + out := make([]graph.NodeFanRow, 0, len(byID)) + for _, id := range uniq { + if row, ok := byID[id]; ok { + out = append(out, *row) + } + } + return out +} + +// dedupeEdgeKinds returns a stable, dedup'd copy of kinds with empty +// values removed. +func dedupeEdgeKinds(kinds []graph.EdgeKind) []graph.EdgeKind { + if len(kinds) == 0 { + return nil + } + seen := make(map[graph.EdgeKind]struct{}, len(kinds)) + out := make([]graph.EdgeKind, 0, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + out = append(out, k) + } + return out +} + +// edgeKindSliceToAny converts an EdgeKind slice to []any for Kuzu +// parameter binding (which expects []any for IN-list parameters). +func edgeKindSliceToAny(kinds []graph.EdgeKind) []any { + out := make([]any, 0, len(kinds)) + for _, k := range kinds { + out = append(out, string(k)) + } + return out +} diff --git a/internal/graph/store_ladybug/analysis_deadcode.go b/internal/graph/store_ladybug/analysis_deadcode.go new file mode 100644 index 00000000..b95387f6 --- /dev/null +++ b/internal/graph/store_ladybug/analysis_deadcode.go @@ -0,0 +1,136 @@ +package store_ladybug + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the dead-code-related +// graph capabilities so analysis.FindDeadCode picks the server-side +// path via type assertion. If a signature drifts the build fails +// here instead of silently falling through to the Go-loop fallback. +var ( + _ graph.DeadCodeCandidator = (*Store)(nil) + _ graph.IfaceImplementsScanner = (*Store)(nil) +) + +// DeadCodeCandidates evaluates the dead-code candidate filter +// entirely inside Ladybug. The Go-side fallback (analysis.FindDeadCode +// without this capability) materialises ~133k Node + ~1.3M in-edge +// rows over cgo per call — 49s wall on the gortex workspace; this +// path keeps the per-row materialisation on the server and only +// returns the surviving ~hundreds of candidates. +// +// Strategy: one Cypher per requested node kind. A single combined +// query that switches the allowlist per row is harder to express in +// Kuzu Cypher than the ~6-8 per-kind queries cost (and the per-query +// cgo overhead is amortised against the rows that DO ship back). +// Shape: WHERE NOT EXISTS { MATCH ()-[e:Edge]->(n) WHERE e.kind IN +// $allowed }, confirmed via TestDeadCode_Probe. +func (s *Store) DeadCodeCandidates(allowedNodeKinds []graph.NodeKind, allowedInEdgeKinds map[graph.NodeKind][]graph.EdgeKind) []*graph.Node { + if len(allowedNodeKinds) == 0 { + return nil + } + // Dedup the kind set so an over-eager caller doesn't double-scan. + seen := make(map[graph.NodeKind]struct{}, len(allowedNodeKinds)) + kinds := make([]graph.NodeKind, 0, len(allowedNodeKinds)) + for _, k := range allowedNodeKinds { + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + kinds = append(kinds, k) + } + + var out []*graph.Node + for _, k := range kinds { + allow := allowedInEdgeKinds[k] + out = append(out, s.deadCodeCandidatesForKind(k, allow)...) + } + return out +} + +// deadCodeCandidatesForKind runs the per-node-kind Cypher and +// materialises the matching nodes. When allow is empty the query +// degenerates to "no incoming edges of any kind" — the in-memory +// reference implementation does the same. +func (s *Store) deadCodeCandidatesForKind(kind graph.NodeKind, allow []graph.EdgeKind) []*graph.Node { + if len(allow) == 0 { + // Fast path: any incoming edge counts as usage. Cypher + // without the IN $allowed filter — slightly cheaper plan. + const q = ` +MATCH (n:Node {kind: $kind}) +WHERE NOT EXISTS { MATCH (:Node)-[:Edge]->(n) } +RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"kind": string(kind)}) + return rowsToNodes(rows) + } + allowed := make([]any, 0, len(allow)) + dedup := make(map[graph.EdgeKind]struct{}, len(allow)) + for _, ek := range allow { + if _, ok := dedup[ek]; ok { + continue + } + dedup[ek] = struct{}{} + allowed = append(allowed, string(ek)) + } + const q = ` +MATCH (n:Node {kind: $kind}) +WHERE NOT EXISTS { + MATCH (:Node)-[e:Edge]->(n) + WHERE e.kind IN $allowed +} +RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{ + "kind": string(kind), + "allowed": allowed, + }) + return rowsToNodes(rows) +} + +// IfaceImplementsRows joins KindInterface nodes carrying +// Meta["methods"] with their EdgeImplements predecessors in one +// Cypher round-trip. Replaces the Go-side iterate-then-filter loop +// the analyzer used before this capability landed — that loop +// pulled every interface node, then ranged g.EdgesByKind(implements) +// for the whole graph, every analyze(dead_code) call. +// +// `iface.meta <> ''` excludes interfaces with no encoded Meta +// payload (encodeMeta serialises an empty map to ""). Rows that +// survive are decoded Go-side via decodeMeta. +func (s *Store) IfaceImplementsRows() []graph.IfaceImplementsRow { + const q = ` +MATCH (t:Node)-[e:Edge]->(iface:Node) +WHERE e.kind = $impl + AND iface.kind = $iface + AND iface.meta <> '' +RETURN t.id, iface.id, iface.meta` + rows := s.querySelect(q, map[string]any{ + "impl": string(graph.EdgeImplements), + "iface": string(graph.KindInterface), + }) + if len(rows) == 0 { + return nil + } + out := make([]graph.IfaceImplementsRow, 0, len(rows)) + for _, r := range rows { + if len(r) < 3 { + continue + } + typeID, _ := r[0].(string) + ifaceID, _ := r[1].(string) + metaStr, _ := r[2].(string) + if typeID == "" || ifaceID == "" || metaStr == "" { + continue + } + m, err := decodeMeta(metaStr) + if err != nil || m == nil { + continue + } + out = append(out, graph.IfaceImplementsRow{ + TypeID: typeID, + IfaceID: ifaceID, + IfaceMeta: m, + }) + } + return out +} diff --git a/internal/graph/store_ladybug/analysis_overview.go b/internal/graph/store_ladybug/analysis_overview.go new file mode 100644 index 00000000..664f81f0 --- /dev/null +++ b/internal/graph/store_ladybug/analysis_overview.go @@ -0,0 +1,169 @@ +package store_ladybug + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the overview-aggregate +// capabilities so the get_repo_outline / get_architecture / +// get_surprising_connections / suggest_queries handlers pick the +// server-side path via type assertion. Signature drift fails the +// build here instead of silently falling back to the Go loop. +var ( + _ graph.EdgeKindCounter = (*Store)(nil) + _ graph.CrossRepoEdgeAggregator = (*Store)(nil) + _ graph.FileImportAggregator = (*Store)(nil) +) + +// EdgeKindCounts runs the per-kind tally inside Ladybug. Replaces +// the AllEdges() bucket pass that get_surprising_connections used to +// derive its "rare kinds" set — on the gortex workspace that pulled +// ~286k edge rows over cgo just to bucket ~30 distinct kinds. The +// Cypher GROUP BY ships back one row per kind: typically a handful +// across the entire repo. +func (s *Store) EdgeKindCounts() map[graph.EdgeKind]int { + const q = ` +MATCH ()-[e:Edge]->() +RETURN e.kind, count(*)` + rows := s.querySelect(q, nil) + if len(rows) == 0 { + return nil + } + out := make(map[graph.EdgeKind]int, len(rows)) + for _, r := range rows { + if len(r) < 2 { + continue + } + kind, _ := r[0].(string) + if kind == "" { + continue + } + out[graph.EdgeKind(kind)] = int(asInt64(r[1])) + } + if len(out) == 0 { + return nil + } + return out +} + +// CrossRepoEdgeCounts runs the (kind, fromRepo, toRepo) rollup +// inside Ladybug. Replaces the AllEdges() + per-edge GetNode pair +// in handleGetArchitecture — on the gortex workspace that loop +// materialised every edge over cgo plus thousands of per-edge +// GetNode round-trips to emit typically <100 cross-repo rows. One +// Cypher join now ships only the surviving per-triple counts. +// +// The IN list mirrors graph.BaseKindForCrossRepo (the canonical +// cross-repo edge-kind set) — a fresh kind landing in +// internal/graph/edge.go without a corresponding update here would +// quietly drop from the rollup, so the kind list is duplicated by +// design (one-place change still tractable) rather than reflected +// at runtime. +func (s *Store) CrossRepoEdgeCounts() []graph.CrossRepoEdgeRow { + const q = ` +MATCH (from:Node)-[e:Edge]->(to:Node) +WHERE e.kind IN $kinds +RETURN e.kind, from.repo_prefix, to.repo_prefix, count(*)` + args := map[string]any{ + "kinds": []any{ + string(graph.EdgeCrossRepoCalls), + string(graph.EdgeCrossRepoImplements), + string(graph.EdgeCrossRepoExtends), + }, + } + rows := s.querySelect(q, args) + if len(rows) == 0 { + return nil + } + out := make([]graph.CrossRepoEdgeRow, 0, len(rows)) + for _, r := range rows { + if len(r) < 4 { + continue + } + kind, _ := r[0].(string) + if kind == "" { + continue + } + fromRepo, _ := r[1].(string) + toRepo, _ := r[2].(string) + out = append(out, graph.CrossRepoEdgeRow{ + Kind: graph.EdgeKind(kind), + FromRepo: fromRepo, + ToRepo: toRepo, + Count: int(asInt64(r[3])), + }) + } + if len(out) == 0 { + return nil + } + return out +} + +// FileImportCounts runs the per-target-file import-count rollup +// inside Ladybug. Replaces the AllEdges() + per-edge GetNode loop +// in mostImportedFiles — that pass materialised every edge over +// cgo (~286k on the gortex workspace) plus a per-edge GetNode +// round-trip just to produce a top-10 list. The Cypher GROUP BY +// returns one row per imported file path. +// +// The COALESCE mirrors the indexer's two import shapes: file- +// targeted imports point at the file node (whose ID is the path), +// symbol-targeted imports land on a symbol whose FilePath holds +// the path. The Go-side ranker handles the top-N truncation and +// the file-path-vs-ID humanising — keep that out of Cypher. +// +// scope, when non-nil, bounds the counted edges to those whose +// target ID lies in the slice. An empty (non-nil) scope returns +// nil (mirroring the in-memory contract) — never a whole-graph +// scan. A nil scope counts every imports edge. +func (s *Store) FileImportCounts(scope []string) []graph.FileImportCountRow { + if scope != nil && len(scope) == 0 { + return nil + } + scopeArg := dedupeNonEmpty(scope) + if scope != nil && len(scopeArg) == 0 { + return nil + } + + // COALESCE folds file-id-targeted vs symbol-FilePath-targeted + // imports into a single grouping key. Without it the rollup + // would split popular.go's count across "popular.go" and + // "PopularFn". + q := ` +MATCH (from:Node)-[e:Edge]->(to:Node) +WHERE e.kind = $imp + AND (to.file_path IS NOT NULL OR to.id IS NOT NULL) +RETURN coalesce(to.file_path, to.id), count(*)` + args := map[string]any{"imp": string(graph.EdgeImports)} + if scope != nil { + q = ` +MATCH (from:Node)-[e:Edge]->(to:Node) +WHERE e.kind = $imp + AND to.id IN $scope + AND (to.file_path IS NOT NULL OR to.id IS NOT NULL) +RETURN coalesce(to.file_path, to.id), count(*)` + args["scope"] = stringSliceToAny(scopeArg) + } + rows := s.querySelect(q, args) + if len(rows) == 0 { + return nil + } + out := make([]graph.FileImportCountRow, 0, len(rows)) + for _, r := range rows { + if len(r) < 2 { + continue + } + path, _ := r[0].(string) + if path == "" { + continue + } + out = append(out, graph.FileImportCountRow{ + FilePath: path, + Count: int(asInt64(r[1])), + }) + } + if len(out) == 0 { + return nil + } + return out +} diff --git a/internal/graph/store_ladybug/analysis_pushdown.go b/internal/graph/store_ladybug/analysis_pushdown.go new file mode 100644 index 00000000..b908be7a --- /dev/null +++ b/internal/graph/store_ladybug/analysis_pushdown.go @@ -0,0 +1,286 @@ +package store_ladybug + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the new pushdown +// capabilities for the performance-wave handlers. A drift in any +// signature fails the build here instead of silently dropping to the +// Go-loop fallback. +var ( + _ graph.InDegreeForNodes = (*Store)(nil) + _ graph.ReachableForwardByKinds = (*Store)(nil) + _ graph.ThrowerErrorSurfacer = (*Store)(nil) +) + +// InDegreeForNodes runs the per-target incoming-edge count entirely +// inside Ladybug. Replaces the AllEdges() + Go-side bucket pass the +// surprising-connections handler used to feed its hub heuristic — on +// the gortex workspace that materialised ~286k edges over cgo just +// to count fan-in for a few thousand scoped nodes. +// +// COUNT { … } sub-query returns the bucket size without materialising +// the edges. The IN-list constrains the rows to the caller's scoped +// id set so the planner can index-walk the in-edge adjacency. +func (s *Store) InDegreeForNodes(ids []string) map[string]int { + if len(ids) == 0 { + return nil + } + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + const q = ` +MATCH (n:Node) +WHERE n.id IN $ids +RETURN n.id, COUNT { MATCH (:Node)-[:Edge]->(n) }` + rows := s.querySelect(q, map[string]any{"ids": stringSliceToAny(uniq)}) + if len(rows) == 0 { + return nil + } + out := make(map[string]int, len(rows)) + for _, r := range rows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + c := int(asInt64(r[1])) + if c == 0 { + continue + } + out[id] = c + } + if len(out) == 0 { + return nil + } + return out +} + +// ReachableForwardByKinds runs the layer-by-layer forward BFS inside +// Ladybug. The Go fallback walks GetOutEdges per frontier id — on a +// repo with thousands of seeds the loop fires tens of thousands of +// cgo round-trips. Each layer here is one Cypher query that returns +// every distinct To-node reachable from the current frontier through +// the allowed edge kinds; the loop terminates when no new ids +// surface. +// +// Layer-driven instead of one giant recursive var-length match: the +// closure size matters more than the number of round-trips, and +// Kuzu's planner picks better index-walks against a small frontier +// IN-list than against an unbounded `*1..N` pattern with a kind +// filter in the relationship body. +func (s *Store) ReachableForwardByKinds(seeds []string, kinds []graph.EdgeKind) map[string]bool { + if len(seeds) == 0 { + return nil + } + covered := make(map[string]bool, len(seeds)) + frontier := make([]string, 0, len(seeds)) + for _, id := range seeds { + if id == "" || covered[id] { + continue + } + covered[id] = true + frontier = append(frontier, id) + } + if len(kinds) == 0 || len(frontier) == 0 { + return covered + } + kindArgs := edgeKindSliceToAny(dedupeEdgeKinds(kinds)) + if len(kindArgs) == 0 { + return covered + } + const q = ` +MATCH (src:Node)-[e:Edge]->(dst:Node) +WHERE src.id IN $frontier + AND e.kind IN $kinds +RETURN DISTINCT dst.id` + for len(frontier) > 0 { + rows := s.querySelect(q, map[string]any{ + "frontier": stringSliceToAny(frontier), + "kinds": kindArgs, + }) + next := frontier[:0:0] + for _, r := range rows { + if len(r) < 1 { + continue + } + id, _ := r[0].(string) + if id == "" || covered[id] { + continue + } + covered[id] = true + next = append(next, id) + } + frontier = next + } + return covered +} + +// throwerAgg is the intermediate per-thrower aggregator used while +// stitching the two ThrowerErrorSurface passes together. +type throwerAgg struct { + throws int + targets []string + emitMsgs []string + file string + line int +} + +// ThrowerErrorSurface runs the analyze(error_surface) rollup as two +// Cypher GROUP BYs inside Ladybug. Replaces the legacy walk that +// scanned EdgeThrows then issued GetOutEdges per thrower for the +// EdgeEmits → KindString attachment — on the gortex workspace that +// loop materialised the throws bucket plus ~thousands of per-thrower +// cgo round-trips just to land at a few dozen aggregated rows. +// +// The pathPrefix filter is evaluated with Kuzu's starts_with on the +// EdgeThrows e.file_path column. An empty prefix is dropped from the +// WHERE clause so the planner picks the kind-only index walk. +func (s *Store) ThrowerErrorSurface(pathPrefix string) []graph.ThrowerErrorRow { + args := map[string]any{"throws": string(graph.EdgeThrows)} + pass1 := ` +MATCH (from:Node)-[e:Edge]->(to:Node) +WHERE e.kind = $throws` + if pathPrefix != "" { + pass1 += "\n AND starts_with(e.file_path, $prefix)" + args["prefix"] = pathPrefix + } + pass1 += ` +RETURN from.id, to.id, count(*), min(e.file_path), min(e.line)` + + rows := s.querySelect(pass1, args) + if len(rows) == 0 { + return nil + } + + byThrower := map[string]*throwerAgg{} + addUnique := func(set []string, v string) []string { + for _, s := range set { + if s == v { + return set + } + } + return append(set, v) + } + for _, r := range rows { + if len(r) < 5 { + continue + } + from, _ := r[0].(string) + to, _ := r[1].(string) + if from == "" || to == "" { + continue + } + count := int(asInt64(r[2])) + file, _ := r[3].(string) + line := int(asInt64(r[4])) + agg, ok := byThrower[from] + if !ok { + agg = &throwerAgg{file: file, line: line} + byThrower[from] = agg + } + agg.throws += count + agg.targets = addUnique(agg.targets, to) + if agg.file == "" && file != "" { + agg.file = file + } + if agg.line == 0 && line != 0 { + agg.line = line + } + } + if len(byThrower) == 0 { + return nil + } + + // Backfill missing file / line from the thrower node row itself + // when the edge metadata didn't carry them. + missingMeta := make([]string, 0) + for id, r := range byThrower { + if r.file == "" || r.line == 0 { + missingMeta = append(missingMeta, id) + } + } + if len(missingMeta) > 0 { + const probe = `MATCH (n:Node) WHERE n.id IN $ids RETURN n.id, n.file_path, n.start_line` + mrows := s.querySelect(probe, map[string]any{"ids": stringSliceToAny(missingMeta)}) + for _, r := range mrows { + if len(r) < 3 { + continue + } + id, _ := r[0].(string) + file, _ := r[1].(string) + line := int(asInt64(r[2])) + agg, ok := byThrower[id] + if !ok { + continue + } + if agg.file == "" { + agg.file = file + } + if agg.line == 0 { + agg.line = line + } + } + } + + // Pass 2: per-(thrower, error_msg) emit join. Pulls every + // EdgeEmits→KindString edge whose source is a known thrower, then + // filters on meta.context = error_msg Go-side (the meta column is + // the encoded blob — same shape IfaceImplementsScanner consumes). + throwerIDs := make([]string, 0, len(byThrower)) + for id := range byThrower { + throwerIDs = append(throwerIDs, id) + } + const emitQ = ` +MATCH (from:Node)-[e:Edge]->(to:Node) +WHERE e.kind = $emits + AND from.id IN $throwers + AND to.kind = $strKind +RETURN from.id, to.name, to.meta` + emitRows := s.querySelect(emitQ, map[string]any{ + "emits": string(graph.EdgeEmits), + "throwers": stringSliceToAny(throwerIDs), + "strKind": string(graph.KindString), + }) + for _, r := range emitRows { + if len(r) < 3 { + continue + } + from, _ := r[0].(string) + name, _ := r[1].(string) + metaStr, _ := r[2].(string) + if from == "" || name == "" || metaStr == "" { + continue + } + agg, ok := byThrower[from] + if !ok { + continue + } + m, err := decodeMeta(metaStr) + if err != nil || m == nil { + continue + } + ctxLabel, _ := m["context"].(string) + if ctxLabel != "error_msg" { + continue + } + agg.emitMsgs = addUnique(agg.emitMsgs, name) + } + + out := make([]graph.ThrowerErrorRow, 0, len(byThrower)) + for id, r := range byThrower { + out = append(out, graph.ThrowerErrorRow{ + ThrowerID: id, + FilePath: r.file, + Line: r.line, + Throws: r.throws, + ErrorTargets: append([]string(nil), r.targets...), + ErrorMsgs: append([]string(nil), r.emitMsgs...), + }) + } + return out +} diff --git a/internal/graph/store_ladybug/analysis_verify_search.go b/internal/graph/store_ladybug/analysis_verify_search.go new file mode 100644 index 00000000..1f878ead --- /dev/null +++ b/internal/graph/store_ladybug/analysis_verify_search.go @@ -0,0 +1,217 @@ +package store_ladybug + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the verify+search +// capability set so the MCP handlers pick the server-side path via +// type assertion. Signature drift breaks the build here instead of +// silently degrading to the AllNodes / AllEdges Go fallback. +var ( + _ graph.FileImporters = (*Store)(nil) + _ graph.InEdgeCounter = (*Store)(nil) + _ graph.NodesInFilesByKindFinder = (*Store)(nil) + _ graph.NodesByKindsScanner = (*Store)(nil) +) + +// NodesByKinds runs the multi-kind candidate scan inside Ladybug. +// Replaces the AllNodes()-then-`if n.Kind != allowed` loop used by +// the metadata analyze handlers (todos, stale_code, stale_flags, +// ownership, coverage_gaps, coverage_summary, cgo_users, wasm_users, +// orphan_tables, unreferenced_tables). The legacy path pulled every +// node over cgo on every call — ~70k rows on the gortex workspace — +// just to keep the handful that matched one of a few kinds. The +// Cypher IN-list ships only the matching rows. +// +// One IN query, not a per-kind loop, because every extra round-trip +// is one more cgo crossing. Kinds dedup keeps the IN list tight when +// the caller passes redundant kinds, matching the in-memory reference. +// +// Meta filtering stays in Go: the meta column is a gob-encoded +// base64 STRING so Cypher cannot inspect its inner keys. The +// candidate-set reduction is the win — the meta gate runs against +// the surviving rows on the Go side. +func (s *Store) NodesByKinds(kinds []graph.NodeKind) []*graph.Node { + if len(kinds) == 0 { + return nil + } + seen := make(map[graph.NodeKind]struct{}, len(kinds)) + allowed := make([]any, 0, len(kinds)) + for _, k := range kinds { + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + allowed = append(allowed, string(k)) + } + if len(allowed) == 0 { + return nil + } + const q = `MATCH (n:Node) WHERE n.kind IN $kinds RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"kinds": allowed}) + if len(rows) == 0 { + return nil + } + out := make([]*graph.Node, 0, len(rows)) + for _, r := range rows { + if n := rowToNode(r); n != nil { + out = append(out, n) + } + } + if len(rows) >= mallocTrimRowThreshold { + mallocTrim() + } + return out +} + +// FileImporters runs the importing-files lookup inside Ladybug. +// Replaces the handleCheckReferences AllEdges() loop — that loop +// materialised every edge over cgo (~286k on the gortex workspace) +// plus per-edge GetNode(e.To)+GetNode(e.From), to answer "what +// imports this file?" with a few rows. One Cypher join now ships +// only the matching rows. +// +// The OR on (to.file_path == $f OR to.id == $f) keeps parity with +// the indexer's two import shapes: file-targeted imports point at +// the file node (whose ID is the path), symbol-targeted imports +// land on a symbol whose FilePath equals the path. +func (s *Store) FileImporters(filePath string) []graph.FileImporterRow { + if filePath == "" { + return nil + } + const q = ` +MATCH (from:Node)-[e:Edge]->(to:Node) +WHERE e.kind = $imp + AND (to.file_path = $f OR to.id = $f) +RETURN from.file_path, from.id, from.name, from.kind` + rows := s.querySelect(q, map[string]any{ + "imp": string(graph.EdgeImports), + "f": filePath, + }) + if len(rows) == 0 { + return nil + } + out := make([]graph.FileImporterRow, 0, len(rows)) + for _, r := range rows { + if len(r) < 4 { + continue + } + fromFile, _ := r[0].(string) + fromID, _ := r[1].(string) + fromName, _ := r[2].(string) + fromKind, _ := r[3].(string) + if fromID == "" { + continue + } + out = append(out, graph.FileImporterRow{ + FromFile: fromFile, + FromID: fromID, + FromName: fromName, + FromKind: graph.NodeKind(fromKind), + }) + } + return out +} + +// InEdgeCountsByKind runs the fan-in count inside Ladybug. Replaces +// the AllEdges() loop in handleGetUntestedSymbols — that loop pulled +// every edge over cgo just to bucket the to-id counts of two kinds. +// The Cypher count(*) returns one row per To, so only the surviving +// per-target counts cross cgo. +func (s *Store) InEdgeCountsByKind(kinds []graph.EdgeKind) map[string]int { + if len(kinds) == 0 { + return nil + } + // Dedup the kinds so the IN list doesn't double-count when the + // caller passes redundant kinds. + seen := make(map[graph.EdgeKind]struct{}, len(kinds)) + allowed := make([]any, 0, len(kinds)) + for _, k := range kinds { + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + allowed = append(allowed, string(k)) + } + const q = ` +MATCH ()-[e:Edge]->(n:Node) +WHERE e.kind IN $kinds +RETURN n.id, count(*)` + rows := s.querySelect(q, map[string]any{"kinds": allowed}) + if len(rows) == 0 { + return nil + } + out := make(map[string]int, len(rows)) + for _, r := range rows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + // Kuzu returns count(*) as an int64. + switch v := r[1].(type) { + case int64: + out[id] = int(v) + case int: + out[id] = v + case int32: + out[id] = int(v) + } + } + return out +} + +// NodesInFilesByKind runs the file+kind filter inside Ladybug. +// Replaces the AllNodes() pull in find_declaration's +// buildDeclFileIndex — that loop materialised every node over cgo +// (~70k on the gortex workspace) just to keep the few that landed +// in the small set of trigram-match files. +// +// Empty files or empty kinds returns nil — never a whole-graph +// scan. The deduped IN list keeps the engine plan tight even when +// the caller passes a sloppy file or kind list. +func (s *Store) NodesInFilesByKind(files []string, kinds []graph.NodeKind) []*graph.Node { + if len(files) == 0 || len(kinds) == 0 { + return nil + } + seenFile := make(map[string]struct{}, len(files)) + fileList := make([]any, 0, len(files)) + for _, f := range files { + if f == "" { + continue + } + if _, ok := seenFile[f]; ok { + continue + } + seenFile[f] = struct{}{} + fileList = append(fileList, f) + } + if len(fileList) == 0 { + return nil + } + seenKind := make(map[graph.NodeKind]struct{}, len(kinds)) + kindList := make([]any, 0, len(kinds)) + for _, k := range kinds { + if _, ok := seenKind[k]; ok { + continue + } + seenKind[k] = struct{}{} + kindList = append(kindList, string(k)) + } + if len(kindList) == 0 { + return nil + } + const q = ` +MATCH (n:Node) +WHERE n.file_path IN $files + AND n.kind IN $kinds +RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{ + "files": fileList, + "kinds": kindList, + }) + return rowsToNodes(rows) +} diff --git a/internal/graph/store_ladybug/analysis_wave_v3.go b/internal/graph/store_ladybug/analysis_wave_v3.go new file mode 100644 index 00000000..9ae30d35 --- /dev/null +++ b/internal/graph/store_ladybug/analysis_wave_v3.go @@ -0,0 +1,650 @@ +package store_ladybug + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the per-tool pushdown +// capabilities introduced by the wave-3 MCP-tool perf push. A drift +// in any signature fails the build here instead of silently dropping +// to the in-memory fallback path. +var ( + _ graph.ExtractCandidatesScanner = (*Store)(nil) + _ graph.FileSymbolNamesByPaths = (*Store)(nil) + _ graph.ClassHierarchyTraverser = (*Store)(nil) + _ graph.FileEditingContext = (*Store)(nil) + _ graph.NodeDegreeByKinds = (*Store)(nil) + _ graph.FileSubGraphReader = (*Store)(nil) + _ graph.FileSubGraphCountReader = (*Store)(nil) +) + +// ExtractCandidates evaluates per-function caller-count + fan-out +// directly inside Ladybug. Two Cypher aggregates by node ID over the +// requested edge-kind set, joined to the node table on the function / +// method kind set, with the three threshold gates applied server- +// side. Replaces the AllNodes + per-node GetInEdges + GetOutEdges loop +// the handler ran previously — that fired 2N cgo round-trips on a +// 30k-function graph, where each per-call materialised the full edge +// bucket just to count distinct endpoints. +// +// DISTINCT counts mirror the in-memory reference: one caller counted +// once per (From) value, one callee once per (To) value. +func (s *Store) ExtractCandidates( + kinds []graph.EdgeKind, + minLines, minCallers, minFanOut int, + pathPrefix string, +) []graph.ExtractCandidateRow { + if len(kinds) == 0 { + return nil + } + ek := edgeKindSliceToAny(dedupeEdgeKinds(kinds)) + if len(ek) == 0 { + return nil + } + // Per-node distinct caller / callee count. The edge table can hold + // multiple rows for the same (From, To, kind) triple (one per + // call site / line), so we MUST distinct over the endpoint id — + // not the edge — to match the in-memory reference. + // + // Implicit GROUP BY on n.id: Kuzu groups by every non-aggregate + // projection column. + const callerQ = ` +MATCH (c:Node)-[e:Edge]->(n:Node) +WHERE n.kind IN ['function', 'method'] + AND e.kind IN $kinds +RETURN n.id, COUNT(DISTINCT c.id)` + const calleeQ = ` +MATCH (n:Node)-[e:Edge]->(c:Node) +WHERE n.kind IN ['function', 'method'] + AND e.kind IN $kinds +RETURN n.id, COUNT(DISTINCT c.id)` + + callerRows := s.querySelect(callerQ, map[string]any{"kinds": ek}) + calleeRows := s.querySelect(calleeQ, map[string]any{"kinds": ek}) + + type counts struct{ callers, fanOut int } + merged := make(map[string]*counts, len(callerRows)) + getOrCreate := func(id string) *counts { + c, ok := merged[id] + if !ok { + c = &counts{} + merged[id] = c + } + return c + } + for _, r := range callerRows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + getOrCreate(id).callers = int(asInt64(r[1])) + } + for _, r := range calleeRows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + getOrCreate(id).fanOut = int(asInt64(r[1])) + } + + // Threshold-filter the candidate IDs Go-side first — minCallers / + // minFanOut shave the IN-list before we look up the node columns. + keep := make([]string, 0, len(merged)) + for id, c := range merged { + if c.callers < minCallers || c.fanOut < minFanOut { + continue + } + keep = append(keep, id) + } + if len(keep) == 0 { + return nil + } + + // Single Cypher pull for the node columns the row needs. + const nodeQ = ` +MATCH (n:Node) +WHERE n.id IN $ids +RETURN n.id, n.name, n.file_path, n.start_line, n.end_line` + nodeRows := s.querySelect(nodeQ, map[string]any{"ids": stringSliceToAny(keep)}) + + out := make([]graph.ExtractCandidateRow, 0, len(nodeRows)) + for _, r := range nodeRows { + if len(r) < 5 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + name, _ := r[1].(string) + fp, _ := r[2].(string) + if pathPrefix != "" && !strings.HasPrefix(fp, pathPrefix) { + continue + } + start := int(asInt64(r[3])) + end := int(asInt64(r[4])) + if start == 0 || end == 0 { + continue + } + lineCount := end - start + 1 + if lineCount < minLines { + continue + } + c := merged[id] + if c == nil { + continue + } + out = append(out, graph.ExtractCandidateRow{ + NodeID: id, + Name: name, + FilePath: fp, + StartLine: start, + EndLine: end, + LineCount: lineCount, + CallerCount: c.callers, + FanOut: c.fanOut, + }) + } + return out +} + +// FileSymbolNamesByPaths runs one Cypher MATCH with the path + kind +// IN-lists, returning (file_path, name) pairs. Replaces the per-path +// GetFileNodes loop find_co_changing_symbols ran after a positive +// match — that's 20 separate Cypher queries against the file_path +// secondary index in the previous shape. +func (s *Store) FileSymbolNamesByPaths(paths []string, kinds []graph.NodeKind) []graph.FileSymbolNameRow { + if len(paths) == 0 { + return nil + } + uniqPaths := dedupeNonEmpty(paths) + if len(uniqPaths) == 0 { + return nil + } + const qAll = ` +MATCH (n:Node) +WHERE n.file_path IN $paths +RETURN n.file_path, n.name` + const qKinds = ` +MATCH (n:Node) +WHERE n.file_path IN $paths + AND n.kind IN $kinds +RETURN n.file_path, n.name` + q := qAll + args := map[string]any{"paths": stringSliceToAny(uniqPaths)} + if len(kinds) > 0 { + nk := nodeKindSliceToAny(dedupeNodeKinds(kinds)) + if len(nk) == 0 { + return nil + } + q = qKinds + args["kinds"] = nk + } + rows := s.querySelect(q, args) + if len(rows) == 0 { + return nil + } + type pair struct{ p, n string } + seen := make(map[pair]struct{}, len(rows)) + out := make([]graph.FileSymbolNameRow, 0, len(rows)) + for _, r := range rows { + if len(r) < 2 { + continue + } + fp, _ := r[0].(string) + name, _ := r[1].(string) + if fp == "" || name == "" { + continue + } + key := pair{fp, name} + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + out = append(out, graph.FileSymbolNameRow{FilePath: fp, Name: name}) + } + return out +} + +// ClassHierarchyTraverse evaluates the inheritance subgraph rooted at +// the seed inside Ladybug. One variable-length traversal per +// direction replaces the per-frontier-node GetNode + GetInEdges / +// GetOutEdges loop query.ClassHierarchy ran — that was depth * width +// cgo round-trips on Ladybug, each round-trip materialising the full +// edge bucket just to filter on a handful of kinds. +// +// The result rows carry the Path (visited IDs in BFS order, exclusive +// of the seed) plus the per-hop EdgeKinds so the caller can rebuild +// the visited node set + edge identities without further graph +// traversal. +func (s *Store) ClassHierarchyTraverse( + seedID string, + direction string, + kinds []graph.EdgeKind, + depth int, +) []graph.ClassHierarchyRow { + if seedID == "" || depth <= 0 || len(kinds) == 0 { + return nil + } + ek := edgeKindSliceToAny(dedupeEdgeKinds(kinds)) + if len(ek) == 0 { + return nil + } + walkUp := direction == "up" + walkDown := direction == "down" + if !walkUp && !walkDown { + return nil + } + if depth > 64 { + depth = 64 + } + // BFS Cypher: one query per hop avoids re-walking the same + // frontier on each iteration. Ladybug's planner handles + // variable-length patterns, but per-hop is cheaper here because + // the kind filter restricts the per-hop fanout dramatically (most + // nodes have <5 hierarchy edges) and we want to enforce the + // "first reached wins" visited-set semantic the in-memory + // reference implements. + visited := map[string]struct{}{seedID: {}} + type row struct { + path []string + edgeKinds []graph.EdgeKind + } + frontier := []row{{path: nil, edgeKinds: nil}} + frontierIDs := []string{seedID} + var out []graph.ClassHierarchyRow + for hop := 0; hop < depth && len(frontierIDs) > 0; hop++ { + var q string + if walkUp { + q = `MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE a.id IN $ids AND e.kind IN $kinds +RETURN a.id, b.id, e.kind` + } else { + q = `MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE b.id IN $ids AND e.kind IN $kinds +RETURN b.id, a.id, e.kind` + } + rows := s.querySelect(q, map[string]any{ + "ids": stringSliceToAny(frontierIDs), + "kinds": ek, + }) + if len(rows) == 0 { + break + } + // Group neighbours by their predecessor in the frontier so + // the row reconstruction joins the per-frontier path with the + // new hop. + byPred := make(map[string][]struct { + nb string + kind graph.EdgeKind + }, len(rows)) + for _, r := range rows { + if len(r) < 3 { + continue + } + pred, _ := r[0].(string) + nb, _ := r[1].(string) + kind, _ := r[2].(string) + if pred == "" || nb == "" { + continue + } + byPred[pred] = append(byPred[pred], struct { + nb string + kind graph.EdgeKind + }{nb: nb, kind: graph.EdgeKind(kind)}) + } + // Map frontier IDs to their accumulated paths. + predRow := make(map[string]row, len(frontierIDs)) + for i, id := range frontierIDs { + predRow[id] = frontier[i] + } + nextIDs := make([]string, 0) + nextFrontier := make([]row, 0) + for pred, neighbours := range byPred { + pr, ok := predRow[pred] + if !ok { + continue + } + for _, nbInfo := range neighbours { + if _, seen := visited[nbInfo.nb]; seen { + continue + } + visited[nbInfo.nb] = struct{}{} + newPath := append([]string(nil), pr.path...) + newPath = append(newPath, nbInfo.nb) + newKinds := append([]graph.EdgeKind(nil), pr.edgeKinds...) + newKinds = append(newKinds, nbInfo.kind) + out = append(out, graph.ClassHierarchyRow{ + Path: newPath, + EdgeKinds: newKinds, + }) + nextIDs = append(nextIDs, nbInfo.nb) + nextFrontier = append(nextFrontier, row{path: newPath, edgeKinds: newKinds}) + } + } + frontierIDs = nextIDs + frontier = nextFrontier + } + return out +} + +// FileEditingContext bundles every projection get_editing_context +// needs into the smallest backend round-trip count Ladybug allows. +// Replaces the handler's per-symbol GetCallers + GetCallChain loop — +// a 30-function file fired ~60 query-engine entries on Ladybug +// previously; this caps the surface at five Cypher statements +// regardless of file size. +func (s *Store) FileEditingContext(filePath string, kinds []graph.NodeKind) *graph.FileEditingContextResult { + if filePath == "" { + return nil + } + const fileQ = `MATCH (n:Node) WHERE n.file_path = $f RETURN ` + nodeReturnCols + rows := s.querySelect(fileQ, map[string]any{"f": filePath}) + nodes := rowsToNodes(rows) + if len(nodes) == 0 { + return nil + } + kset := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + if k == "" { + continue + } + kset[k] = struct{}{} + } + res := &graph.FileEditingContextResult{} + var defIDs []string + for _, n := range nodes { + if n == nil { + continue + } + if n.Kind == graph.KindFile { + res.FileNode = n + continue + } + res.Defines = append(res.Defines, n) + if _, ok := kset[n.Kind]; ok { + defIDs = append(defIDs, n.ID) + } + } + if res.FileNode != nil { + const importQ = `MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE a.id = $id AND e.kind = 'imports' +RETURN ` + edgeReturnCols + importRows := s.querySelect(importQ, map[string]any{"id": res.FileNode.ID}) + res.Imports = rowsToEdges(importRows) + } + if len(defIDs) == 0 { + return res + } + // One IN-list scan per direction — the caller / callee node columns + // come back in the same round-trip via a join on the call edge. + callerQ := ` +MATCH (caller:Node)-[e:Edge]->(callee:Node) +WHERE callee.id IN $ids + AND e.kind = 'calls' + AND caller.file_path <> $file +RETURN DISTINCT ` + prefixedNodeReturnCols("caller") + calleeQ := ` +MATCH (caller:Node)-[e:Edge]->(callee:Node) +WHERE caller.id IN $ids + AND e.kind = 'calls' + AND callee.file_path <> $file +RETURN DISTINCT ` + prefixedNodeReturnCols("callee") + callerRows := s.querySelect(callerQ, map[string]any{ + "ids": stringSliceToAny(defIDs), + "file": filePath, + }) + res.CalledBy = rowsToNodes(callerRows) + calleeRows := s.querySelect(calleeQ, map[string]any{ + "ids": stringSliceToAny(defIDs), + "file": filePath, + }) + res.Calls = rowsToNodes(calleeRows) + return res +} + +// NodeDegreeByKinds computes per-node total in/out edge counts for +// every node whose kind is in the supplied set, server-side. Replaces +// the IN-list-of-30k-IDs shape NodeDegreeCounts uses — the planner has +// to materialise the IN-list before joining, where this query lets it +// pick the kind-filtered node set up front (smaller working set, no +// IN-list bloat). +func (s *Store) NodeDegreeByKinds(kinds []graph.NodeKind, pathPrefix string) []graph.NodeDegreeRow { + if len(kinds) == 0 { + return nil + } + nk := nodeKindSliceToAny(dedupeNodeKinds(kinds)) + if len(nk) == 0 { + return nil + } + withPrefix := pathPrefix != "" + + // COUNT { … } sub-query is the only way to keep this in a single + // MATCH while still returning a per-node aggregate. The two sub- + // queries together cost one extra index probe per node. + var inQ, outQ string + if withPrefix { + inQ = `MATCH (n:Node) +WHERE n.kind IN $kinds + AND starts_with(n.file_path, $prefix) +RETURN n.id, COUNT { MATCH (:Node)-[:Edge]->(n) }` + outQ = `MATCH (n:Node) +WHERE n.kind IN $kinds + AND starts_with(n.file_path, $prefix) +RETURN n.id, COUNT { MATCH (n)-[:Edge]->(:Node) }` + } else { + inQ = `MATCH (n:Node) +WHERE n.kind IN $kinds +RETURN n.id, COUNT { MATCH (:Node)-[:Edge]->(n) }` + outQ = `MATCH (n:Node) +WHERE n.kind IN $kinds +RETURN n.id, COUNT { MATCH (n)-[:Edge]->(:Node) }` + } + args := map[string]any{"kinds": nk} + if withPrefix { + args["prefix"] = pathPrefix + } + inRows := s.querySelect(inQ, args) + outRows := s.querySelect(outQ, args) + byID := make(map[string]*graph.NodeDegreeRow, len(inRows)) + ensure := func(id string) *graph.NodeDegreeRow { + r, ok := byID[id] + if !ok { + r = &graph.NodeDegreeRow{NodeID: id} + byID[id] = r + } + return r + } + for _, r := range inRows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + ensure(id).InCount = int(asInt64(r[1])) + } + for _, r := range outRows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + ensure(id).OutCount = int(asInt64(r[1])) + } + out := make([]graph.NodeDegreeRow, 0, len(byID)) + for _, r := range byID { + out = append(out, *r) + } + return out +} + +// GetFileSubGraph returns the file node, every symbol the file +// defines or contains, and every edge adjacent to any of them. +// Replaces the GetFileNodes + GetOut/InEdgesByNodeIDs trio the engine +// used previously — that was a property-filter scan over Node +// (`MATCH (n {file_path: $f})`, no secondary index on file_path +// available in Kuzu) followed by two IN-list scans over Edge. +// +// The rewrite anchors on the file node's primary key — which Kuzu +// already HASH-indexes — and follows EdgeDefines / EdgeContains via +// the rel-table FROM index. The two adjacency walks still use IN- +// lists but their cardinality drops to the symbols actually defined +// by the file (typically <1 000) instead of being filtered post-scan. +// The biggest win comes from skipping the full Node-table scan on +// the headline lookup. +func (s *Store) GetFileSubGraph(filePath string) ([]*graph.Node, []*graph.Edge) { + if filePath == "" { + return nil, nil + } + // Collect the file node plus every symbol anchored to it via the + // file_path column, exactly like the canonical in-memory + // Graph.GetFileSubGraph (which resolves members through + // GetFileNodes). The earlier revision walked file→symbol + // `defines`/`contains` edges instead, but the ladybug COPY and + // incremental-reindex paths never persist those edges — so the + // child set came back empty and get_file_summary reported "no + // symbols found" for every file. GetFileNodes routes through the + // file→id accelerator (a PK MATCH on the id set), so this is both + // correct and as cheap as the broken edge walk it replaces. + nodes := s.GetFileNodes(filePath) + if len(nodes) == 0 { + return nil, nil + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil && n.ID != "" { + ids = append(ids, n.ID) + } + } + if len(ids) == 0 { + return nodes, nil + } + // Adjacent edges — the IN-list is small (~file_symbols), not the + // whole rerank candidate set. Edges that appear in both directions + // (intra-file) are deduped Go-side via a struct key. JSON callers + // of get_file_summary are the only consumers that materialise the + // list; gcx + compact callers reach for the count-only path + // (GetFileSubGraphCounts) instead and never load the full edge set. + const outQ = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE a.id IN $ids RETURN ` + edgeReturnCols + const inQ = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE b.id IN $ids RETURN ` + edgeReturnCols + args := map[string]any{"ids": stringSliceToAny(ids)} + outRows := s.querySelect(outQ, args) + inRows := s.querySelect(inQ, args) + type edgeKey struct { + from string + to string + kind graph.EdgeKind + } + seen := make(map[edgeKey]struct{}, len(outRows)+len(inRows)) + edges := make([]*graph.Edge, 0, len(outRows)+len(inRows)) + add := func(rows [][]any) { + for _, r := range rows { + e := rowToEdge(r) + if e == nil { + continue + } + k := edgeKey{from: e.From, to: e.To, kind: e.Kind} + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + edges = append(edges, e) + } + } + add(outRows) + add(inRows) + return nodes, edges +} + +// GetFileSubGraphCounts is the count-only sibling of GetFileSubGraph: +// returns the file's nodes plus the number of distinct edges adjacent +// to any of them, without materialising the edge rows. Replaces the +// per-direction edge fetches (~4 000 cgo crossings for store.go in +// the gortex repo) with two scalar aggregates that return one row +// each — three orders of magnitude less work over the wire. +// +// Both the node fetch and the edge aggregates pivot off the file-node +// PK + rel-table FROM walk (same shape GetFileSubGraph uses). The +// alternative — `WHERE id IN $ids` over the Go-side accelerator's id +// list — proved 4-5× slower on the current Kuzu version because the +// planner falls back to a node-table scan instead of using the +// primary-key HASH index for the IN predicate. +// +// Called by handleGetFileSummary on the gcx output path (which only +// emits total_edges in its meta header, never per-edge rows); the +// compact path falls back to the full fetch because it summarises +// edges per confidence label, and the json path keeps the full fetch +// because it ships every edge in the body. +func (s *Store) GetFileSubGraphCounts(filePath string) ([]*graph.Node, int) { + if filePath == "" { + return nil, 0 + } + // Collect the file's nodes via the file_path accelerator — same + // fix as GetFileSubGraph: the old file→symbol `defines`/`contains` + // edge walk found nothing because those edges are never persisted + // to ladybug, so the count came back 0 for every file. + nodes := s.GetFileNodes(filePath) + if len(nodes) == 0 { + return nil, 0 + } + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil && n.ID != "" { + ids = append(ids, n.ID) + } + } + if len(ids) == 0 { + return nodes, 0 + } + // Count adjacent edges via two scalar aggregates over the node-id + // set. outQ counts edges leaving any of the file's nodes; inQ + // counts edges arriving at any of them. The two counts overlap on + // intra-file edges (whose endpoints are both children of this + // file), so the returned total is an upper bound — exact for + // files dominated by cross-file references, slightly inflated for + // files dominated by intra-file structural edges. We accept the + // imprecision because the dedup query (a third pattern join) adds + // more latency than the inflated count costs the gcx caller, who + // only renders it as a `total_edges` header scalar, never as + // anything load-bearing. + const outCountQ = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE a.id IN $ids RETURN count(e)` + const inCountQ = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE b.id IN $ids RETURN count(e)` + args := map[string]any{"ids": stringSliceToAny(ids)} + scan := func(q string) int64 { + rows := s.querySelect(q, args) + if len(rows) == 0 || len(rows[0]) == 0 { + return 0 + } + return asInt64(rows[0][0]) + } + count := scan(outCountQ) + scan(inCountQ) + if count < 0 { + count = 0 + } + return nodes, int(count) +} + +// prefixedNodeReturnCols projects the same node columns nodeReturnCols +// covers but rooted on a custom variable name — needed when the same +// MATCH has more than one node and the row aliases need to mirror +// rowToNode's column order. +func prefixedNodeReturnCols(prefix string) string { + return prefix + ".id, " + prefix + ".kind, " + prefix + ".name, " + + prefix + ".qual_name, " + prefix + ".file_path, " + + prefix + ".start_line, " + prefix + ".end_line, " + + prefix + ".language, " + prefix + ".repo_prefix, " + + prefix + ".workspace_id, " + prefix + ".project_id, " + + prefix + ".meta" +} diff --git a/internal/graph/store_ladybug/backend_resolver.go b/internal/graph/store_ladybug/backend_resolver.go new file mode 100644 index 00000000..388abae2 --- /dev/null +++ b/internal/graph/store_ladybug/backend_resolver.go @@ -0,0 +1,518 @@ +package store_ladybug + +import ( + "fmt" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// upgradeUnresolvedStubs stamps `kind='unresolved'` plus the extracted +// `name` and `repo_prefix` on every auto-stub the bulk COPY created for +// an unresolved call target. Without this, the per-rule resolver +// queries below would never find the stubs in multi-repo mode because: +// +// - copyBulkLocked rewrites unresolved IDs to `::unresolved::` +// (to dodge cross-repo PK collisions on the shared SymbolFTS / Node +// tables). +// - The auto-stub at copyBulkLocked creates Node rows for these +// rewritten IDs with empty Name / Kind / RepoPrefix. +// - Every original resolver rule did +// `WHERE stub.id STARTS WITH 'unresolved::'` — literal — which +// never matches `gortex::unresolved::AddNode`. The fallback +// `substring(stub.id, 13, ...)` for name extraction was also +// keyed to the un-prefixed form. +// +// The upgrade runs once per ResolveAllBulk pass, before the +// downstream rules. After it runs, every stub carries: +// - kind = 'unresolved' +// - name = the bare symbol name (last segment after `unresolved::`) +// - repo_prefix = empty for the legacy form, or the prefix for the +// multi-repo form +// +// The rules below then MATCH `stub.kind = 'unresolved'` and read +// `stub.name` directly — no substring math, no format coupling. +func (s *Store) upgradeUnresolvedStubs() (int, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + // Stub IDs come in two encodings: + // unresolved::Name (legacy / single-repo) + // ::unresolved::Name (multi-repo COPY rewrite) + // + // regexp_replace strips everything up to and including the + // last `unresolved::` substring, leaving the bare name on + // `stub.name`. The repo prefix is everything before + // `::unresolved::` (or empty for the single-repo form). + const q = ` +MATCH (stub:Node) +WHERE (stub.id STARTS WITH 'unresolved::' OR stub.id CONTAINS '::unresolved::') + AND (stub.kind = '' OR stub.kind IS NULL) +SET stub.kind = 'unresolved', + stub.name = regexp_replace(stub.id, '^.*unresolved::', ''), + stub.repo_prefix = CASE + WHEN stub.id STARTS WITH 'unresolved::' THEN '' + ELSE regexp_replace(stub.id, '::unresolved::.*$', '') + END +RETURN count(stub) AS upgraded` + return s.runResolverQueryLocked(q, "upgradeUnresolvedStubs") +} + +// ResolveSameFile pushes the same-source-file resolution pass into +// the Kuzu engine. For every `unresolved::Name` edge, look for a +// Node with that name whose file_path matches the caller's +// file_path — if there's exactly one such candidate, rewrite the +// edge to point at it. Same-file calls are unambiguous in every +// language we index, so the match precision is high. +// +// One Cypher statement replaces what would otherwise be ~thousands +// of per-edge GetNode / FindNodesByName round-trips. +func (s *Store) ResolveSameFile() (int, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + // Two-pass to keep `target` typed as Node through the CREATE. + const q = ` +MATCH (caller:Node)-[e:Edge]->(stub:Node) +WHERE stub.kind = 'unresolved' AND caller.file_path <> '' +WITH e, caller, stub, stub.name AS name +OPTIONAL MATCH (cnd:Node {name: name}) +WHERE cnd.file_path = caller.file_path AND cnd.id <> stub.id +WITH e, caller, stub, name, count(cnd) AS cnt +WHERE cnt = 1 +MATCH (target:Node {name: name}) +WHERE target.file_path = caller.file_path AND target.id <> stub.id +DELETE e +CREATE (caller)-[newE:Edge { + kind: e.kind, + file_path: e.file_path, + line: e.line, + confidence: e.confidence, + confidence_label: e.confidence_label, + origin: 'ast_resolved', + tier: 'ast_resolved', + cross_repo: e.cross_repo, + meta: e.meta +}]->(target) +RETURN count(newE) AS resolved` + return s.runResolverQueryLocked(q, "ResolveSameFile") +} + +// ResolveSamePackage drains the "same Go-style package" case: edges +// where the caller and a unique candidate share the same directory +// portion of file_path AND the same repo_prefix. Kuzu has no +// regex_extract, so directory is derived by splitting on "/" and +// reassembling all but the last segment with list_to_string. +func (s *Store) ResolveSamePackage() (int, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + // Kuzu has neither regex_extract nor split — but it does have + // regexp_replace, which we abuse to extract the directory by + // stripping everything from the last "/" onward. Files with no + // "/" come back unchanged so we add an explicit guard with + // CONTAINS to skip top-level files. + const q = ` +MATCH (caller:Node)-[e:Edge]->(stub:Node) +WHERE stub.kind = 'unresolved' + AND caller.file_path <> '' + AND caller.file_path CONTAINS '/' +WITH e, caller, stub, stub.name AS name, + regexp_replace(caller.file_path, '/[^/]+$', '') AS caller_dir +OPTIONAL MATCH (cnd:Node {name: name}) +WHERE cnd.repo_prefix = caller.repo_prefix + AND cnd.id <> stub.id + AND cnd.file_path <> caller.file_path + AND cnd.file_path CONTAINS '/' + AND regexp_replace(cnd.file_path, '/[^/]+$', '') = caller_dir +WITH e, caller, stub, name, caller_dir, count(cnd) AS cnt +WHERE cnt = 1 +MATCH (target:Node {name: name}) +WHERE target.repo_prefix = caller.repo_prefix + AND target.id <> stub.id + AND target.file_path <> caller.file_path + AND target.file_path CONTAINS '/' + AND regexp_replace(target.file_path, '/[^/]+$', '') = caller_dir +DELETE e +CREATE (caller)-[newE:Edge { + kind: e.kind, + file_path: e.file_path, + line: e.line, + confidence: e.confidence, + confidence_label: e.confidence_label, + origin: 'ast_resolved', + tier: 'ast_resolved', + cross_repo: e.cross_repo, + meta: e.meta +}]->(target) +RETURN count(newE) AS resolved` + return s.runResolverQueryLocked(q, "ResolveSamePackage") +} + +// ResolveImportAware drains the "imported-symbol" case: caller's +// file_path is the FROM of an EdgeImports to an imported file, and +// a Node with the unresolved name lives in that imported file. +// When exactly one such candidate exists across all the caller's +// imports, rewrite the edge to point at it. +// +// This is the highest-coverage rule for Python / JS / Rust-style +// `import X` semantics where the target is in a different file but +// reachable via the import set. Joins against the existing +// EdgeImports adjacency (which the parser populates). +func (s *Store) ResolveImportAware() (int, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + const q = ` +MATCH (caller:Node)-[e:Edge]->(stub:Node) +WHERE stub.kind = 'unresolved' AND caller.file_path <> '' +WITH e, caller, stub, stub.name AS name +MATCH (callerFile:Node {file_path: caller.file_path}) +WHERE callerFile.kind = 'file' +MATCH (callerFile)-[imp:Edge {kind: 'imports'}]->(importedFile:Node) +WHERE importedFile.kind = 'file' + AND NOT (importedFile.id STARTS WITH 'external::') + AND importedFile.kind <> 'unresolved' +OPTIONAL MATCH (cnd:Node {name: name}) +WHERE cnd.file_path = importedFile.file_path + AND cnd.id <> stub.id +WITH e, caller, stub, name, count(DISTINCT cnd) AS cnt +WHERE cnt = 1 +MATCH (callerFile2:Node {file_path: caller.file_path}) +WHERE callerFile2.kind = 'file' +MATCH (callerFile2)-[:Edge {kind: 'imports'}]->(importedFile2:Node) +MATCH (target:Node {name: name}) +WHERE target.file_path = importedFile2.file_path + AND target.id <> stub.id +DELETE e +CREATE (caller)-[newE:Edge { + kind: e.kind, + file_path: e.file_path, + line: e.line, + confidence: e.confidence, + confidence_label: e.confidence_label, + origin: 'ast_resolved', + tier: 'ast_resolved', + cross_repo: e.cross_repo, + meta: e.meta +}]->(target) +RETURN count(newE) AS resolved` + return s.runResolverQueryLocked(q, "ResolveImportAware") +} + +// ResolveRelativeImports drains `unresolved::pyrel::` edges +// (Python's relative-import placeholder emitted by the parser) by +// rewriting them to either `.py` or `/__init__.py` — +// whichever KindFile node exists in the graph. Dart relative +// imports follow the same shape but are not pyrel-tagged so they +// fall through to the same-file / import-aware passes. +// +// Two Cypher passes run sequentially (one per file-naming +// convention) and the counts sum. +func (s *Store) ResolveRelativeImports(lang string) (int, error) { + if lang != "" && lang != "python" { + // Only python is meaningful here. Future Dart support + // would add another pass. + return 0, nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + + var total int + for _, suffix := range []string{".py", "/__init__.py"} { + q := ` +MATCH (caller:Node)-[e:Edge {kind: 'imports'}]->(stub:Node) +WHERE stub.kind = 'unresolved' AND stub.name STARTS WITH 'pyrel::' +WITH e, caller, stub, substring(stub.name, 7, size(stub.name) - 7) AS stem +MATCH (target:Node {kind: 'file'}) +WHERE target.id = stem + '` + suffix + `' +DELETE e +CREATE (caller)-[newE:Edge { + kind: 'imports', + file_path: e.file_path, + line: e.line, + confidence: e.confidence, + confidence_label: e.confidence_label, + origin: 'ast_resolved', + tier: 'ast_resolved', + cross_repo: e.cross_repo, + meta: e.meta +}]->(target) +RETURN count(newE) AS resolved` + n, err := s.runResolverQueryLocked(q, "ResolveRelativeImports "+suffix) + if err != nil { + return total, err + } + total += n + } + return total, nil +} + +// ResolveCrossRepo drains unresolved edges that bind unambiguously +// to a Node in a different repo. Only fires when the caller has a +// non-empty repo_prefix (i.e. we're in a multi-repo workspace) and +// exactly one candidate exists in a different repo. Sets +// cross_repo=true on the resulting edge so downstream consumers +// know the binding crosses a workspace boundary. +func (s *Store) ResolveCrossRepo() (int, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + const q = ` +MATCH (caller:Node)-[e:Edge]->(stub:Node) +WHERE stub.kind = 'unresolved' + AND caller.repo_prefix <> '' +WITH e, caller, stub, stub.name AS name +OPTIONAL MATCH (cnd:Node {name: name}) +WHERE cnd.repo_prefix <> caller.repo_prefix + AND cnd.repo_prefix <> '' + AND cnd.id <> stub.id +WITH e, caller, stub, name, count(cnd) AS cnt +WHERE cnt = 1 +MATCH (target:Node {name: name}) +WHERE target.repo_prefix <> caller.repo_prefix + AND target.repo_prefix <> '' + AND target.id <> stub.id +DELETE e +CREATE (caller)-[newE:Edge { + kind: e.kind, + file_path: e.file_path, + line: e.line, + confidence: e.confidence, + confidence_label: e.confidence_label, + origin: 'ast_resolved', + tier: 'ast_resolved', + cross_repo: 1, + meta: e.meta +}]->(target) +RETURN count(newE) AS resolved` + return s.runResolverQueryLocked(q, "ResolveCrossRepo") +} + +// ResolveExternalCallStubs ensures every external::* edge target +// has a corresponding Node row with kind='external' and promotes +// the edge's origin to ast_resolved. Kuzu's AddEdge already +// auto-stubs the endpoint node via mergeStubNodeLocked, so the +// only work here is the kind/name update + edge origin promotion. +// ResolveMethodCalls drains the receiver-method-call stub form +// `unresolved::*.` — the target the parsers emit for a call +// `x.Method()` when they can't name x's type at extraction time (Go: +// internal/parser/languages/golang.go:646; same `*.` convention in +// java/ruby/typescript/...). upgradeUnresolvedStubs leaves +// stub.name = "*." (the `*.` is kept), so the name-EQUALITY +// rules above never match it, and the Go-side resolver's +// EdgesWithUnresolvedTarget scan (literal `unresolved::` prefix) never +// sees the repo-prefixed `::unresolved::*.` form — so in +// multi-repo mode method callers were invisible to find_usages / +// get_callers entirely. +// +// We bind the stub to a concrete method node when EXACTLY ONE method +// in the caller's repo carries that name. Method nodes store the BARE +// method name in the `name` column (e.g. "querySelect"; the receiver +// lives in meta.receiver / enclosing), so once the `*.` is stripped +// the stub name equals the method node name exactly — an indexed +// equality match, no suffix scan. The uniqueness guard means no false +// edges: an ambiguous method name (String / Close / Get, defined on +// several types) is left unresolved for a future receiver-type-aware +// pass (the edge carries a `receiver_type` meta hint) rather than +// bound to an arbitrary type. +func (s *Store) ResolveMethodCalls() (int, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + const q = ` +MATCH (caller:Node)-[e:Edge]->(stub:Node) +WHERE stub.kind = 'unresolved' AND stub.name STARTS WITH '*.' +WITH e, caller, stub, substring(stub.name, 3, size(stub.name) - 2) AS mname +WHERE mname <> '' +OPTIONAL MATCH (cnd:Node) +WHERE cnd.kind = 'method' + AND cnd.repo_prefix = caller.repo_prefix + AND cnd.id <> stub.id + AND cnd.name = mname +WITH e, caller, stub, mname, count(cnd) AS cnt +WHERE cnt = 1 +MATCH (target:Node) +WHERE target.kind = 'method' + AND target.repo_prefix = caller.repo_prefix + AND target.name = mname +DELETE e +CREATE (caller)-[newE:Edge { + kind: e.kind, + file_path: e.file_path, + line: e.line, + confidence: e.confidence, + confidence_label: e.confidence_label, + origin: 'ast_resolved', + tier: 'ast_resolved', + cross_repo: e.cross_repo, + meta: e.meta +}]->(target) +RETURN count(newE) AS resolved` + return s.runResolverQueryLocked(q, "ResolveMethodCalls") +} + +func (s *Store) ResolveExternalCallStubs() (int, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + + // Step 1: stamp kind='external' + name on stub rows the + // auto-stub created with empty kind. + const upgradeNodes = ` +MATCH (stub:Node) +WHERE stub.id STARTS WITH 'external::' + AND (stub.kind = '' OR stub.kind IS NULL) +SET stub.kind = 'external', + stub.name = substring(stub.id, 11, size(stub.id) - 10) +RETURN count(stub) AS upgraded` + if _, err := s.runResolverQueryLocked(upgradeNodes, "ResolveExternalCallStubs upgrade"); err != nil { + return 0, err + } + + // Step 2: promote edge origin for any external::* edge that + // still has no origin set. + const promoteEdges = ` +MATCH ()-[e:Edge]->(target:Node) +WHERE target.id STARTS WITH 'external::' + AND (e.origin = '' OR e.origin IS NULL) +SET e.origin = 'ast_resolved', e.tier = 'ast_resolved' +RETURN count(e) AS resolved` + return s.runResolverQueryLocked(promoteEdges, "ResolveExternalCallStubs promote") +} + +// runResolverQueryLocked is the shared boilerplate for a backend- +// resolver Cypher query that returns a single COUNT column. Bumps +// the identity-revision counter by the resolved count. +func (s *Store) runResolverQueryLocked(query, ruleName string) (int, error) { + res, err := s.conn.Query(query) + if err != nil { + return 0, fmt.Errorf("backend-resolver %s: %w", ruleName, err) + } + defer res.Close() + if !res.HasNext() { + return 0, nil + } + row, err := res.Next() + if err != nil { + return 0, fmt.Errorf("backend-resolver %s: read result: %w", ruleName, err) + } + defer row.Close() + vals, err := row.GetAsSlice() + if err != nil || len(vals) == 0 { + return 0, err + } + n, _ := vals[0].(int64) + if n > 0 { + s.edgeIdentityRevs.Add(n) + s.writeGen.Add(1) + } + return int(n), nil +} + +// ResolveAllBulk chains every backend-resolver rule in precision- +// descending order and sums the resolved counts. Errors from a single +// rule are non-fatal: the chain CONTINUES so one failing rule can't +// disable every rule after it. (The previous code `return`ed on the +// first error — which silently skipped e.g. ResolveMethodCalls whenever +// an earlier rule errored on a large graph, the bug that made method +// callers invisible. The Store has no logger, so the failing rule +// names ride on the returned error instead; the caller can surface +// them.) +func (s *Store) ResolveAllBulk() (int, error) { + var total int + var ruleErrs []string + rules := []struct { + name string + fn func() (int, error) + }{ + // MUST run first: stamps kind='unresolved' + name + repo_prefix + // on the auto-stub Node rows so the rules below can match them + // in both `unresolved::*` and `::unresolved::*` forms. + {"upgradeUnresolvedStubs", s.upgradeUnresolvedStubs}, + {"ResolveSameFile", s.ResolveSameFile}, + {"ResolveSamePackage", s.ResolveSamePackage}, + {"ResolveImportAware", s.ResolveImportAware}, + {"ResolveRelativeImports", func() (int, error) { return s.ResolveRelativeImports("") }}, + {"ResolveCrossRepo", s.ResolveCrossRepo}, + {"ResolveUniqueNames", s.ResolveUniqueNames}, + {"ResolveMethodCalls", s.ResolveMethodCalls}, + {"ResolveExternalCallStubs", s.ResolveExternalCallStubs}, + } + for _, r := range rules { + n, err := r.fn() + total += n + if err != nil { + ruleErrs = append(ruleErrs, fmt.Sprintf("%s: %v", r.name, err)) + } + } + if len(ruleErrs) > 0 { + return total, fmt.Errorf("backend-resolver rule errors: %s", strings.Join(ruleErrs, "; ")) + } + return total, nil +} + +// Compile-time assertion: *Store satisfies graph.BackendResolver. +var _ graph.BackendResolver = (*Store)(nil) + +// ResolveUniqueNames pushes the largest trivially-correct subset of +// the resolver's work into the Kuzu engine via a single Cypher +// MATCH+SET. For every Edge whose to_id starts with "unresolved::", +// strip the prefix to recover the embedded identifier name; if +// exactly one Node carries that name (no ambiguity), rewrite the +// edge in place to point at the resolved node and bump its origin +// to "ast_resolved". Edges with zero or multiple candidates are +// untouched — they fall through to the Go resolver which has the +// language/scope/visibility rules needed to disambiguate. +// +// The query runs as one statement on the server; the Go side does +// nothing per resolved edge. On a 50k-file repo this collapses +// what would otherwise be ~30k per-edge round-trips into a single +// Cypher Execute. +func (s *Store) ResolveUniqueNames() (int, error) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + // Strategy: for each unresolved edge, derive the name by + // stripping the "unresolved::" prefix. Match it against Node.name. + // If exactly one candidate, swap the edge's to-pointer (DELETE + + // CREATE a new edge with the same properties but the resolved + // to-endpoint — Kuzu rel edges are immutable on their endpoint + // pair so a direct SET of from/to is not supported). + const q = ` +MATCH (caller:Node)-[e:Edge]->(stub:Node) +WHERE stub.kind = 'unresolved' +WITH e, caller, stub, stub.name AS name +OPTIONAL MATCH (cnd:Node {name: name}) +WITH e, caller, stub, name, count(cnd) AS cnt +WHERE cnt = 1 +MATCH (target:Node {name: name}) +DELETE e +CREATE (caller)-[newE:Edge { + kind: e.kind, + file_path: e.file_path, + line: e.line, + confidence: e.confidence, + confidence_label: e.confidence_label, + origin: 'ast_resolved', + tier: 'ast_resolved', + cross_repo: e.cross_repo, + meta: e.meta +}]->(target) +RETURN count(newE) AS resolved` + res, err := s.conn.Query(q) + if err != nil { + return 0, fmt.Errorf("backend-resolver: %w", err) + } + defer res.Close() + if !res.HasNext() { + return 0, nil + } + row, err := res.Next() + if err != nil { + return 0, fmt.Errorf("backend-resolver: read result: %w", err) + } + defer row.Close() + vals, err := row.GetAsSlice() + if err != nil || len(vals) == 0 { + return 0, err + } + n, _ := vals[0].(int64) + if n > 0 { + s.edgeIdentityRevs.Add(n) + s.writeGen.Add(1) + } + return int(n), nil +} diff --git a/internal/graph/store_ladybug/connpool.go b/internal/graph/store_ladybug/connpool.go new file mode 100644 index 00000000..440b3981 --- /dev/null +++ b/internal/graph/store_ladybug/connpool.go @@ -0,0 +1,173 @@ +package store_ladybug + +import ( + "fmt" + "sync" + + lbug "github.com/LadybugDB/go-ladybug" +) + +// connPool holds a fixed-size pool of *lbug.Connection bound to +// the same *lbug.Database. The Go binding's `(c *Connection).Query` +// is single-threaded — two goroutines calling Query on the SAME +// Connection race in the cgo layer and SIGSEGV (we saw this with +// the per-repo IndexCtx shadow-swap NodeCount checks under +// MultiIndexer). Giving each goroutine its own Connection +// eliminates the race AND removes the writeMu serialisation +// bottleneck that was making small repos wait 100+ seconds for +// the big repo's bulk drain. +// +// Pool semantics: +// - get() blocks until a Connection is available (no allocation +// of new connections beyond the initial size; bounded +// concurrency by design — ladybug spawns its own internal +// query workers per connection). +// - put() returns the Connection to the pool. Always defer put +// after get. +// - Each Connection lazy-loads any extensions (FTS / VECTOR / +// ALGO) that have been registered with the pool. The pool +// replays the extension list on every checkout against +// connections that haven't been seen yet for that extension. +type connPool struct { + db *lbug.Database + available chan *lbug.Connection + closeOnce sync.Once + + extMu sync.RWMutex + extensions []string // ordered list of extension names + loadedExt map[*lbug.Connection]map[string]bool +} + +// newConnPool opens `size` connections on db and returns the +// pool. Caller closes via close(). On failure the partially +// created connections are torn down. +func newConnPool(db *lbug.Database, size int) (*connPool, error) { + if size <= 0 { + size = 1 + } + pool := &connPool{ + db: db, + available: make(chan *lbug.Connection, size), + loadedExt: make(map[*lbug.Connection]map[string]bool), + } + for i := 0; i < size; i++ { + conn, err := lbug.OpenConnection(db) + if err != nil { + pool.close() + return nil, fmt.Errorf("connpool: open connection %d/%d: %w", i+1, size, err) + } + pool.available <- conn + } + return pool, nil +} + +// get blocks until a connection is available, applies any +// pending extension loads to it, and returns it. Caller MUST +// defer put. +func (p *connPool) get() *lbug.Connection { + conn := <-p.available + p.ensureExtensionsLocked(conn) + return conn +} + +// put returns a connection to the pool. Calling put on a nil +// connection or after close is a no-op. +func (p *connPool) put(conn *lbug.Connection) { + if conn == nil || p.available == nil { + return + } + defer func() { + // Re-injecting into a closed channel panics — recover so a + // late put after close doesn't crash the daemon. + _ = recover() + }() + p.available <- conn +} + +// discard removes a connection from circulation instead of returning +// it to the pool, then opens a fresh replacement so the pool stays at +// its configured size. Call this — never put — for any connection +// whose last operation ERRORED. +// +// Rationale: a liblbug connection that errored mid-statement (most +// notably a COPY that hit a duplicated-primary-key Runtime/Copy +// exception during warmup) can be left with poisoned internal +// transaction / pthread-mutex state. Recycling it via put() means the +// next goroutine to check it out and call Prepare dies with +// "prepare: mutex lock failed: Invalid argument" — a panic on a +// completely unrelated goroutine (e.g. the resolver's reconcile +// ReindexEdges pass). Same hazard class as a parse cancelled +// mid-balancing poisoning a tree-sitter parser: a broken handle must +// be closed and replaced, never pooled. +func (p *connPool) discard(conn *lbug.Connection) { + if conn == nil { + return + } + // Drop any extension-load bookkeeping keyed on the dead handle so + // the loadedExt map doesn't leak entries for closed connections. + p.extMu.Lock() + delete(p.loadedExt, conn) + p.extMu.Unlock() + conn.Close() + if p.available == nil || p.db == nil { + return + } + // Open a replacement so the pool doesn't shrink by one on every + // error. If reopening fails the pool runs one connection lighter, + // which is still strictly better than handing out a dead handle. + fresh, err := lbug.OpenConnection(p.db) + if err != nil { + return + } + p.put(fresh) +} + +// ensureExtensionsLocked loads any registered extensions onto +// the given connection that haven't been loaded there yet. +// Idempotent per (conn, ext) pair. +func (p *connPool) ensureExtensionsLocked(conn *lbug.Connection) { + p.extMu.RLock() + exts := append([]string(nil), p.extensions...) + p.extMu.RUnlock() + if len(exts) == 0 { + return + } + p.extMu.Lock() + defer p.extMu.Unlock() + loaded, ok := p.loadedExt[conn] + if !ok { + loaded = make(map[string]bool, len(exts)) + p.loadedExt[conn] = loaded + } + for _, ext := range exts { + if loaded[ext] { + continue + } + // LOAD EXTENSION can soft-fail; the next operation on the + // connection will surface a real error. Ignore the return + // here — extensions that aren't available will fail at + // query time with a clearer message. + res, err := conn.Query("LOAD EXTENSION " + ext) + if err == nil && res != nil { + res.Close() + } + loaded[ext] = true + } +} + +// close releases every connection in the pool. Safe to call +// multiple times. +func (p *connPool) close() { + p.closeOnce.Do(func() { + close(p.available) + for conn := range p.available { + if conn != nil { + conn.Close() + } + } + p.available = nil + p.extMu.Lock() + p.loadedExt = nil + p.extMu.Unlock() + }) +} diff --git a/internal/graph/store_ladybug/deadcode_probe_test.go b/internal/graph/store_ladybug/deadcode_probe_test.go new file mode 100644 index 00000000..73be58fa --- /dev/null +++ b/internal/graph/store_ladybug/deadcode_probe_test.go @@ -0,0 +1,202 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" + + "github.com/zzet/gortex/internal/graph" +) + +// TestDeadCode_Probe probes the Cypher shapes that could implement the +// server-side dead-code candidate filter: +// +// - "WHERE NOT EXISTS { MATCH ... }" — subquery existence check; the +// spec-defined way to ask "no incoming edge of allowed kind". +// - Per-node-kind UNWIND with the allowlist baked in as a Cypher list +// literal (one query per kind). +// - LEFT JOIN trick (OPTIONAL MATCH … WHERE other IS NULL) — the +// classic anti-join pattern. +// +// The probe logs which shape Ladybug accepts and the row counts so the +// implementation can pick the one that compiles AND has reasonable +// runtime characteristics. +func TestDeadCode_Probe(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-deadcode-probe-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + // Seed a small graph with: + // - Function "Alive" called by another function. + // - Function "Dead" never called. + // - Function "WrongKindOnly" referenced but only by reads (wrong + // allowlist for functions — should still appear dead). + // - Method "AliveMethod" called. + // - Method "DeadMethod" never touched. + // - Type "AliveType" referenced. + // - Type "DeadType" with no incoming edges. + nodes := []*graph.Node{ + {ID: "Alive", Kind: graph.KindFunction, Name: "Alive", FilePath: "a.go"}, + {ID: "Dead", Kind: graph.KindFunction, Name: "Dead", FilePath: "a.go"}, + {ID: "WrongKindOnly", Kind: graph.KindFunction, Name: "WrongKindOnly", FilePath: "a.go"}, + {ID: "Caller", Kind: graph.KindFunction, Name: "Caller", FilePath: "a.go"}, + {ID: "AliveMethod", Kind: graph.KindMethod, Name: "AliveMethod", FilePath: "a.go"}, + {ID: "DeadMethod", Kind: graph.KindMethod, Name: "DeadMethod", FilePath: "a.go"}, + {ID: "AliveType", Kind: graph.KindType, Name: "AliveType", FilePath: "a.go"}, + {ID: "DeadType", Kind: graph.KindType, Name: "DeadType", FilePath: "a.go"}, + } + for _, n := range nodes { + s.AddNode(n) + } + for _, e := range []*graph.Edge{ + {From: "Caller", To: "Alive", Kind: graph.EdgeCalls, FilePath: "a.go", Line: 1}, + {From: "Caller", To: "WrongKindOnly", Kind: graph.EdgeReads, FilePath: "a.go", Line: 2}, + {From: "Caller", To: "AliveMethod", Kind: graph.EdgeCalls, FilePath: "a.go", Line: 3}, + {From: "Caller", To: "AliveType", Kind: graph.EdgeReferences, FilePath: "a.go", Line: 4}, + } { + s.AddEdge(e) + } + + probes := []struct { + name string + q string + args map[string]any + }{ + { + // Shape A: per-kind WHERE NOT EXISTS subquery (Cypher spec + // shape). One query per node kind; the allowlist is a list + // literal in $allowed. + name: "shape_A_not_exists_subquery", + q: ` +MATCH (n:Node {kind: $kind}) +WHERE NOT EXISTS { + MATCH (src:Node)-[e:Edge]->(n) + WHERE e.kind IN $allowed +} +RETURN n.id`, + args: map[string]any{ + "kind": string(graph.KindFunction), + "allowed": []any{string(graph.EdgeCalls), string(graph.EdgeReferences)}, + }, + }, + { + // Shape B: LEFT-JOIN-style OPTIONAL MATCH + IS NULL anti-join. + name: "shape_B_optional_match_isnull", + q: ` +MATCH (n:Node {kind: $kind}) +OPTIONAL MATCH (src:Node)-[e:Edge]->(n) WHERE e.kind IN $allowed +WITH n, count(e) AS inc +WHERE inc = 0 +RETURN n.id`, + args: map[string]any{ + "kind": string(graph.KindFunction), + "allowed": []any{string(graph.EdgeCalls), string(graph.EdgeReferences)}, + }, + }, + { + // Shape C: COUNT subquery (Cypher 9+ COUNT subquery form). + name: "shape_C_count_subquery", + q: ` +MATCH (n:Node {kind: $kind}) +WHERE COUNT { MATCH (src:Node)-[e:Edge]->(n) WHERE e.kind IN $allowed } = 0 +RETURN n.id`, + args: map[string]any{ + "kind": string(graph.KindFunction), + "allowed": []any{string(graph.EdgeCalls), string(graph.EdgeReferences)}, + }, + }, + { + // Shape D: per-kind without explicit allowed (any incoming + // edge counts as alive — fast path for kinds whose allowlist + // is implicit). + name: "shape_D_not_exists_any", + q: ` +MATCH (n:Node {kind: $kind}) +WHERE NOT EXISTS { MATCH (src:Node)-[e:Edge]->(n) } +RETURN n.id`, + args: map[string]any{"kind": string(graph.KindMethod)}, + }, + { + // Shape E: NOT EXISTS with the WHERE inside as a property + // match (no IN). Some Cypher dialects fail on IN inside + // subquery WHERE — try a single-kind form as a fallback. + name: "shape_E_not_exists_single_kind", + q: ` +MATCH (n:Node {kind: $kind}) +WHERE NOT EXISTS { MATCH (src:Node)-[e:Edge {kind: $alloweKind}]->(n) } +RETURN n.id`, + args: map[string]any{ + "kind": string(graph.KindFunction), + "alloweKind": string(graph.EdgeCalls), + }, + }, + } + + for _, p := range probes { + rows, qerr := tryQueryCypher(s, p.q, p.args) + if qerr != nil { + t.Logf("%s: error: %v", p.name, qerr) + continue + } + t.Logf("%s → %d rows", p.name, len(rows)) + for _, r := range rows { + t.Logf(" %v", r) + } + } + + // Probe interface-implements join shape used by IfaceImplementsScanner. + t.Log("--- iface implements probes ---") + s.AddNode(&graph.Node{ + ID: "iface1", Kind: graph.KindInterface, Name: "Foo", FilePath: "a.go", + Meta: map[string]any{"methods": []string{"Bar"}}, + }) + s.AddNode(&graph.Node{ + ID: "type1", Kind: graph.KindType, Name: "FooImpl", FilePath: "a.go", + }) + s.AddEdge(&graph.Edge{From: "type1", To: "iface1", Kind: graph.EdgeImplements, FilePath: "a.go", Line: 7}) + + ifaceProbes := []struct { + name string + q string + }{ + { + name: "iface_basic", + q: ` +MATCH (t:Node)-[e:Edge {kind: 'implements'}]->(iface:Node {kind: 'interface'}) +WHERE iface.meta <> '' +RETURN t.id, iface.id, iface.meta`, + }, + { + name: "iface_strict_kind_param", + q: ` +MATCH (t:Node)-[e:Edge]->(iface:Node) +WHERE e.kind = $impl AND iface.kind = $iface AND iface.meta <> '' +RETURN t.id, iface.id, iface.meta`, + }, + } + for _, p := range ifaceProbes { + args := map[string]any{ + "impl": string(graph.EdgeImplements), + "iface": string(graph.KindInterface), + } + rows, qerr := tryQueryCypher(s, p.q, args) + if qerr != nil { + t.Logf("%s: error: %v", p.name, qerr) + continue + } + t.Logf("%s → %d rows", p.name, len(rows)) + for _, r := range rows { + t.Logf(" %v", r) + } + } +} diff --git a/internal/graph/store_ladybug/file_index.go b/internal/graph/store_ladybug/file_index.go new file mode 100644 index 00000000..eb108d9f --- /dev/null +++ b/internal/graph/store_ladybug/file_index.go @@ -0,0 +1,118 @@ +package store_ladybug + +import ( + "sync" + + "github.com/zzet/gortex/internal/graph" +) + +// fileIDIndex is a Go-side accelerator that maps each file path to the +// set of node IDs anchored to that file. Kuzu does not expose a +// secondary index on `Node.file_path`, so every "find the symbols in +// this file" lookup defaulted to a full Node-table scan +// (`MATCH (n {file_path: $f})` — 213 k rows on the gortex graph for one +// call). This map turns the lookup into a single RLock + map probe, at +// a per-node cost of one string slot in a set entry. +// +// The set form (map[id]struct{}) is intentional: AddBatch / AddNode +// can be called multiple times for the same node id (the indexer +// re-runs after an incremental re-index, the resolver re-stamps +// metadata) and we want idempotent membership rather than duplicated +// slice entries. +// +// Concurrency: the store's writeMu serialises mutations, so every +// add/remove call already runs under that lock when invoked from the +// store's public API. The dedicated fileMu only guards the readers +// (GetFileSubGraph and friends), which run without writeMu. Holding a +// finer-grained mutex than writeMu lets readers proceed in parallel +// with each other even when a writer is mid-commit. +type fileIDIndex struct { + mu sync.RWMutex + m map[string]map[string]struct{} +} + +func newFileIDIndex() *fileIDIndex { + return &fileIDIndex{m: make(map[string]map[string]struct{})} +} + +// add registers (id, filePath). No-op when either is empty. +func (f *fileIDIndex) add(filePath, id string) { + if filePath == "" || id == "" { + return + } + f.mu.Lock() + defer f.mu.Unlock() + set, ok := f.m[filePath] + if !ok { + set = make(map[string]struct{}, 4) + f.m[filePath] = set + } + set[id] = struct{}{} +} + +// addNodes bulk-loads node IDs in one lock acquisition. The bulk-load +// fast path drains thousands of nodes per call; per-node add() would +// thrash the mutex. +func (f *fileIDIndex) addNodes(nodes []*graph.Node) { + if len(nodes) == 0 { + return + } + f.mu.Lock() + defer f.mu.Unlock() + for _, n := range nodes { + if n == nil || n.ID == "" || n.FilePath == "" { + continue + } + set, ok := f.m[n.FilePath] + if !ok { + set = make(map[string]struct{}, 4) + f.m[n.FilePath] = set + } + set[n.ID] = struct{}{} + } +} + +// removeFile drops every entry for filePath. +func (f *fileIDIndex) removeFile(filePath string) { + if filePath == "" { + return + } + f.mu.Lock() + defer f.mu.Unlock() + delete(f.m, filePath) +} + +// removeFiles drops every entry under any of paths. Used by +// EvictRepo (which first asks the store which file paths belong to +// the repo, then forwards the list here). +func (f *fileIDIndex) removeFiles(paths []string) { + if len(paths) == 0 { + return + } + f.mu.Lock() + defer f.mu.Unlock() + for _, p := range paths { + delete(f.m, p) + } +} + +// idsFor returns a copy of the id set for filePath, or nil. Returning a +// slice rather than the underlying map keeps callers' iteration +// independent of subsequent writes — they don't need to hold the lock +// past the call. +func (f *fileIDIndex) idsFor(filePath string) []string { + if filePath == "" { + return nil + } + f.mu.RLock() + defer f.mu.RUnlock() + set := f.m[filePath] + if len(set) == 0 { + return nil + } + out := make([]string, 0, len(set)) + for id := range set { + out = append(out, id) + } + return out +} diff --git a/internal/graph/store_ladybug/file_mtimes.go b/internal/graph/store_ladybug/file_mtimes.go new file mode 100644 index 00000000..f7903c43 --- /dev/null +++ b/internal/graph/store_ladybug/file_mtimes.go @@ -0,0 +1,130 @@ +package store_ladybug + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the FileMtime persistence +// capability. Lifting per-file mtimes off the daemon's gob+gzip +// snapshot and into the FileMtime node table is what lets the warm- +// restart path read incremental-reindex state through ladybug instead +// of through a sidecar file. +var ( + _ graph.FileMtimeWriter = (*Store)(nil) + _ graph.FileMtimeReader = (*Store)(nil) +) + +// BulkSetFileMtimes upserts the per-file modification times under one +// repo prefix. Mirrors the in-memory Indexer's fileMtimes map but +// makes the data durable in ladybug so the next daemon restart can +// reconstruct it without replaying a gob snapshot. +// +// Empty input is a no-op. Empty repoPrefix is allowed (the in-memory +// indexer keys mtimes the same way for single-repo daemons). +func (s *Store) BulkSetFileMtimes(repoPrefix string, mtimes map[string]int64) error { + if len(mtimes) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + // UNWIND + MERGE: one Cypher Execute per chunk amortises the parse + // + plan over the whole batch. 5k is the same chunk size the rest + // of the indexer's batched writes use; the relevant constant lives + // next to the AddBatch path. + rows := make([]map[string]any, 0, len(mtimes)) + for id, mt := range mtimes { + if id == "" { + continue + } + // The incoming map is keyed by RELATIVE path (the indexer keys + // fileMtimes by relKey). PRIMARY KEY(file_id) on the FileMtime + // table is global, but relative paths are NOT unique across + // repos: every tree-sitter grammar repo carries `src/parser.c`, + // `grammar.js`, `binding.gyp`, etc. Storing the bare relative + // path as file_id let those rows collide cross-repo — the + // last-writing repo's MERGE overwrote the row's repo_prefix, so + // every other repo sharing that path silently lost its mtimes + // and re-indexed (full COPY) on every warm restart. Prefix the + // id with the repo prefix to make it globally unique, matching + // the `repoPrefix + "/" + relPath` convention node file_paths + // already use. LoadFileMtimes strips the prefix back off. + fileID := id + if repoPrefix != "" { + fileID = repoPrefix + "/" + id + } + rows = append(rows, map[string]any{ + "file_id": fileID, + "repo_prefix": repoPrefix, + "mtime_ns": mt, + }) + } + for i := 0; i < len(rows); i += kuzuBatchChunkSize { + end := i + kuzuBatchChunkSize + if end > len(rows) { + end = len(rows) + } + const q = ` +UNWIND $rows AS row +MERGE (m:FileMtime {file_id: row.file_id}) +SET m.repo_prefix = row.repo_prefix, + m.mtime_ns = row.mtime_ns` + s.runWriteLocked(q, map[string]any{"rows": rows[i:end]}) + } + return nil +} + +// LoadFileMtimes returns the per-file mtimes for one repo prefix as a +// fresh map. Empty repo prefix returns every recorded mtime — the +// daemon doesn't currently call it that way, but the unsuffixed shape +// keeps the function useful for ad-hoc probes. +// +// The query goes through the read path's degraded-on-error wrapper +// (querySelect → querySelectInner), so a transient IO exception +// returns an empty map rather than killing the daemon. Worst case the +// warmup falls back to TrackRepoCtx for that repo, which is exactly +// what the snapshot-less path used to do. +func (s *Store) LoadFileMtimes(repoPrefix string) map[string]int64 { + var ( + q string + args map[string]any + ) + if repoPrefix == "" { + q = `MATCH (m:FileMtime) RETURN m.file_id, m.mtime_ns` + args = nil + } else { + q = `MATCH (m:FileMtime) WHERE m.repo_prefix = $repo RETURN m.file_id, m.mtime_ns` + args = map[string]any{"repo": repoPrefix} + } + rows := s.querySelect(q, args) + if len(rows) == 0 { + return nil + } + // Strip the repo prefix BulkSetFileMtimes prepends so the returned + // keys are relative paths again — that's what the indexer's + // fileMtimes map / IsStale comparison expect. Tolerate rows written + // by the pre-fix code (bare relative file_id): when the prefix isn't + // present we use the id verbatim, so a store mid-migration loads + // both shapes without re-indexing the repos that were never + // collision victims. + strip := "" + if repoPrefix != "" { + strip = repoPrefix + "/" + } + out := make(map[string]int64, len(rows)) + for _, r := range rows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + if id == "" { + continue + } + if strip != "" { + id = strings.TrimPrefix(id, strip) + } + out[id] = asInt64(r[1]) + } + return out +} diff --git a/internal/graph/store_ladybug/file_mtimes_probe_test.go b/internal/graph/store_ladybug/file_mtimes_probe_test.go new file mode 100644 index 00000000..c9180789 --- /dev/null +++ b/internal/graph/store_ladybug/file_mtimes_probe_test.go @@ -0,0 +1,144 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" +) + +// TestFileMtimes_PersistAcrossOpens locks in the warm-restart +// contract: BulkSetFileMtimes writes to the FileMtime table, the +// store closes, the store reopens, and LoadFileMtimes returns the +// same data. Pre-fix, the daemon's warmup re-walked every repo on +// each restart — find_usages stayed correct but the daemon paid 10 +// minutes of warmup it could have skipped. This probe is the +// regression guard. +func TestFileMtimes_PersistAcrossOpens(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-mtime-probe-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + path := filepath.Join(dir, "store.lbug") + + // Phase 1: open, write, close. + { + s, err := Open(path) + if err != nil { + t.Fatalf("phase1 open: %v", err) + } + mtimes := map[string]int64{ + "internal/mcp/server.go": 1779000000, + "internal/mcp/handler.go": 1779000001, + "internal/graph/graph.go": 1779000002, + } + if err := s.BulkSetFileMtimes("gortex", mtimes); err != nil { + t.Fatalf("phase1 BulkSetFileMtimes: %v", err) + } + mtimesB := map[string]int64{ + "api/billing.go": 1779000010, + } + if err := s.BulkSetFileMtimes("gortex-cloud", mtimesB); err != nil { + t.Fatalf("phase1 BulkSetFileMtimes B: %v", err) + } + _ = s.Close() + } + + // Phase 2: reopen, read, compare. + s, err := Open(path) + if err != nil { + t.Fatalf("phase2 open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + gotA := s.LoadFileMtimes("gortex") + if len(gotA) != 3 { + t.Errorf("phase2 LoadFileMtimes(gortex) = %d entries, want 3: %v", len(gotA), gotA) + } + if gotA["internal/mcp/server.go"] != 1779000000 { + t.Errorf("phase2 server.go mtime = %d, want 1779000000", gotA["internal/mcp/server.go"]) + } + + gotB := s.LoadFileMtimes("gortex-cloud") + if len(gotB) != 1 { + t.Errorf("phase2 LoadFileMtimes(gortex-cloud) = %d entries, want 1: %v", len(gotB), gotB) + } + if gotB["api/billing.go"] != 1779000010 { + t.Errorf("phase2 billing.go mtime = %d, want 1779000010", gotB["api/billing.go"]) + } + + // Empty prefix returns all. + all := s.LoadFileMtimes("") + if len(all) != 4 { + t.Errorf("phase2 LoadFileMtimes('') = %d entries, want 4", len(all)) + } +} + +// TestFileMtimes_SharedRelativePathsAcrossRepos is the regression guard +// for the cross-repo collision that re-indexed (and crashed) repos on +// every warm restart. PRIMARY KEY(file_id) is global, but relative paths +// are not unique across repos — every tree-sitter grammar repo ships +// `src/parser.c`, `grammar.js`, `binding.gyp`. With the bare relative +// path as file_id, the second repo's MERGE overwrote the first's +// repo_prefix, so LoadFileMtimes returned zero rows for every repo but +// the last writer; the daemon then full-COPY-re-indexed those repos +// against an already-populated store, SIGSEGVing on the duplicate keys. +// The fix prefixes file_id with the repo prefix; this test proves two +// repos sharing identical relative paths each round-trip their own +// mtimes. +func TestFileMtimes_SharedRelativePathsAcrossRepos(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-mtime-collide-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + path := filepath.Join(dir, "store.lbug") + + shared := []string{"src/parser.c", "grammar.js", "binding.gyp"} + + { + s, err := Open(path) + if err != nil { + t.Fatalf("open: %v", err) + } + dart := map[string]int64{} + swift := map[string]int64{} + for i, p := range shared { + dart[p] = int64(1779000000 + i) + swift[p] = int64(1779009000 + i) + } + if err := s.BulkSetFileMtimes("tree-sitter-dart", dart); err != nil { + t.Fatalf("set dart: %v", err) + } + if err := s.BulkSetFileMtimes("tree-sitter-swift", swift); err != nil { + t.Fatalf("set swift: %v", err) + } + _ = s.Close() + } + + s, err := Open(path) + if err != nil { + t.Fatalf("reopen: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + gotDart := s.LoadFileMtimes("tree-sitter-dart") + if len(gotDart) != len(shared) { + t.Fatalf("dart loaded %d entries, want %d (cross-repo collision regressed): %v", + len(gotDart), len(shared), gotDart) + } + if gotDart["src/parser.c"] != 1779000000 { + t.Errorf("dart src/parser.c = %d, want 1779000000 (got swift's value? = collision)", gotDart["src/parser.c"]) + } + + gotSwift := s.LoadFileMtimes("tree-sitter-swift") + if len(gotSwift) != len(shared) { + t.Fatalf("swift loaded %d entries, want %d: %v", len(gotSwift), len(shared), gotSwift) + } + if gotSwift["src/parser.c"] != 1779009000 { + t.Errorf("swift src/parser.c = %d, want 1779009000", gotSwift["src/parser.c"]) + } +} diff --git a/internal/graph/store_ladybug/frontier_scale_test.go b/internal/graph/store_ladybug/frontier_scale_test.go new file mode 100644 index 00000000..a14da378 --- /dev/null +++ b/internal/graph/store_ladybug/frontier_scale_test.go @@ -0,0 +1,70 @@ +package store_ladybug_test + +import ( + "fmt" + "path/filepath" + "testing" + "time" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_ladybug" + "github.com/zzet/gortex/internal/query" +) + +// TestBFS_BoundsHugeFanInHub is the regression guard for the +// smart_context 40 GB / 8-min incident. A routing hub with thousands of +// inbound edges must not drag its entire adjacency across the cgo +// boundary: GetCallers over the ladybug store routes through +// Engine.bfs -> Store.ExpandFrontier, which applies a server-side LIMIT, +// so the result is bounded by the node limit regardless of the hub's +// true degree. Pre-fix, bfs fetched every inbound edge with no LIMIT and +// issued one GetNode cgo round-trip per edge. +func TestBFS_BoundsHugeFanInHub(t *testing.T) { + const fanIn = 2000 // >> limit (64) and >> frontierRowCap (512) + + s, err := store_ladybug.Open(filepath.Join(t.TempDir(), "fanin.kuzu")) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + nodes := make([]*graph.Node, 0, fanIn+1) + edges := make([]*graph.Edge, 0, fanIn) + nodes = append(nodes, &graph.Node{ID: "hub", Name: "hub", Kind: graph.KindFunction, FilePath: "hub.go", WorkspaceID: "ws"}) + for i := 0; i < fanIn; i++ { + id := fmt.Sprintf("caller%05d", i) + nodes = append(nodes, &graph.Node{ID: id, Name: id, Kind: graph.KindFunction, FilePath: id + ".go", WorkspaceID: "ws"}) + edges = append(edges, &graph.Edge{From: id, To: "hub", Kind: graph.EdgeCalls, FilePath: id + ".go", Line: 1}) + } + s.AddBatch(nodes, edges) + + // Sanity: the hub really has fanIn callers in the store. + if got := len(s.GetInEdges("hub")); got != fanIn { + t.Fatalf("store seeded with %d inbound edges, want %d", got, fanIn) + } + + eng := query.NewEngine(s) + const limit = 64 + start := time.Now() + sg := eng.GetCallers("hub", query.QueryOptions{Depth: 1, Limit: limit, Detail: "brief", WorkspaceID: "ws"}) + elapsed := time.Since(start) + + // The fix: result bounded by the node limit, not the hub's true degree. + if len(sg.Nodes) > limit+1 { // +1 for the seed node + t.Fatalf("GetCallers returned %d nodes, want <= %d (limit+seed) — fan not bounded", len(sg.Nodes), limit+1) + } + // Edges are appended only while under the node budget, so they are + // bounded too — far below the hub's true fan-in (the heap-blowup guard). + if len(sg.Edges) > limit+1 { + t.Fatalf("GetCallers returned %d edges, want <= %d — server-side LIMIT not applied (pre-fix: %d)", len(sg.Edges), limit+1, fanIn) + } + if !sg.Truncated { + t.Fatalf("a %d-fan-in hub capped at limit %d must report Truncated", fanIn, limit) + } + // The seed must be present and in-scope neighbours must have come back. + if len(sg.Nodes) < 2 { + t.Fatalf("GetCallers returned %d nodes, expected the hub plus callers", len(sg.Nodes)) + } + t.Logf("GetCallers over %d-fan-in hub: %d nodes, %d edges in %s (pre-fix would materialise %d edges + %d GetNode round-trips)", + fanIn, len(sg.Nodes), len(sg.Edges), elapsed, fanIn, fanIn) +} diff --git a/internal/graph/store_ladybug/frontier_test.go b/internal/graph/store_ladybug/frontier_test.go new file mode 100644 index 00000000..ab388385 --- /dev/null +++ b/internal/graph/store_ladybug/frontier_test.go @@ -0,0 +1,144 @@ +package store_ladybug_test + +import ( + "path/filepath" + "sort" + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_ladybug" +) + +// buildFrontierStore seeds a hub with two callers (a, b) and two +// callees reached by different edge kinds (c via Calls, d via +// References), plus a Calls edge to an unresolved stub and to an +// external stub — both of which ExpandFrontier must filter server-side. +func buildFrontierStore(t *testing.T) *store_ladybug.Store { + t.Helper() + s, err := store_ladybug.Open(filepath.Join(t.TempDir(), "frontier.kuzu")) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + for _, n := range []*graph.Node{ + {ID: "a", Name: "a", Kind: graph.KindFunction, FilePath: "a.go", WorkspaceID: "ws"}, + {ID: "b", Name: "b", Kind: graph.KindFunction, FilePath: "b.go", WorkspaceID: "ws"}, + {ID: "hub", Name: "hub", Kind: graph.KindFunction, FilePath: "hub.go", WorkspaceID: "ws"}, + {ID: "c", Name: "c", Kind: graph.KindFunction, FilePath: "c.go", WorkspaceID: "ws"}, + {ID: "d", Name: "d", Kind: graph.KindFunction, FilePath: "d.go", WorkspaceID: "ws"}, + // Stub endpoints so the edges below are insertable; ExpandFrontier + // must still exclude them by id prefix. + {ID: "unresolved::ghost", Name: "ghost", Kind: graph.KindFunction, FilePath: ""}, + {ID: "external::pkg.Ext", Name: "Ext", Kind: graph.KindFunction, FilePath: ""}, + } { + s.AddNode(n) + } + for _, e := range []*graph.Edge{ + {From: "a", To: "hub", Kind: graph.EdgeCalls, FilePath: "a.go", Line: 1}, + {From: "b", To: "hub", Kind: graph.EdgeCalls, FilePath: "b.go", Line: 2}, + {From: "hub", To: "c", Kind: graph.EdgeCalls, FilePath: "hub.go", Line: 3}, + {From: "hub", To: "d", Kind: graph.EdgeReferences, FilePath: "hub.go", Line: 4}, + {From: "hub", To: "unresolved::ghost", Kind: graph.EdgeCalls, FilePath: "hub.go", Line: 5}, + {From: "hub", To: "external::pkg.Ext", Kind: graph.EdgeCalls, FilePath: "hub.go", Line: 6}, + } { + s.AddEdge(e) + } + return s +} + +func neighborIDs(hops []graph.FrontierHop) []string { + ids := make([]string, 0, len(hops)) + for _, h := range hops { + ids = append(ids, h.Neighbor.ID) + } + sort.Strings(ids) + return ids +} + +func equalIDs(got, want []string) bool { + if len(got) != len(want) { + return false + } + for i := range got { + if got[i] != want[i] { + return false + } + } + return true +} + +// TestExpandFrontier_OutgoingFiltersAndProjection verifies the forward +// expansion: edge-kind filtering, server-side exclusion of +// unresolved/external targets, and that the neighbour node is fully +// projected (columns populated) but meta-free. +func TestExpandFrontier_OutgoingFiltersAndProjection(t *testing.T) { + s := buildFrontierStore(t) + + // Calls + References → c (Calls) and d (References); the unresolved + // and external targets are dropped by the server-side id filter. + hops := s.ExpandFrontier([]string{"hub"}, true, []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, 0) + if got, want := neighborIDs(hops), []string{"c", "d"}; !equalIDs(got, want) { + t.Fatalf("forward Calls+References neighbours = %v, want %v", got, want) + } + + // Edge-kind filter: Calls only → just c (d is reached via References). + callsOnly := s.ExpandFrontier([]string{"hub"}, true, []graph.EdgeKind{graph.EdgeCalls}, 0) + if got, want := neighborIDs(callsOnly), []string{"c"}; !equalIDs(got, want) { + t.Fatalf("forward Calls-only neighbours = %v, want %v", got, want) + } + + // Projection: the c hop carries a populated, meta-free neighbour and + // the correctly-oriented edge. + var cHop *graph.FrontierHop + for i := range callsOnly { + if callsOnly[i].Neighbor.ID == "c" { + cHop = &callsOnly[i] + break + } + } + if cHop == nil { + t.Fatal("no hop for neighbour c") + } + if cHop.Neighbor.Name != "c" || cHop.Neighbor.FilePath != "c.go" || cHop.Neighbor.Kind != graph.KindFunction { + t.Fatalf("neighbour c under-projected: %+v", cHop.Neighbor) + } + if cHop.Neighbor.Meta != nil { + t.Fatalf("neighbour c should be meta-free, got Meta=%v", cHop.Neighbor.Meta) + } + if cHop.Edge.From != "hub" || cHop.Edge.To != "c" || cHop.Edge.Kind != graph.EdgeCalls { + t.Fatalf("edge hub->c mis-decoded: %+v", cHop.Edge) + } +} + +// TestExpandFrontier_Incoming verifies the reverse expansion: callers of +// the hub are the neighbours, oriented so the edge still points at the +// hub. +func TestExpandFrontier_Incoming(t *testing.T) { + s := buildFrontierStore(t) + + hops := s.ExpandFrontier([]string{"hub"}, false, []graph.EdgeKind{graph.EdgeCalls}, 0) + if got, want := neighborIDs(hops), []string{"a", "b"}; !equalIDs(got, want) { + t.Fatalf("incoming Calls neighbours = %v, want %v", got, want) + } + for _, h := range hops { + if h.Edge.To != "hub" { + t.Fatalf("incoming hop edge should point at hub, got To=%q", h.Edge.To) + } + if h.Edge.From != h.Neighbor.ID { + t.Fatalf("incoming hop neighbour %q should equal edge.From %q", h.Neighbor.ID, h.Edge.From) + } + } +} + +// TestExpandFrontier_EmptyInputs guards the early-return contract: no ids +// or no kinds yields no hops (and no query). +func TestExpandFrontier_EmptyInputs(t *testing.T) { + s := buildFrontierStore(t) + if got := s.ExpandFrontier(nil, true, []graph.EdgeKind{graph.EdgeCalls}, 0); got != nil { + t.Fatalf("ExpandFrontier(nil ids) = %v, want nil", got) + } + if got := s.ExpandFrontier([]string{"hub"}, true, nil, 0); got != nil { + t.Fatalf("ExpandFrontier(nil kinds) = %v, want nil", got) + } +} diff --git a/internal/graph/store_ladybug/fts.go b/internal/graph/store_ladybug/fts.go new file mode 100644 index 00000000..57af71c5 --- /dev/null +++ b/internal/graph/store_ladybug/fts.go @@ -0,0 +1,641 @@ +package store_ladybug + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/search" +) + +// ftsIndexName is the canonical name for the FTS index built over +// SymbolFTS.tokens. Hard-coded because the index is internal to the +// store — callers only ever query it through SearchSymbols. +const ftsIndexName = "idx_symbol_fts_tokens" + +// fts holds the per-store FTS state. The extension only needs to be +// installed + loaded once per database lifetime; built tracks whether +// CREATE_FTS_INDEX has run so SearchSymbols can lazily build on the +// first query in case BuildSymbolIndex hasn't been called yet. +type ftsState struct { + extensionLoaded atomic.Bool + indexBuilt atomic.Bool +} + +// ensureFTSExtension loads the FTS extension into the current +// connection. Idempotent — the second call is a no-op via the +// extensionLoaded sentinel. Cypher's INSTALL fails when the +// extension is already known (per the upstream error message we +// surface), so we wrap with a recovery and treat +// already-installed as success. +// +// Held under writeMu by the caller so concurrent connections don't +// race the load. +func (s *Store) ensureFTSExtensionLocked() error { + if s.fts.extensionLoaded.Load() { + return nil + } + if err := runCypherSafe(s, `INSTALL FTS`); err != nil && + !strings.Contains(err.Error(), "is already installed") { + // Ignore "already installed" — every fresh open re-runs + // this and we don't want it to be a hard failure. + _ = err + } + if err := runCypherSafe(s, `LOAD EXTENSION FTS`); err != nil { + return fmt.Errorf("load fts extension: %w", err) + } + s.fts.extensionLoaded.Store(true) + return nil +} + +// UpsertSymbolFTS records (or replaces) the pre-tokenised text for +// nodeID in the SymbolFTS sidecar table. Called by the indexer for +// every node that passes shouldIndexForSearch — non-searchable +// kinds (KindFile, KindImport, KindLocal, KindBuiltin) never reach +// here, so the FTS corpus stays a clean subset of the graph. +// +// Idempotent on nodeID via MERGE so a re-index of the same file +// replaces the prior row in place rather than appending. +// +// Per-call cost is ~one MERGE; the bulk path (FlushBulk) skips this +// and instead emits a COPY-FROM TSV in copyBulkLocked for the cold- +// start fast path. +func (s *Store) UpsertSymbolFTS(nodeID, tokens string) error { + if nodeID == "" { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + if err := s.ensureFTSExtensionLocked(); err != nil { + return err + } + const q = `MERGE (f:SymbolFTS {id: $id}) SET f.tokens = $tokens` + if err := runCypherWithArgs(s, q, map[string]any{ + "id": nodeID, + "tokens": tokens, + }); err != nil { + return fmt.Errorf("upsert SymbolFTS: %w", err) + } + return nil +} + +// BulkUpsertSymbolFTS is the cold-start fast path: write a TSV of +// (id, tokens) pairs to a temp file and COPY FROM into SymbolFTS in +// one shot. Per-row cost ≈ 1µs on Ladybug's columnar storage, +// vs ~1ms for the Cypher MERGE path UpsertSymbolFTS takes — +// ~1000x cheaper at 600k-node scale. +// +// repoPrefix scopes the pre-COPY wipe: when non-empty, only rows +// whose id starts with `repoPrefix + "/"` are deleted, leaving +// sibling repos' FTS corpus untouched. Without this scoping, the +// MultiIndexer's per-repo drain calls would each clobber every +// other repo's rows and only the last-committed repo's symbols +// would be searchable (the live bug that motivated this signature +// change). Empty repoPrefix preserves the legacy wipe-all +// behaviour for single-repo daemons. +// +// Idempotent under empty input — no-ops cleanly so callers don't +// need to length-check. +func (s *Store) BulkUpsertSymbolFTS(repoPrefix string, items []graph.SymbolFTSItem) error { + if len(items) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + if err := s.ensureFTSExtensionLocked(); err != nil { + return err + } + + // Dedup by ID — last write wins, mirroring the per-call + // UpsertSymbolFTS's MERGE semantics. The indexer's drain + // shouldn't produce duplicates at the searchable-node layer + // (every Node ID is unique), but guard against the edge case + // where a re-parse of a file emitted the same ID twice. + pos := make(map[string]int, len(items)) + deduped := items[:0] + for _, it := range items { + if it.NodeID == "" { + continue + } + if p, ok := pos[it.NodeID]; ok { + deduped[p] = it + } else { + pos[it.NodeID] = len(deduped) + deduped = append(deduped, it) + } + } + items = deduped + if len(items) == 0 { + return nil + } + + // Drop the FTS index BEFORE mutating the table. Ladybug cannot + // DELETE-from / COPY-into a table that still carries an FTS index — + // the operation errors, and the failed statement leaves the pooled + // connection poisoned; discarding it then crashes the daemon in + // lbug_connection_destroy. On a cold start the table has no index + // yet so this is a no-op, but on a warm-restart re-track the prior + // run's index is present and this drop is what keeps the re-track + // from taking the whole daemon down. BuildSymbolIndex recreates the + // index after the corpus is rewritten. Same hazard (and fix) as the + // SymbolVec vector-index path. + _ = runCypherSafe(s, fmt.Sprintf(`CALL DROP_FTS_INDEX('SymbolFTS', '%s')`, ftsIndexName)) + s.fts.indexBuilt.Store(false) + + // Wipe prior FTS rows for this repo only so sibling repos + // in a MultiIndexer store keep their corpus. Without this + // scoping a clean rebuild of repo A would wipe repo B's rows + // and search_symbols would only ever see whichever repo + // committed last. + if repoPrefix != "" { + if err := runCypherWithArgs(s, `MATCH (f:SymbolFTS) WHERE f.id STARTS WITH $p DELETE f`, map[string]any{ + "p": repoPrefix + "/", + }); err != nil { + return fmt.Errorf("clear SymbolFTS for repo %q before bulk upsert: %w", repoPrefix, err) + } + // Drop stale tier-0 name-cache entries for this repo so a + // reindex that removes a symbol doesn't leave a phantom hit + // for searches against this prefix. + if s.nameIdx != nil { + s.nameIdx.removeByPrefix(repoPrefix + "/") + } + } else if err := runCypherSafe(s, `MATCH (f:SymbolFTS) DELETE f`); err != nil { + return fmt.Errorf("clear SymbolFTS before bulk upsert: %w", err) + } + + dir, err := os.MkdirTemp("", "lbug-fts-bulk-") + if err != nil { + return fmt.Errorf("mkdir bulk tmp: %w", err) + } + defer func() { _ = os.RemoveAll(dir) }() + // Ladybug's COPY binder rejects ".tsv" with "Cannot load from file + // type tsv"; the parser dispatches on extension. ".csv" + DELIM='\t' + // is the convention the Node / Edge / SymbolVec bulk loaders use. + path := filepath.Join(dir, "symbolfts.csv") + if err := writeSymbolFTSTSV(path, items); err != nil { + return fmt.Errorf("write SymbolFTS tsv: %w", err) + } + + // Load with LOAD FROM ... MERGE rather than COPY. Kuzu's COPY into a node + // table is only legal when the table is empty or already carries a + // materialised PK hash index; the per-repo DELETE above keeps sibling + // repos' rows, so SymbolFTS is non-empty by design and a direct COPY + // fails non-deterministically ("COPY into a non-empty primary-key node + // table without a hash index is not supported"). DROP TABLE + recreate + // (the SymbolVec remedy) would wipe the siblings. LOAD FROM scans the + // file as a row source and MERGEs straight into SymbolFTS in one + // statement — a DML write with no empty-table precondition, no staging + // table, and ~2x faster than COPY-into-temp + MERGE on a 20k-row corpus. + // The just-deleted rows re-enter as inserts; any survivor is upserted, + // matching UpsertSymbolFTS's MERGE semantics. column0/column1 are the + // positional names Ladybug assigns when header=false; DELIM='\t' because + // its CSV reader doesn't honour RFC-4180 quoting (tokens are tab-stripped + // in writeSymbolFTSTSV). + loadQ := fmt.Sprintf( + "LOAD FROM '%s' (header=false, delim='\\t') MERGE (f:SymbolFTS {id: column0}) SET f.tokens = column1", + escapeCypherStringLit(path), + ) + if err := runCypherSafe(s, loadQ); err != nil { + return fmt.Errorf("load SymbolFTS: %w", err) + } + // Bulk-load invalidated the prior index; force a rebuild on + // next SearchSymbols. + s.fts.indexBuilt.Store(false) + return nil +} + +// writeSymbolFTSTSV writes items to a tab-separated file in +// (id, tokens) order. Tabs / newlines in tokens are normalised to +// spaces so the COPY parser doesn't misalign rows. +func writeSymbolFTSTSV(path string, items []graph.SymbolFTSItem) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer func() { _ = f.Close() }() + var b strings.Builder + clean := func(s string) string { + // Strip / replace TSV-toxic characters. Replace tabs and + // newlines with spaces; collapse runs of whitespace later + // if needed (FTS tokeniser already splits on whitespace + // so consecutive spaces are harmless). + if !strings.ContainsAny(s, "\t\r\n") { + return s + } + r := strings.NewReplacer("\t", " ", "\r", " ", "\n", " ") + return r.Replace(s) + } + for _, it := range items { + b.Reset() + b.WriteString(clean(it.NodeID)) + b.WriteByte('\t') + b.WriteString(clean(it.Tokens)) + b.WriteByte('\n') + if _, err := f.WriteString(b.String()); err != nil { + return err + } + } + return nil +} + +// BuildSymbolIndex creates the FTS index over SymbolFTS.tokens. +// Idempotent — the second call is a no-op via the indexBuilt +// sentinel. Ladybug auto-updates the index on later inserts / +// updates to the underlying table, so this is a one-shot +// cold-start call and the daemon's incremental writes (a file +// change triggering a re-parse) don't need to drop and rebuild. +// +// Must be called AFTER the SymbolFTS table has at least one row, +// because CREATE_FTS_INDEX scans the table to build the index. An +// empty table makes the index trivially empty but still valid; a +// subsequent UpsertSymbolFTS will land on it. +func (s *Store) BuildSymbolIndex() error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + if s.fts.indexBuilt.Load() { + return nil + } + if err := s.ensureFTSExtensionLocked(); err != nil { + return err + } + // CREATE_FTS_INDEX is fatal if the index already exists, so guard + // it with a DROP first. The DROP is also fatal if the index + // doesn't exist, so swallow that case. Net effect: idempotent + // build with at most one extra catalog round-trip on the first + // call. + _ = runCypherSafe(s, fmt.Sprintf(`CALL DROP_FTS_INDEX('SymbolFTS', '%s')`, ftsIndexName)) + const ddl = `CALL CREATE_FTS_INDEX('SymbolFTS', '%s', ['tokens'])` + if err := runCypherSafe(s, fmt.Sprintf(ddl, ftsIndexName)); err != nil { + return fmt.Errorf("create fts index: %w", err) + } + s.fts.indexBuilt.Store(true) + return nil +} + +// SearchSymbols runs a full-text query against the SymbolFTS index +// and returns the hits ordered by descending BM25 score. The query +// is pre-tokenised by internal/search.TokenizeQuery and re-joined +// with spaces, so a camelCase query (`getUserById`) matches the +// same way a space-separated query (`get user by id`) would — +// matching the recall contract our existing BM25 backend gives. +// +// If the index hasn't been built yet (BuildSymbolIndex not called), +// this attempts to build it lazily on the first query so a daemon +// process that came up before the index landed still serves search +// correctly. +func (s *Store) SearchSymbols(query string, limit int) ([]graph.SymbolHit, error) { + if query == "" { + return nil, nil + } + if limit <= 0 { + limit = 20 + } + // Tier 0: exact-name lookup via the in-memory name index. The + // codedb playbook calls this the flat-symbol map: when the query + // is a single identifier, an O(1) hash hit replaces the FTS + // round-trip and the BM25 ranking cycle. We only short-circuit + // when the cache hits AT LEAST one node; misses fall through + // to the FTS path so a partial-identifier query still works. + // + // The query must look like an identifier (no whitespace, no + // path separators) — multi-word queries are concept searches + // and need BM25 to rank them across the field bag. + if isIdentifierQuery(query) && s.nameIdx != nil { + s.nameIdx.bootstrap(s) + ids := s.nameIdx.lookup(query) + if len(ids) > 0 { + out := make([]graph.SymbolHit, 0, len(ids)) + // Score = 100 so the engine's rerank treats these as + // the strongest BM25-equivalent signal — exact-name + // matches dominate the head of the result set, where + // the user expects to find their literal-typed + // identifier. The downstream rerank still re-orders + // among them on the structural signals (fan-in, + // community, …) so two same-name candidates aren't + // frozen in insertion order. + for _, id := range ids { + out = append(out, graph.SymbolHit{NodeID: id, Score: 100.0}) + if len(out) >= limit { + break + } + } + return out, nil + } + } + + // Tokenise on the read side using the SAME splitter as the + // write side (search.Tokenize). Symmetry matters: the corpus + // has `ValidateToken` stored as [validate, token], so a + // user-typed `ValidateToken` query must also split to + // [validate, token] to land. search.TokenizeQuery would NOT + // split camelCase (it preserves short tokens at the cost of + // camelCase recall), which produces a single `validatetoken` + // token that misses the split corpus. + tokens := search.Tokenize(query) + if len(tokens) == 0 { + // Fallback: when Tokenize drops everything (e.g. query is a + // single sub-2-char token like "go" / "js"), use the + // query-tokeniser's looser policy so the search still + // reaches the engine instead of silently returning empty. + tokens = search.TokenizeQuery(query) + if len(tokens) == 0 { + return nil, nil + } + } + q := strings.Join(tokens, " ") + + // Lazy build: if the index isn't there yet, try to create it + // now. Failure is non-fatal — we just return no results. + if !s.fts.indexBuilt.Load() { + if err := s.BuildSymbolIndex(); err != nil { + return nil, err + } + } + const cypher = ` +CALL QUERY_FTS_INDEX('SymbolFTS', '` + ftsIndexName + `', $q) +RETURN node.id AS id, score +ORDER BY score DESC +LIMIT $k` + rows, err := querySelectSafe(s, cypher, map[string]any{ + "q": q, + "k": int64(limit), + }) + if err != nil { + return nil, fmt.Errorf("query fts: %w", err) + } + hits := make([]graph.SymbolHit, 0, len(rows)) + for _, row := range rows { + if len(row) < 2 { + continue + } + id, _ := row[0].(string) + if id == "" { + continue + } + score, _ := row[1].(float64) + hits = append(hits, graph.SymbolHit{NodeID: id, Score: score}) + } + return hits, nil +} + +// SearchSymbolBundles is the rerank-shaped fast path: in one BM25 +// fan-out we return the matched node, its score, AND the in/out +// edges the rerank pipeline reads from. The engine routes through +// this method when the backend implements graph.SymbolBundleSearcher, +// pre-seeding rerank.Context's edge caches so the prepare pass skips +// its own batched fetch. +// +// Implementation cost: one FTS Cypher + three batched MATCH-by-ids +// Cypher calls (nodes, outEdges, inEdges). The three batched MATCH +// calls fan out across goroutines via the connection pool — each +// goroutine pulls its own pool Connection (cgo-safe; see connpool.go) +// so the post-FTS phase is bounded by max() of the three round-trips +// instead of their sum. Effective cgo round-trips: 1 FTS + 1 +// concurrent batch == 2 sequential phases. The prior search path was +// 1 FTS + 1 nodes-by-ids + 2 edge fetches inside the rerank prepare +// (also 4 cgo, but they live in separate timing phases so the cost +// compounds across the engine → rerank boundary). Probe (see +// bench/ladybug-bundle-probe): +// +// NewServer (30 hits) med=87.4ms +// handleStreamable (30 hits) med=89.5ms +// daemon controller (19 hits) med=67.8ms +// +// vs the single-shot combined-Cypher candidate (OPTIONAL MATCH + +// collect twice), which clocked 150-185ms median because Kuzu +// materialises a cross-product between the two collect frames. +// +// Idempotent on a fresh DB: lazy-builds the FTS index if it isn't +// present yet (matching SearchSymbols's behaviour) so a daemon +// process that came up before BuildSymbolIndex finished still serves +// search correctly. +func (s *Store) SearchSymbolBundles(query string, limit int) ([]graph.SymbolBundle, error) { + if query == "" { + return nil, nil + } + if limit <= 0 { + limit = 20 + } + // Tier 0: same flat-symbol-map fast path as SearchSymbols. The + // rerank pipeline asks for bundles (node + edges) when the + // backend supports it; we satisfy that contract with batched + // node/edge fetches but skip the FTS round-trip when the + // in-memory name index already knows the candidates. + if isIdentifierQuery(query) && s.nameIdx != nil { + s.nameIdx.bootstrap(s) + ids := s.nameIdx.lookup(query) + if len(ids) > 0 { + if len(ids) > limit { + ids = ids[:limit] + } + return s.bundlesForIDs(ids, 100.0), nil + } + } + tokens := search.Tokenize(query) + if len(tokens) == 0 { + tokens = search.TokenizeQuery(query) + if len(tokens) == 0 { + return nil, nil + } + } + q := strings.Join(tokens, " ") + + if !s.fts.indexBuilt.Load() { + if err := s.BuildSymbolIndex(); err != nil { + return nil, err + } + } + // Phase 1: FTS yields (id, score) ordered by score descending. Skip + // the round-trip when the query degenerates to no tokens (handled + // above) — leaving this on the hot path so an empty corpus + empty + // index returns cleanly. + const ftsCypher = ` +CALL QUERY_FTS_INDEX('SymbolFTS', '` + ftsIndexName + `', $q) +RETURN node.id AS id, score +ORDER BY score DESC +LIMIT $k` + ftsRows, err := querySelectSafe(s, ftsCypher, map[string]any{ + "q": q, + "k": int64(limit), + }) + if err != nil { + return nil, fmt.Errorf("query fts: %w", err) + } + if len(ftsRows) == 0 { + return nil, nil + } + + // Preserve FTS order — the BM25 score determines TextRank, which + // the rerank pipeline reads. Build a parallel id list and a + // score map keyed by id for the join step. + ids := make([]string, 0, len(ftsRows)) + scoreByID := make(map[string]float64, len(ftsRows)) + for _, row := range ftsRows { + if len(row) < 2 { + continue + } + id, _ := row[0].(string) + if id == "" { + continue + } + score, _ := row[1].(float64) + if _, dup := scoreByID[id]; dup { + // FTS returns each node once for a given query, but defend + // against future configurations that might not — first hit + // keeps the score / position. + continue + } + scoreByID[id] = score + ids = append(ids, id) + } + if len(ids) == 0 { + return nil, nil + } + + // Phases 2-4: batched node materialise + in/out edge fetch keyed + // on the same ids. The three calls have no data dependency between + // each other (they all read from `ids`) so we fan them out across + // three goroutines. Each call goes through executeOrQuery, which + // pulls its own pool connection — Ladybug's go binding panics on + // two goroutines sharing a single *lbug.Connection, so the pool + // fan-out is what makes this safe (see connpool.go). + // + // Effective wall-clock drops from sum(nodes,out,in) to max(nodes, + // out,in); on a typical bundle (~30 ids) that collapses three + // ~25-30 ms cgo round-trips into one ~30 ms phase. + var ( + nodes map[string]*graph.Node + out map[string][]*graph.Edge + in map[string][]*graph.Edge + wg sync.WaitGroup + ) + wg.Add(3) + go func() { + defer wg.Done() + nodes = s.GetNodesByIDs(ids) + }() + go func() { + defer wg.Done() + out = s.GetOutEdgesByNodeIDs(ids) + }() + go func() { + defer wg.Done() + in = s.GetInEdgesByNodeIDs(ids) + }() + wg.Wait() + + bundles := make([]graph.SymbolBundle, 0, len(ids)) + for _, id := range ids { + n := nodes[id] + if n == nil { + // FTS hit references a node that was evicted between the + // FTS call and the node fetch — skip; the caller does its + // own dedup / kind filter anyway. + continue + } + bundles = append(bundles, graph.SymbolBundle{ + Node: n, + Score: scoreByID[id], + OutEdges: out[id], + InEdges: in[id], + }) + } + return bundles, nil +} + +// bundlesForIDs materialises bundles for a known ID list — the +// tier-0 fast path returns this when the name index hits, so the +// SymbolBundleSearcher contract still delivers nodes + in/out edges +// without paying for an FTS round-trip. Three parallel batched +// fetches mirror SearchSymbolBundles' Phase-2 fan-out so the +// engine sees an identical bundle shape regardless of which tier +// served the query. +func (s *Store) bundlesForIDs(ids []string, score float64) []graph.SymbolBundle { + if len(ids) == 0 { + return nil + } + var ( + nodes map[string]*graph.Node + out map[string][]*graph.Edge + in map[string][]*graph.Edge + wg sync.WaitGroup + ) + wg.Add(3) + go func() { + defer wg.Done() + nodes = s.GetNodesByIDs(ids) + }() + go func() { + defer wg.Done() + out = s.GetOutEdgesByNodeIDs(ids) + }() + go func() { + defer wg.Done() + in = s.GetInEdgesByNodeIDs(ids) + }() + wg.Wait() + bundles := make([]graph.SymbolBundle, 0, len(ids)) + for _, id := range ids { + n := nodes[id] + if n == nil { + continue + } + bundles = append(bundles, graph.SymbolBundle{ + Node: n, + Score: score, + OutEdges: out[id], + InEdges: in[id], + }) + } + return bundles +} + +// runCypherSafe wraps the panicking runWriteLocked helper and +// returns any runtime / catalog error as a normal Go error so the +// FTS bootstrap can react to (and report) failures instead of +// taking down the process. +func runCypherSafe(s *Store, query string) (err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(error); ok { + err = e + return + } + err = fmt.Errorf("%v", r) + } + }() + s.runWriteLocked(query, nil) + return nil +} + +func runCypherWithArgs(s *Store, query string, args map[string]any) (err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(error); ok { + err = e + return + } + err = fmt.Errorf("%v", r) + } + }() + s.runWriteLocked(query, args) + return nil +} + +func querySelectSafe(s *Store, query string, args map[string]any) (rows [][]any, err error) { + defer func() { + if r := recover(); r != nil { + if e, ok := r.(error); ok { + err = e + return + } + err = fmt.Errorf("%v", r) + } + }() + rows = s.querySelectLocked(query, args) + return rows, nil +} diff --git a/internal/graph/store_ladybug/fts_multiterm_probe_test.go b/internal/graph/store_ladybug/fts_multiterm_probe_test.go new file mode 100644 index 00000000..862b325f --- /dev/null +++ b/internal/graph/store_ladybug/fts_multiterm_probe_test.go @@ -0,0 +1,376 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/search" +) + +// TestFTS_MultiRepoIsolation is the regression for the multi-repo +// clobber bug: per-repo Indexers share one Store, and a previous +// BulkUpsertSymbolFTS implementation wiped every row in SymbolFTS +// (MATCH (f:SymbolFTS) DELETE f) before COPY. The result was that +// only the last-committed repo's symbols survived in the FTS corpus +// and search_symbols was broken for every sibling. +// +// This test seeds two "repos" with disjoint IDs, calls +// BulkUpsertSymbolFTS twice in succession (once per prefix), then +// asserts that SearchSymbols still returns hits from BOTH repos. +func TestFTS_MultiRepoIsolation(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-multi-repo-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + repoA := "gortex" + repoB := "gortex-cloud" + + itemsA := []graph.SymbolFTSItem{ + {NodeID: repoA + "/internal/mcp/server.go::NewServer", Tokens: "new server internal mcp"}, + {NodeID: repoA + "/internal/indexer/indexer.go::IndexAll", Tokens: "index all internal indexer"}, + } + itemsB := []graph.SymbolFTSItem{ + {NodeID: repoB + "/api/billing.go::ChargeCustomer", Tokens: "charge customer api billing"}, + } + for _, it := range itemsA { + s.AddNode(&graph.Node{ID: it.NodeID, Kind: graph.KindFunction, RepoPrefix: repoA, FilePath: it.NodeID}) + } + for _, it := range itemsB { + s.AddNode(&graph.Node{ID: it.NodeID, Kind: graph.KindFunction, RepoPrefix: repoB, FilePath: it.NodeID}) + } + + // Commit repo A, then repo B — the live order: each repo's + // per-repo Indexer drains and calls BulkUpsertSymbolFTS as it + // finishes warming up. + if err := s.BulkUpsertSymbolFTS(repoA, itemsA); err != nil { + t.Fatalf("repo A bulk: %v", err) + } + if err := s.BulkUpsertSymbolFTS(repoB, itemsB); err != nil { + t.Fatalf("repo B bulk: %v", err) + } + if err := s.BuildSymbolIndex(); err != nil { + t.Fatalf("build: %v", err) + } + + // Repo A's symbol must still be searchable after repo B's + // commit — pre-fix this returned 0 hits. + hitsA, err := s.SearchSymbols("NewServer", 10) + if err != nil { + t.Fatalf("search A: %v", err) + } + if len(hitsA) == 0 { + t.Fatalf("repo A NewServer wiped by repo B commit — fix regressed") + } + t.Logf("repo A 'NewServer' → %d hits", len(hitsA)) + + hitsB, err := s.SearchSymbols("ChargeCustomer", 10) + if err != nil { + t.Fatalf("search B: %v", err) + } + if len(hitsB) == 0 { + t.Fatalf("repo B ChargeCustomer not searchable") + } + t.Logf("repo B 'ChargeCustomer' → %d hits", len(hitsB)) + + // A second pass on repo A (incremental re-commit) must wipe + // only repo A's rows, leaving repo B intact. + itemsAUpdated := []graph.SymbolFTSItem{ + // Original NewServer dropped; only IndexAll re-committed. + {NodeID: repoA + "/internal/indexer/indexer.go::IndexAll", Tokens: "index all internal indexer"}, + } + if err := s.BulkUpsertSymbolFTS(repoA, itemsAUpdated); err != nil { + t.Fatalf("repo A re-commit: %v", err) + } + // Force the FTS index to rebuild against the post-wipe corpus + // — the COPY path resets indexBuilt to force a rebuild on the + // next search, but a stale build sentinel from a parallel + // rebuild would skip it. + if err := s.BuildSymbolIndex(); err != nil { + t.Fatalf("rebuild index: %v", err) + } + hitsA2, err := s.SearchSymbols("NewServer", 10) + if err != nil { + t.Fatalf("search A2: %v", err) + } + if len(hitsA2) != 0 { + t.Fatalf("expected NewServer to be dropped after repo A re-commit, got %d hits", len(hitsA2)) + } + hitsB2, err := s.SearchSymbols("ChargeCustomer", 10) + if err != nil { + t.Fatalf("search B2: %v", err) + } + if len(hitsB2) == 0 { + t.Fatalf("repo B was wiped by repo A re-commit — selective wipe is leaking") + } + t.Logf("repo B preserved across repo A re-commit: %d hits", len(hitsB2)) +} + +// realisticTokens mirrors what indexer.ftsTokensFor would produce +// for a code symbol, without pulling in the indexer package: feed +// Name / FilePath / signature through search.Tokenize and join with +// spaces. +func realisticTokens(n *graph.Node) string { + fields := []string{n.Name, n.FilePath} + if n.QualName != "" { + fields = append(fields, n.QualName) + } + if sig, ok := n.Meta["signature"].(string); ok && sig != "" { + fields = append(fields, sig) + } + var out []string + for _, f := range fields { + out = append(out, search.Tokenize(f)...) + } + return strings.Join(out, " ") +} + +// TestFTS_MultiTermRecall probes whether QUERY_FTS_INDEX matches a +// multi-word query against documents whose tokens column contains the +// same words in any order. The production search path stores +// pre-tokenised tokens like "new server" and queries with the same +// joined-by-spaces form; user-visible bench shows the multi-term case +// returning empty while single-term "store" returns hits. +// +// The probe seeds three SymbolFTS rows mirroring real symbol shapes: +// - "new server" → matches "NewServer" +// - "index all" → matches "IndexAll" +// - "store" → matches "Store" +// +// Then queries with single-term and multi-term forms and logs what +// the engine returns. +func TestFTS_MultiTermRecall(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-multi-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + items := []graph.SymbolFTSItem{ + {NodeID: "pkg/mcp.go::NewServer", Tokens: "new server newserver mcp.newserver"}, + {NodeID: "pkg/indexer.go::IndexAll", Tokens: "index all indexall indexer.indexall"}, + {NodeID: "pkg/store.go::Store", Tokens: "store ladybug.store"}, + {NodeID: "pkg/proto.go::HandleStreamable", Tokens: "handle streamable handlestreamable mcp.handlestreamable"}, + } + // Stamp the Node rows too — QUERY_FTS_INDEX joins back to the + // base table via node.id, so unreferenced FTS rows return id=null + // and the production code drops them. + for _, it := range items { + s.AddNode(&graph.Node{ + ID: it.NodeID, + Kind: graph.KindFunction, + Name: it.NodeID, // doesn't matter for FTS — index is on SymbolFTS.tokens + FilePath: "pkg/x.go", + Language: "go", + }) + } + if err := s.BulkUpsertSymbolFTS("", items); err != nil { + t.Fatalf("bulk upsert: %v", err) + } + if err := s.BuildSymbolIndex(); err != nil { + t.Fatalf("build index: %v", err) + } + + probes := []struct { + name string + query string + }{ + {"single 'store'", "store"}, + {"single 'new'", "new"}, + {"single 'server'", "server"}, + {"multi 'new server'", "new server"}, + {"multi 'index all'", "index all"}, + {"multi 'handle streamable'", "handle streamable"}, + {"concat 'newserver'", "newserver"}, + {"concat 'indexall'", "indexall"}, + } + const q = `CALL QUERY_FTS_INDEX('SymbolFTS', '` + ftsIndexName + `', $q) RETURN node.id AS id, score ORDER BY score DESC LIMIT 10` + for _, p := range probes { + rows, err := querySelectSafe(s, q, map[string]any{"q": p.query}) + if err != nil { + t.Logf("FAIL %s (%q): err=%v", p.name, p.query, err) + continue + } + t.Logf("%s (%q) → %d rows", p.name, p.query, len(rows)) + for _, r := range rows { + t.Logf(" %v", r) + } + } + + // Also test with the conjunctive=false / top=10 option syntax + // that some Kuzu / Ladybug builds accept. + probes2 := []struct { + name string + query string + }{ + {"opts conjunctive=false 'new server'", "new server"}, + {"opts conjunctive=true 'new server'", "new server"}, + } + for _, p := range probes2 { + // Try the optional-arg-map syntax: CALL QUERY_FTS_INDEX(..., + // {conjunctive: false, top: 10}). + conjunctive := strings.Contains(p.name, "true") + qWithOpts := `CALL QUERY_FTS_INDEX('SymbolFTS', '` + ftsIndexName + `', $q, conjunctive:=$c) RETURN node.id AS id, score ORDER BY score DESC LIMIT 10` + rows, err := querySelectSafe(s, qWithOpts, map[string]any{ + "q": p.query, + "c": conjunctive, + }) + if err != nil { + t.Logf("FAIL %s (%q): err=%v", p.name, p.query, err) + continue + } + t.Logf("%s (%q) → %d rows", p.name, p.query, len(rows)) + for _, r := range rows { + t.Logf(" %v", r) + } + } +} + +// TestFTS_RealisticCorpus uses ftsTokensFor-equivalent input +// (Tokenize on Name/QualName/FilePath/signature, join with spaces) so +// the probe runs against tokens shaped exactly like what the live +// indexer writes. Then it calls Store.SearchSymbols — the same code +// path the engine's BM25 backend hits. If this returns hits for +// "NewServer" the bug is in a layer above SearchSymbols (engine +// post-filter, rerank, scope); if it returns empty the bug is in the +// FTS tokenization or query construction. +func TestFTS_RealisticCorpus(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-real-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + // A small but realistic corpus modelling several real gortex + // symbols. Each Node carries the fields ftsTokensFor reads: + // Name / QualName / FilePath / Meta["signature"]. + corpus := []*graph.Node{ + { + ID: "internal/mcp/server.go::NewServer", + Kind: graph.KindFunction, + Name: "NewServer", + QualName: "mcp.NewServer", + FilePath: "internal/mcp/server.go", + Language: "go", + Meta: map[string]any{"signature": "func NewServer(g graph.Store) *Server"}, + }, + { + ID: "internal/mcp/server.go::Server", + Kind: graph.KindType, + Name: "Server", + QualName: "mcp.Server", + FilePath: "internal/mcp/server.go", + Language: "go", + Meta: map[string]any{"signature": "type Server struct"}, + }, + { + ID: "internal/indexer/indexer.go::IndexAll", + Kind: graph.KindFunction, + Name: "IndexAll", + QualName: "indexer.IndexAll", + FilePath: "internal/indexer/indexer.go", + Language: "go", + Meta: map[string]any{"signature": "func IndexAll(ctx context.Context) error"}, + }, + { + ID: "internal/mcp/streamable.go::handleStreamable", + Kind: graph.KindFunction, + Name: "handleStreamable", + QualName: "mcp.handleStreamable", + FilePath: "internal/mcp/streamable.go", + Language: "go", + Meta: map[string]any{"signature": "func handleStreamable(w http.ResponseWriter, r *http.Request)"}, + }, + { + ID: "internal/graph/store_ladybug/store.go::Store", + Kind: graph.KindType, + Name: "Store", + QualName: "store_ladybug.Store", + FilePath: "internal/graph/store_ladybug/store.go", + Language: "go", + Meta: map[string]any{"signature": "type Store struct"}, + }, + { + ID: "internal/auth/token.go::ValidateToken", + Kind: graph.KindFunction, + Name: "ValidateToken", + QualName: "auth.ValidateToken", + FilePath: "internal/auth/token.go", + Language: "go", + Meta: map[string]any{"signature": "func ValidateToken(t string) error"}, + }, + } + items := make([]graph.SymbolFTSItem, 0, len(corpus)) + for _, n := range corpus { + s.AddNode(n) + tok := realisticTokens(n) + t.Logf("seed %-65s tokens=%q", n.ID, tok) + items = append(items, graph.SymbolFTSItem{NodeID: n.ID, Tokens: tok}) + } + if err := s.BulkUpsertSymbolFTS("", items); err != nil { + t.Fatalf("bulk: %v", err) + } + if err := s.BuildSymbolIndex(); err != nil { + t.Fatalf("build: %v", err) + } + + for _, q := range []string{ + "NewServer", + "IndexAll", + "handleStreamable", + "ValidateToken", + "Store", + "server", + "index all", + "new server", + "validate token", + } { + hits, err := s.SearchSymbols(q, 20) + if err != nil { + t.Logf("FAIL %q: %v", q, err) + continue + } + t.Logf("SearchSymbols(%q) → %d hits", q, len(hits)) + for _, h := range hits { + t.Logf(" %s score=%.4f", h.NodeID, h.Score) + } + } + + // Verify STARTS WITH works for selective wipes: this is the + // primitive the multi-repo BulkUpsertSymbolFTS fix relies on. + rows, err := querySelectSafe(s, `MATCH (f:SymbolFTS) WHERE f.id STARTS WITH $p RETURN f.id`, map[string]any{ + "p": "internal/mcp/", + }) + if err != nil { + t.Logf("STARTS WITH probe err: %v", err) + } else { + t.Logf("STARTS WITH 'internal/mcp/' → %d rows", len(rows)) + for _, r := range rows { + t.Logf(" %v", r) + } + } +} diff --git a/internal/graph/store_ladybug/fts_probe_test.go b/internal/graph/store_ladybug/fts_probe_test.go new file mode 100644 index 00000000..6ca41383 --- /dev/null +++ b/internal/graph/store_ladybug/fts_probe_test.go @@ -0,0 +1,148 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/zzet/gortex/internal/graph" +) + +// TestFTS_Probe is a one-shot capability probe: does the bundled +// liblbug actually expose the CALL CREATE_FTS_INDEX / +// CALL QUERY_FTS_INDEX surface? If it does, the production FTS +// integration is unblocked; if not, we need a different +// installation strategy or a fallback. +// +// Sequence: +// 1. seed three Node rows (search target, near miss, far miss) +// 2. try CALL CREATE_FTS_INDEX directly; on extension-not-loaded, +// fall back to INSTALL fts + LOAD EXTENSION fts + retry +// 3. CALL QUERY_FTS_INDEX with a query that should rank the +// two related rows above the unrelated one +// +// The test logs results rather than asserting strict ordering so a +// schema or scoring tweak doesn't fail the probe — what matters is +// "the surface exists and returns rows". +func TestFTS_Probe(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-probe-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + for _, n := range []*graph.Node{ + {ID: "pkg/auth.go::ValidateToken", Kind: graph.KindFunction, Name: "ValidateToken", QualName: "auth.ValidateToken", FilePath: "pkg/auth.go", Language: "go"}, + {ID: "pkg/auth.go::ValidateSession", Kind: graph.KindFunction, Name: "ValidateSession", QualName: "auth.ValidateSession", FilePath: "pkg/auth.go", Language: "go"}, + {ID: "pkg/format.go::PrettyPrint", Kind: graph.KindFunction, Name: "PrettyPrint", QualName: "format.PrettyPrint", FilePath: "pkg/format.go", Language: "go"}, + } { + s.AddNode(n) + } + t.Logf("seeded %d nodes", s.NodeCount()) + + // Step 1: try CREATE_FTS_INDEX directly. + createErr := tryRunCypher(s, `CALL CREATE_FTS_INDEX('Node', 'idx_name_fts', ['name', 'qual_name'])`) + if createErr != nil { + t.Logf("direct CREATE_FTS_INDEX failed: %v — falling through to INSTALL/LOAD", createErr) + + // Step 2: install + load + retry. Ladybug inherits Kuzu's + // extension-loading semantics; FTS may need to be explicitly + // loaded even though the symbols are compiled in. + if err := tryRunCypher(s, `INSTALL fts`); err != nil { + t.Logf("INSTALL fts: %v", err) + } + if err := tryRunCypher(s, `LOAD EXTENSION fts`); err != nil { + t.Logf("LOAD EXTENSION fts: %v", err) + } + if err := tryRunCypher(s, `CALL CREATE_FTS_INDEX('Node', 'idx_name_fts', ['name', 'qual_name'])`); err != nil { + t.Fatalf("CREATE_FTS_INDEX retry failed: %v", err) + } + } + t.Log("FTS index created") + + // Capability check: does the index auto-update on a node added + // AFTER index creation? Critical for incremental indexing. + s.AddNode(&graph.Node{ID: "pkg/late.go::LateAdded", Kind: graph.KindFunction, Name: "lateadded", QualName: "late.lateadded", FilePath: "pkg/late.go", Language: "go"}) + postRows, postErr := tryQueryCypher(s, `CALL QUERY_FTS_INDEX('Node', 'idx_name_fts', 'lateadded') RETURN node.id AS id ORDER BY score DESC LIMIT 5`, nil) + t.Logf("after post-create AddNode, query 'lateadded' → %d rows (err=%v): %v", len(postRows), postErr, postRows) + + // Step 3: query. The binder expects exactly three STRING args + // (table, index, query) — no limit parameter; truncate with + // LIMIT N at the Cypher level instead. + // + // Try several query shapes to learn how Ladybug's FTS tokenises: + for _, probe := range []string{ + "validate token", // two-word natural query + "validatetoken", // single concat (default tokeniser may have lower-cased CamelCase as one token) + "ValidateToken", // case-preserved + "validate", // single word + "auth", // qualifier token + "PrettyPrint", // far-miss target as control + } { + rows, qerr := tryQueryCypher(s, `CALL QUERY_FTS_INDEX('Node', 'idx_name_fts', $q) RETURN node.id AS id, score ORDER BY score DESC LIMIT 10`, map[string]any{ + "q": probe, + }) + if qerr != nil { + t.Logf("query %q: error: %v", probe, qerr) + continue + } + t.Logf("query %q → %d rows", probe, len(rows)) + for _, r := range rows { + t.Logf(" %v", r) + } + } +} + +// tryRunCypher invokes runWriteLocked and captures any panic / +// runtime error the binding raises so the probe can react to +// "extension not loaded" without aborting. +func tryRunCypher(s *Store, q string) (err error) { + defer func() { + if r := recover(); r != nil { + err = recoverErr(r) + } + }() + s.runWriteLocked(q, nil) + return nil +} + +func tryQueryCypher(s *Store, q string, args map[string]any) (rows [][]any, err error) { + defer func() { + if r := recover(); r != nil { + err = recoverErr(r) + } + }() + rows = s.querySelect(q, args) + return rows, nil +} + +func recoverErr(r any) error { + if e, ok := r.(error); ok { + return e + } + return &probeErr{msg: strings.TrimSpace(toString(r))} +} + +type probeErr struct{ msg string } + +func (e *probeErr) Error() string { return e.msg } + +func toString(v any) string { + switch t := v.(type) { + case string: + return t + case error: + return t.Error() + default: + return "" + } +} diff --git a/internal/graph/store_ladybug/fts_recopy_test.go b/internal/graph/store_ladybug/fts_recopy_test.go new file mode 100644 index 00000000..ba0c8289 --- /dev/null +++ b/internal/graph/store_ladybug/fts_recopy_test.go @@ -0,0 +1,59 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// TestSymbolFTS_RepeatedPerRepoBulkIsDeterministic exercises the multi-repo +// per-repo re-bulk path of BulkUpsertSymbolFTS: a repo's rows are DELETEd and +// re-COPYed while sibling repos' rows stay in the table, so the COPY targets a +// NON-EMPTY SymbolFTS by design. Pre-fix this hit the same non-deterministic +// "COPY into a non-empty primary-key node table without a hash index is not +// supported" as the SymbolVec path. DROP TABLE is not an option here — it would +// wipe the sibling repos — so the fix must make the non-empty COPY robust. +func TestSymbolFTS_RepeatedPerRepoBulkIsDeterministic(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-recopy-") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + // Cold start: repo alpha into an empty table. + require.NoError(t, s.BulkUpsertSymbolFTS("alpha", []graph.SymbolFTSItem{ + {NodeID: "alpha/a.go::Alpha", Tokens: "alpha apple"}, + })) + require.NoError(t, s.BuildSymbolIndex()) + + // repo beta: alpha's rows remain, so this COPYs into a non-empty table. + require.NoError(t, s.BulkUpsertSymbolFTS("beta", []graph.SymbolFTSItem{ + {NodeID: "beta/b.go::Beta", Tokens: "beta banana"}, + })) + require.NoError(t, s.BuildSymbolIndex()) + + // Re-bulk alpha repeatedly: each call deletes only alpha's rows and COPYs + // them back while beta stays in the table (a non-empty COPY every time). + for i := 0; i < 30; i++ { + require.NoErrorf(t, s.BulkUpsertSymbolFTS("alpha", []graph.SymbolFTSItem{ + {NodeID: "alpha/a.go::Alpha", Tokens: "alpha apple"}, + }), "per-repo re-bulk iteration %d hit the COPY-into-non-empty rejection", i) + require.NoErrorf(t, s.BuildSymbolIndex(), "BuildSymbolIndex iteration %d", i) + } + + // Both repos must still be searchable: per-repo re-bulk must not wipe the + // sibling, and alpha must have been re-added. + beta, err := s.SearchSymbols("banana", 10) + require.NoError(t, err) + require.NotEmpty(t, beta, "sibling repo beta must survive alpha's per-repo re-bulk") + alpha, err := s.SearchSymbols("apple", 10) + require.NoError(t, err) + require.NotEmpty(t, alpha, "alpha must be searchable after re-bulk") +} diff --git a/internal/graph/store_ladybug/fts_test.go b/internal/graph/store_ladybug/fts_test.go new file mode 100644 index 00000000..2ab4b179 --- /dev/null +++ b/internal/graph/store_ladybug/fts_test.go @@ -0,0 +1,229 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/search" +) + +// TestSymbolSearcher_EndToEnd is the conformance check for the +// Ladybug FTS path. Seeds three "symbols" via UpsertSymbolFTS with +// pre-tokenised text, builds the index, then exercises queries that +// the existing BM25 backend recall contract requires to work: +// +// - exact identifier ("ValidateToken" tokenises to "validate token") +// - mid-word camelCase ("validate" / "token" alone) +// - qualifier hop ("auth") +// - control case ("PrettyPrint" / "pretty") +// +// The probe in fts_probe_test.go proved the raw CALL surface works +// but couldn't camelCase-split — the tokenizer bridge here is what +// closes that recall gap. +func TestSymbolSearcher_EndToEnd(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-e2e-*") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + // Pre-tokenise the symbol names exactly as the indexer will at + // production time — search.Tokenize handles camelCase and + // snake_case + path separators. + upsert := func(id, raw string) { + toks := search.Tokenize(raw) + joined := "" + for i, t := range toks { + if i > 0 { + joined += " " + } + joined += t + } + require.NoError(t, s.UpsertSymbolFTS(id, joined)) + } + upsert("pkg/auth.go::ValidateToken", "ValidateToken auth.ValidateToken") + upsert("pkg/auth.go::ValidateSession", "ValidateSession auth.ValidateSession") + upsert("pkg/format.go::PrettyPrint", "PrettyPrint format.PrettyPrint") + + require.NoError(t, s.BuildSymbolIndex()) + + cases := []struct { + name string + query string + wantTopID string + minHits int + }{ + {"exact identifier", "ValidateToken", "pkg/auth.go::ValidateToken", 1}, + {"camelCase head", "validate", "", 2}, + {"camelCase tail", "token", "pkg/auth.go::ValidateToken", 1}, + {"two-word query", "validate token", "pkg/auth.go::ValidateToken", 1}, + {"qualifier", "auth", "", 2}, + {"control", "pretty", "pkg/format.go::PrettyPrint", 1}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + hits, err := s.SearchSymbols(c.query, 10) + require.NoError(t, err) + t.Logf("query %q → %d hits: %v", c.query, len(hits), hits) + assert.GreaterOrEqual(t, len(hits), c.minHits, + "query %q must return at least %d hits", c.query, c.minHits) + if c.wantTopID != "" && len(hits) > 0 { + assert.Equal(t, c.wantTopID, hits[0].NodeID, + "top hit for %q must be %s", c.query, c.wantTopID) + } + }) + } +} + +// TestSymbolSearcher_AutoUpdate verifies the FTS index reflects +// rows added after CREATE_FTS_INDEX. Critical for incremental +// reindexing — a file change re-triggers UpsertSymbolFTS and the +// new row must be findable without re-running BuildSymbolIndex. +func TestSymbolSearcher_AutoUpdate(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-auto-*") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + require.NoError(t, s.UpsertSymbolFTS("pkg/a.go::Original", "original a.original")) + require.NoError(t, s.BuildSymbolIndex()) + + // First query — only the original row exists. + hits, err := s.SearchSymbols("original", 10) + require.NoError(t, err) + require.Len(t, hits, 1) + + // Upsert a new row AFTER index creation. + require.NoError(t, s.UpsertSymbolFTS("pkg/b.go::PostAdd", "post add b.postadd")) + hits, err = s.SearchSymbols("postadd", 10) + require.NoError(t, err) + assert.GreaterOrEqual(t, len(hits), 1, + "post-create insert must be findable without rebuilding the index") +} + +// TestSymbolSearcher_IdempotentUpsert verifies that replacing a row's +// text via a second UpsertSymbolFTS call updates the FTS hit in +// place instead of producing a duplicate. Matches the indexer's +// re-parse contract. +func TestSymbolSearcher_IdempotentUpsert(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-idem-*") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + id := "pkg/foo.go::Method" + require.NoError(t, s.UpsertSymbolFTS(id, "originalname")) + require.NoError(t, s.BuildSymbolIndex()) + require.NoError(t, s.UpsertSymbolFTS(id, "renamedmethod")) + + // Old name should miss; new name should hit. Only one row total. + missHits, err := s.SearchSymbols("originalname", 10) + require.NoError(t, err) + for _, h := range missHits { + assert.NotEqual(t, id, h.NodeID, "old text must no longer match after upsert replacement") + } + freshHits, err := s.SearchSymbols("renamedmethod", 10) + require.NoError(t, err) + require.NotEmpty(t, freshHits) + assert.Equal(t, id, freshHits[0].NodeID) +} + +// TestSearchSymbolBundles_ParallelFetchEquivalence is the correctness +// guard for the post-FTS parallelisation: the three batched MATCH +// calls (nodes / out edges / in edges) now run on three goroutines +// against three pool connections. The output must be byte-for-byte +// identical to the sequential composition — same hits in the same +// FTS-ranked order, each carrying the same node payload and the same +// in/out edge slices. This is the contract callers (the engine's +// bundle-seeding gather path) rely on. +func TestSearchSymbolBundles_ParallelFetchEquivalence(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-fts-bundle-parallel-*") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + // Seed a small graph with edges so the in/out edge phase of the + // bundle returns non-empty payloads — the equivalence assertion + // matters only when there's actually something to compare. The + // FTS column stores pre-tokenised text (the indexer does this in + // production via search.Tokenize); without splitting, a query for + // "token" would not hit "ValidateToken". + upsertTokenised := func(id, raw string) { + toks := search.Tokenize(raw) + require.NoError(t, s.UpsertSymbolFTS(id, strings.Join(toks, " "))) + } + nodeSpecs := []struct { + id, name, path string + }{ + {"pkg/auth.go::ValidateToken", "ValidateToken", "pkg/auth.go"}, + {"pkg/auth.go::ParseToken", "ParseToken", "pkg/auth.go"}, + {"pkg/auth.go::AuthMiddleware", "AuthMiddleware", "pkg/auth.go"}, + {"pkg/server.go::HandleRequest", "HandleRequest", "pkg/server.go"}, + } + for i, spec := range nodeSpecs { + s.AddNode(&graph.Node{ + ID: spec.id, Kind: graph.KindFunction, Name: spec.name, + FilePath: spec.path, StartLine: i + 1, EndLine: i + 5, Language: "go", + }) + upsertTokenised(spec.id, spec.name) + } + // Edges: HandleRequest -> AuthMiddleware -> ValidateToken -> ParseToken + s.AddEdge(&graph.Edge{ + From: "pkg/server.go::HandleRequest", To: "pkg/auth.go::AuthMiddleware", + Kind: graph.EdgeCalls, + }) + s.AddEdge(&graph.Edge{ + From: "pkg/auth.go::AuthMiddleware", To: "pkg/auth.go::ValidateToken", + Kind: graph.EdgeCalls, + }) + s.AddEdge(&graph.Edge{ + From: "pkg/auth.go::ValidateToken", To: "pkg/auth.go::ParseToken", + Kind: graph.EdgeCalls, + }) + require.NoError(t, s.BuildSymbolIndex()) + + bundles, err := s.SearchSymbolBundles("token", 10) + require.NoError(t, err) + require.NotEmpty(t, bundles, "FTS must surface 'token' hits") + + // Reconstruct the same join sequentially via the public API so the + // assertion compares against the post-parallel result. + ids := make([]string, 0, len(bundles)) + for _, b := range bundles { + require.NotNil(t, b.Node, "bundle node must not be nil") + ids = append(ids, b.Node.ID) + } + seqNodes := s.GetNodesByIDs(ids) + seqOut := s.GetOutEdgesByNodeIDs(ids) + seqIn := s.GetInEdgesByNodeIDs(ids) + + for i, b := range bundles { + seqNode := seqNodes[b.Node.ID] + require.NotNil(t, seqNode, "sequential GetNodesByIDs lost id %q", b.Node.ID) + assert.Equal(t, seqNode.ID, b.Node.ID, "bundle[%d] node id drift", i) + assert.Equal(t, seqNode.Name, b.Node.Name, "bundle[%d] node name drift", i) + assert.Equal(t, len(seqOut[b.Node.ID]), len(b.OutEdges), + "bundle[%d] out-edge count drift for %q", i, b.Node.ID) + assert.Equal(t, len(seqIn[b.Node.ID]), len(b.InEdges), + "bundle[%d] in-edge count drift for %q", i, b.Node.ID) + } +} diff --git a/internal/graph/store_ladybug/fts_timing_test.go b/internal/graph/store_ladybug/fts_timing_test.go new file mode 100644 index 00000000..574e2b28 --- /dev/null +++ b/internal/graph/store_ladybug/fts_timing_test.go @@ -0,0 +1,99 @@ +//go:build ladybug + +package store_ladybug + +import ( + "fmt" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +func benchFTSItems(repo string, n int) []graph.SymbolFTSItem { + items := make([]graph.SymbolFTSItem, n) + for i := range items { + items[i] = graph.SymbolFTSItem{ + NodeID: fmt.Sprintf("%s/pkg/f%06d.go::Symbol%06d", repo, i, i), + Tokens: fmt.Sprintf("symbol%06d handle request parse token alpha beta gamma", i), + } + } + return items +} + +// TestFTSBulkStrategyTiming compares three ways to land a repo's FTS corpus +// into SymbolFTS at a realistic row count: +// +// A direct COPY into an EMPTY table (the old fast path / baseline) +// B staging table: COPY into temp + MERGE (the committed fix) +// C LOAD FROM '' MERGE (single-query, no temp table) +// +// B and C run into a NON-EMPTY SymbolFTS (a sibling repo seeded first) — the +// per-repo multi-repo scenario that direct COPY (A) cannot serve. Run with: +// +// go test -tags ladybug -run TestFTSBulkStrategyTiming -v ./internal/graph/store_ladybug/ +func TestFTSBulkStrategyTiming(t *testing.T) { + if testing.Short() { + t.Skip("timing") + } + const n = 20000 + target := benchFTSItems("target", n) + + // fresh store with the target CSV written; optionally seed a sibling repo + // so the measured load targets a non-empty SymbolFTS. + setup := func(seedSibling bool) (*Store, string) { + dir := t.TempDir() + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + if seedSibling { + require.NoError(t, s.BulkUpsertSymbolFTS("sibling", benchFTSItems("sibling", n))) + } + csv := filepath.Join(dir, "target.csv") + require.NoError(t, writeSymbolFTSTSV(csv, target)) + return s, csv + } + lit := func(p string) string { return escapeCypherStringLit(p) } + + // A — direct COPY into an empty table (baseline). + func() { + s, csv := setup(false) + defer func() { _ = s.Close() }() + s.writeMu.Lock() + defer s.writeMu.Unlock() + start := time.Now() + require.NoError(t, runCypherSafe(s, fmt.Sprintf("COPY SymbolFTS FROM '%s' (HEADER=false, DELIM='\\t')", lit(csv)))) + t.Logf("A direct COPY (empty) : %8s for %d rows", time.Since(start).Round(time.Millisecond), n) + }() + + // B — staging COPY + MERGE into a non-empty table (the committed fix). + func() { + s, csv := setup(true) + defer func() { _ = s.Close() }() + s.writeMu.Lock() + defer s.writeMu.Unlock() + _ = runCypherSafe(s, fmt.Sprintf(`CALL DROP_FTS_INDEX('SymbolFTS', '%s')`, ftsIndexName)) + start := time.Now() + _ = runCypherSafe(s, `DROP TABLE IF EXISTS SymbolFTSStage`) + require.NoError(t, runCypherSafe(s, `CREATE NODE TABLE SymbolFTSStage(id STRING, tokens STRING, PRIMARY KEY(id))`)) + require.NoError(t, runCypherSafe(s, fmt.Sprintf("COPY SymbolFTSStage FROM '%s' (HEADER=false, DELIM='\\t')", lit(csv)))) + require.NoError(t, runCypherSafe(s, `MATCH (st:SymbolFTSStage) MERGE (f:SymbolFTS {id: st.id}) SET f.tokens = st.tokens`)) + _ = runCypherSafe(s, `DROP TABLE IF EXISTS SymbolFTSStage`) + t.Logf("B staging COPY+MERGE (n-e) : %8s for %d rows", time.Since(start).Round(time.Millisecond), n) + }() + + // C — LOAD FROM '' MERGE into a non-empty table (single query). + func() { + s, csv := setup(true) + defer func() { _ = s.Close() }() + s.writeMu.Lock() + defer s.writeMu.Unlock() + _ = runCypherSafe(s, fmt.Sprintf(`CALL DROP_FTS_INDEX('SymbolFTS', '%s')`, ftsIndexName)) + start := time.Now() + q := fmt.Sprintf("LOAD FROM '%s' (header=false, delim='\\t') MERGE (f:SymbolFTS {id: column0}) SET f.tokens = column1", lit(csv)) + require.NoError(t, runCypherSafe(s, q), "LOAD FROM ... MERGE") + t.Logf("C LOAD FROM MERGE (n-e) : %8s for %d rows", time.Since(start).Round(time.Millisecond), n) + }() +} diff --git a/internal/graph/store_ladybug/inedge_probe_test.go b/internal/graph/store_ladybug/inedge_probe_test.go new file mode 100644 index 00000000..a47bca24 --- /dev/null +++ b/internal/graph/store_ladybug/inedge_probe_test.go @@ -0,0 +1,108 @@ +package store_ladybug_test + +import ( + "path/filepath" + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_ladybug" +) + +// buildFanInStore seeds a fan-in graph (a, b, c → z) so the inbound +// traversal paths have something to find. +func buildFanInStore(t *testing.T) *store_ladybug.Store { + t.Helper() + dir := t.TempDir() + s, err := store_ladybug.Open(filepath.Join(dir, "test.kuzu")) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + for _, id := range []string{"a", "b", "c", "z"} { + s.AddNode(&graph.Node{ + ID: id, + Name: id, + Kind: graph.KindFunction, + FilePath: id + ".go", + }) + } + for i, from := range []string{"a", "b", "c"} { + s.AddEdge(&graph.Edge{ + From: from, + To: "z", + Kind: graph.EdgeCalls, + FilePath: from + ".go", + Line: i + 1, + }) + } + return s +} + +// TestLadybugGetInEdges_InlinePropMatchesWhereClause probes a Cypher +// planner shape: inbound-edge lookup written as inline property +// match `(b:Node {id: $id})` on the arrow target vs. an outer +// `WHERE b.id = $id` clause. The two forms should be observationally +// identical; if they diverge on Ladybug the inbound path +// (find_usages / get_callers / analyze cycles / suggest_pattern) +// silently drops rows. +func TestLadybugGetInEdges_InlinePropMatchesWhereClause(t *testing.T) { + s := buildFanInStore(t) + in := s.GetInEdges("z") + if got := len(in); got != 3 { + t.Fatalf("GetInEdges(z) returned %d edges, want 3", got) + } + for _, e := range in { + if e.To != "z" { + t.Fatalf("GetInEdges(z) yielded edge with To=%q, want %q", e.To, "z") + } + } +} + +// TestLadybugInDegreePushdowns probes the two reverse-direction Cypher +// pushdowns: the `COUNT { MATCH (:Node)-[:Edge]->(n) }` sub-query used +// by InDegreeForNodes / NodeDegreeByKinds, and the IN-list inbound +// match used by GetInEdgesByNodeIDs. Both feed the same hub-detection +// + degree-counting code paths the find_usages / get_callers / +// cycles / suggest_pattern analyzers rely on. +func TestLadybugInDegreePushdowns(t *testing.T) { + s := buildFanInStore(t) + + t.Run("GetInEdgesByNodeIDs", func(t *testing.T) { + got := s.GetInEdgesByNodeIDs([]string{"z"}) + if len(got["z"]) != 3 { + t.Fatalf("GetInEdgesByNodeIDs(z) = %d edges, want 3", len(got["z"])) + } + }) + + t.Run("InDegreeForNodes", func(t *testing.T) { + got := s.InDegreeForNodes([]string{"z"}) + if c := got["z"]; c != 3 { + t.Fatalf("InDegreeForNodes(z) = %d, want 3 (full map: %+v)", c, got) + } + }) + + t.Run("NodeDegreeByKinds", func(t *testing.T) { + rows := s.NodeDegreeByKinds([]graph.NodeKind{graph.KindFunction}, "") + var zRow *graph.NodeDegreeRow + for i := range rows { + if rows[i].NodeID == "z" { + zRow = &rows[i] + break + } + } + if zRow == nil { + t.Fatalf("NodeDegreeByKinds did not return row for z; got %+v", rows) + } + if zRow.InCount != 3 { + t.Fatalf("NodeDegreeByKinds(z).InCount = %d, want 3", zRow.InCount) + } + }) + + t.Run("InEdgeCountsByKind", func(t *testing.T) { + got := s.InEdgeCountsByKind([]graph.EdgeKind{graph.EdgeCalls}) + if c := got["z"]; c != 3 { + t.Fatalf("InEdgeCountsByKind[calls][z] = %d, want 3 (full: %+v)", c, got) + } + }) +} diff --git a/internal/graph/store_ladybug/malloc_trim.go b/internal/graph/store_ladybug/malloc_trim.go new file mode 100644 index 00000000..a2e8e113 --- /dev/null +++ b/internal/graph/store_ladybug/malloc_trim.go @@ -0,0 +1,12 @@ +package store_ladybug + +// mallocTrimRowThreshold guards every mallocTrim caller — the trim +// itself takes a low-millisecond hop into C and a kernel +// madvise(MADV_FREE) per zone, so per-call overhead matters. The +// threshold should fire on the drains / queries that actually move +// the allocator's high-water mark, not on the rapid-fire low-row +// queries the daemon's steady state runs. Picked from observation: +// at 50k rows a single capability call materialises hundreds of +// kilobytes of C strings worth releasing; below that the released +// pages aren't a measurable share of physical_footprint. +const mallocTrimRowThreshold = 50000 diff --git a/internal/graph/store_ladybug/malloc_trim_darwin.go b/internal/graph/store_ladybug/malloc_trim_darwin.go new file mode 100644 index 00000000..5a69bdd3 --- /dev/null +++ b/internal/graph/store_ladybug/malloc_trim_darwin.go @@ -0,0 +1,23 @@ +//go:build darwin + +// Package store_ladybug exposes mallocTrim as a thin cgo shim over +// the platform's "return retained pages to the OS" entry point. +// Ladybug's native allocator keeps freed pages for fast reuse; on +// long-lived daemons the retained set grows monotonically and shows +// up as climbing physical_footprint even while RSS stays low. The +// shim is called from the high-volume query and drain paths after a +// large operation completes so the allocator's high-water mark +// settles back down. +package store_ladybug + +// #include +import "C" + +// mallocTrim asks the system allocator to return retained pages to +// the OS. On Darwin the call routes to malloc_zone_pressure_relief +// on the default malloc zone. The "goal" argument of 0 means "free +// as much as you can"; the return value (bytes released) is ignored +// because the caller has nothing useful to do with it. +func mallocTrim() { + C.malloc_zone_pressure_relief(C.malloc_default_zone(), 0) +} diff --git a/internal/graph/store_ladybug/malloc_trim_linux.go b/internal/graph/store_ladybug/malloc_trim_linux.go new file mode 100644 index 00000000..b7dd56e1 --- /dev/null +++ b/internal/graph/store_ladybug/malloc_trim_linux.go @@ -0,0 +1,21 @@ +//go:build linux + +// Package store_ladybug exposes mallocTrim as a thin cgo shim over +// the platform's "return retained pages to the OS" entry point. +// Ladybug's native allocator keeps freed pages for fast reuse; on +// long-lived daemons the retained set grows monotonically and shows +// up as climbing physical_footprint even while RSS stays low. The +// shim is called from the high-volume query and drain paths after a +// large operation completes so the allocator's high-water mark +// settles back down. +package store_ladybug + +// #include +import "C" + +// mallocTrim asks glibc to release free heap pages back to the OS. +// pad of 0 means "no top padding"; the return value is whether any +// memory was actually released and is ignored. +func mallocTrim() { + C.malloc_trim(0) +} diff --git a/internal/graph/store_ladybug/malloc_trim_other.go b/internal/graph/store_ladybug/malloc_trim_other.go new file mode 100644 index 00000000..2806968e --- /dev/null +++ b/internal/graph/store_ladybug/malloc_trim_other.go @@ -0,0 +1,18 @@ +//go:build !darwin && !linux + +// Package store_ladybug exposes mallocTrim as a thin cgo shim over +// the platform's "return retained pages to the OS" entry point. +// Ladybug's native allocator keeps freed pages for fast reuse; on +// long-lived daemons the retained set grows monotonically and shows +// up as climbing physical_footprint even while RSS stays low. The +// shim is called from the high-volume query and drain paths after a +// large operation completes so the allocator's high-water mark +// settles back down. +package store_ladybug + +// mallocTrim is a no-op on platforms without a documented "return +// retained pages" entry point. Windows reclaims via the heap +// manager's own background trimming and *BSDs use jemalloc tweakable +// through MALLOC_OPTIONS rather than a C entry point — both leave +// the caller no actionable hook. +func mallocTrim() {} diff --git a/internal/graph/store_ladybug/method_call_resolve_probe_test.go b/internal/graph/store_ladybug/method_call_resolve_probe_test.go new file mode 100644 index 00000000..b0330ed7 --- /dev/null +++ b/internal/graph/store_ladybug/method_call_resolve_probe_test.go @@ -0,0 +1,69 @@ +package store_ladybug_test + +import ( + "path/filepath" + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_ladybug" +) + +// TestResolveMethodCalls_UniqueBinds verifies that a receiver-method +// call stub (`unresolved::*.querySelect`) is bound to the concrete +// method node when exactly one method in the repo carries that name, +// and is LEFT unresolved when the name is ambiguous (defined on >1 +// type) — the no-false-edge guarantee. +func TestResolveMethodCalls_UniqueBinds(t *testing.T) { + dir := t.TempDir() + s, err := store_ladybug.Open(filepath.Join(dir, "test.kuzu")) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + // Caller method + the unique target method, same repo. Method nodes + // store the BARE method name in `name` (the receiver lives in + // meta.receiver / enclosing) — mirror that exactly, since the + // qualified-name assumption is what masked the original bug. + s.AddNode(&graph.Node{ID: "pkg/a.go::Store.GetNode", Name: "GetNode", Kind: graph.KindMethod, FilePath: "pkg/a.go", RepoPrefix: "gortex", Meta: map[string]any{"receiver": "Store"}}) + s.AddNode(&graph.Node{ID: "pkg/b.go::Store.querySelect", Name: "querySelect", Kind: graph.KindMethod, FilePath: "pkg/b.go", RepoPrefix: "gortex", Meta: map[string]any{"receiver": "Store"}}) + // Ambiguous: two types both define Close — must stay unresolved. + s.AddNode(&graph.Node{ID: "pkg/b.go::Store.Close", Name: "Close", Kind: graph.KindMethod, FilePath: "pkg/b.go", RepoPrefix: "gortex", Meta: map[string]any{"receiver": "Store"}}) + s.AddNode(&graph.Node{ID: "pkg/c.go::Conn.Close", Name: "Close", Kind: graph.KindMethod, FilePath: "pkg/c.go", RepoPrefix: "gortex", Meta: map[string]any{"receiver": "Conn"}}) + + // Method-call edges in the pre-resolve stub form (the COPY rewrite + // prefixes the repo; emulate the prefixed form the daemon sees). + s.AddEdge(&graph.Edge{From: "pkg/a.go::Store.GetNode", To: "gortex::unresolved::*.querySelect", Kind: graph.EdgeCalls, FilePath: "pkg/a.go", Line: 5}) + s.AddEdge(&graph.Edge{From: "pkg/a.go::Store.GetNode", To: "gortex::unresolved::*.Close", Kind: graph.EdgeCalls, FilePath: "pkg/a.go", Line: 6}) + + // Stamp kind/name on the stubs (the chain runs this first), then + // the method-call rule. + if _, err := s.ResolveAllBulk(); err != nil { + t.Fatalf("ResolveAllBulk: %v", err) + } + + // querySelect is unique → the edge must now point at the method. + out := s.GetOutEdges("pkg/a.go::Store.GetNode") + var boundQuerySelect, leftClose bool + for _, e := range out { + if e.To == "pkg/b.go::Store.querySelect" && e.Kind == graph.EdgeCalls { + boundQuerySelect = true + } + // Close is ambiguous (Store.Close + Conn.Close) → stub stays. + if graph.IsUnresolvedTarget(e.To) && graph.UnresolvedName(e.To) == "*.Close" { + leftClose = true + } + } + if !boundQuerySelect { + t.Fatalf("expected *.querySelect bound to pkg/b.go::Store.querySelect; out edges = %+v", out) + } + if !leftClose { + t.Fatalf("expected ambiguous *.Close to stay unresolved (no false edge); out edges = %+v", out) + } + + // find_usages-shaped check: the method now has an incoming caller. + in := s.GetInEdges("pkg/b.go::Store.querySelect") + if len(in) != 1 || in[0].From != "pkg/a.go::Store.GetNode" { + t.Fatalf("expected Store.querySelect to have 1 caller; in edges = %+v", in) + } +} diff --git a/internal/graph/store_ladybug/migrate.go b/internal/graph/store_ladybug/migrate.go new file mode 100644 index 00000000..ec716a75 --- /dev/null +++ b/internal/graph/store_ladybug/migrate.go @@ -0,0 +1,210 @@ +package store_ladybug + +// Forward-only schema migration ladder for the Ladybug backend. +// +// The Node/Edge/SymbolFTS/FileMtime tables are a derived cache — every +// row is re-buildable by re-indexing — so this is deliberately NOT a +// golang-migrate / Flyway framework (no up/down files, no rollback, no +// per-instance lock table). It is the embedded-store equivalent of +// SQLite's PRAGMA user_version + a switch: read a single version int, +// apply the ordered steps above it, stamp the new version. +// +// Two kinds of step (see migrationStep): +// - additive ALTER (ALTER TABLE ... ADD IF NOT EXISTS ...): preserves +// the warm cache, which is the whole reason this persistence layer +// exists. The default for anything ALTER can express. (Empirically +// verified against liblbug v0.13.1: ADD [IF NOT EXISTS] +// [DEFAULT v], DROP, and existing-row backfill all work.) +// - rebuild: a change ALTER cannot express (a Meta-payload reshape — the +// in-memory store holds Meta as a live map[string]any the disk backend +// round-trips through encodeMeta, which a STRING-column ALTER cannot +// reshape — or a table restructure). Open surfaces it via +// NeedsRebuild() and the caller treats the cache as absent. + +import ( + "fmt" + + lbug "github.com/LadybugDB/go-ladybug" +) + +// currentSchemaVersion is the schema version this build expects on disk. +// Bump it by exactly one for every shipped schema change and add the +// matching migrationStep to ladybugMigrations. +// +// Version 1 is the baseline (the Node/Edge/SymbolFTS/FileMtime schema as +// of the first versioned build). Versioning was introduced without +// touching any existing table, so a database created before SchemaMeta +// existed already matches the v1 columns — applyLadybugMigrations treats +// such a DB as v1 and skips straight to stamping. +const currentSchemaVersion = 1 + +// migrationStep upgrades the on-disk schema TO version `to`. Steps MUST be +// listed in ascending `to` order. Exactly one of apply / rebuild is +// meaningful per step: an apply func runs additive DDL on the setup conn; +// rebuild==true means the change needs a full re-index instead. +type migrationStep struct { + to int + apply func(conn *lbug.Connection) error + rebuild bool +} + +// ladybugMigrations is the forward-only ladder. Empty until the schema +// first changes. When it does, add a step here AND (for additive changes) +// the new column to the relevant CREATE in schemaDDL, so fresh databases +// are born at the latest schema and the ADD IF NOT EXISTS step is a +// harmless no-op on them. Examples: +// +// // Additive column — keeps the warm cache: +// {to: 2, apply: func(c *lbug.Connection) error { +// res, err := c.Query("ALTER TABLE Node ADD IF NOT EXISTS owner STRING") +// if err != nil { +// return err +// } +// res.Close() +// return nil +// }}, +// // Meta-payload reshape ALTER can't express — force a rebuild: +// {to: 3, rebuild: true}, +var ladybugMigrations []migrationStep + +// applyLadybugMigrations brings the on-disk schema up to +// currentSchemaVersion using the package ladder. Called from Open on the +// raw setup connection, before the pool exists (single-threaded, no +// writeMu). Returns whether any crossed step requires a full re-index. +func applyLadybugMigrations(conn *lbug.Connection) (needsRebuild bool, err error) { + return migrateSchema(conn, currentSchemaVersion, ladybugMigrations) +} + +// migrateSchema is the testable core of applyLadybugMigrations: it takes +// the target version and step list explicitly so tests can exercise the +// ladder without mutating package globals. +func migrateSchema(conn *lbug.Connection, current int, steps []migrationStep) (needsRebuild bool, err error) { + stored, ok, err := readSchemaVersion(conn) + if err != nil { + return false, err + } + if !ok { + // No version row. A fresh (empty) DB is born at the current + // schema; an existing DB predates versioning and matches the v1 + // baseline. Either way its columns are correct for that version — + // we only need the right starting rung so later steps don't + // re-run (additive steps are idempotent anyway, but rebuild steps + // must NOT fire on an already-current fresh DB). + hasData, err := dbHasPriorData(conn) + if err != nil { + return false, err + } + if hasData { + stored = 1 + } else { + stored = current + } + } + for _, m := range steps { + if m.to <= stored || m.to > current { + continue + } + if m.rebuild { + needsRebuild = true + continue + } + if m.apply == nil { + continue + } + if err := m.apply(conn); err != nil { + return needsRebuild, fmt.Errorf("schema migration to v%d: %w", m.to, err) + } + } + // Stamp the new schema version. NOTE for the first rebuild step: this + // stamps `current` even when a rebuild rung was crossed, but the actual + // data re-index happens LATER (the daemon forces it via NeedsRebuild at + // warm restart — see cmd/gortex/daemon_state.go storeNeedsRebuild). A + // crash after this stamp but before that re-index finishes would leave + // version=current over old-shape rows. When the first rebuild migration + // lands, make it crash-safe — e.g. defer the stamp until the daemon + // confirms the rebuild rather than stamping here. + if err := writeSchemaVersion(conn, current); err != nil { + return needsRebuild, err + } + return needsRebuild, nil +} + +// readSchemaVersion returns the stored schema_version and whether a row +// existed (a fresh or pre-versioning DB has none). Uses the WHERE-clause +// match form, not inline {k: ...}, per the ladybug read-path convention. +func readSchemaVersion(conn *lbug.Connection) (version int, ok bool, err error) { + res, err := conn.Query("MATCH (m:SchemaMeta) WHERE m.k = 'schema_version' RETURN m.v") + if err != nil { + return 0, false, err + } + defer res.Close() + if !res.HasNext() { + return 0, false, nil + } + tup, err := res.Next() + if err != nil { + return 0, false, err + } + v, err := tup.GetValue(0) + if err != nil { + return 0, false, err + } + // SchemaMeta.v is INT64; the binding surfaces it as a Go int64. + iv, _ := v.(int64) + return int(iv), true, nil +} + +// writeSchemaVersion upserts the schema_version row. MERGE keeps it +// idempotent (last-write-wins), mirroring the FileMtime upsert. The MERGE +// pattern requires the key inline; the integer is formatted directly (no +// injection surface — it is an int). +func writeSchemaVersion(conn *lbug.Connection, version int) error { + res, err := conn.Query(fmt.Sprintf("MERGE (m:SchemaMeta {k: 'schema_version'}) SET m.v = %d", version)) + if err != nil { + return err + } + res.Close() + return nil +} + +// dbHasPriorData reports whether the database shows any evidence of prior +// use, to tell a brand-new (empty) DB from one created before SchemaMeta +// existed. Node, FileMtime, and SymbolFTS each have INDEPENDENT write +// paths (e.g. BulkSetFileMtimes MERGEs FileMtime with no Node dependency), +// so a pre-versioning DB can carry sidecar rows even with an empty Node +// table — a repo that indexed to zero symbols, or a partial index that +// recorded mtimes first. Probing only Node would misclassify such a DB as +// fresh and stamp it current, skipping a future rebuild it needs. Edge is +// omitted on purpose: a rel row cannot exist without its endpoint Node +// rows, so Node already subsumes it. +func dbHasPriorData(conn *lbug.Connection) (bool, error) { + for _, table := range []string{"Node", "FileMtime", "SymbolFTS"} { + has, err := tableHasRows(conn, table) + if err != nil { + return false, err + } + if has { + return true, nil + } + } + return false, nil +} + +// tableHasRows reports whether the named node table holds at least one +// row. Returns a literal (not a column) so it works for any node table +// regardless of its column names (FileMtime keys on file_id, not id). +func tableHasRows(conn *lbug.Connection, table string) (bool, error) { + res, err := conn.Query("MATCH (n:" + table + ") RETURN 1 LIMIT 1") + if err != nil { + return false, err + } + defer res.Close() + return res.HasNext(), nil +} + +// NeedsRebuild reports whether opening the store crossed a migration rung +// ALTER could not satisfy, so the caller should treat the on-disk graph as +// stale and re-index. False on every fresh open and after purely additive +// migrations. (Wiring this into the daemon warmup path lands with the +// first rebuild-requiring migration; the ladder is empty today.) +func (s *Store) NeedsRebuild() bool { return s.needsRebuild } diff --git a/internal/graph/store_ladybug/migrate_test.go b/internal/graph/store_ladybug/migrate_test.go new file mode 100644 index 00000000..98391793 --- /dev/null +++ b/internal/graph/store_ladybug/migrate_test.go @@ -0,0 +1,202 @@ +package store_ladybug + +import ( + "path/filepath" + "testing" + + lbug "github.com/LadybugDB/go-ladybug" +) + +func openMigrateTestStore(t *testing.T) *Store { + t.Helper() + s, err := Open(filepath.Join(t.TempDir(), "store.lbug")) + if err != nil { + t.Fatalf("open store: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + return s +} + +// addCol returns an apply func that runs one DDL statement on the conn. +func addCol(ddl string) func(*lbug.Connection) error { + return func(c *lbug.Connection) error { + res, err := c.Query(ddl) + if err != nil { + return err + } + res.Close() + return nil + } +} + +// mustExec runs a Cypher statement on the conn and fails the test on error. +func mustExec(t *testing.T, conn *lbug.Connection, q string) { + t.Helper() + res, err := conn.Query(q) + if err != nil { + t.Fatalf("exec %q: %v", q, err) + } + res.Close() +} + +// failIfCalled returns an apply func that fails the test if the version +// gate ever lets it run. +func failIfCalled(t *testing.T) func(*lbug.Connection) error { + return func(*lbug.Connection) error { + t.Error("a gated migration step ran when it should have been skipped") + return nil + } +} + +// A fresh Open stamps the current version and never needs a rebuild. +func TestSchemaVersion_FreshOpenStampsCurrent(t *testing.T) { + s := openMigrateTestStore(t) + v, ok, err := readSchemaVersion(s.conn) + if err != nil { + t.Fatalf("read version: %v", err) + } + if !ok { + t.Fatal("fresh open left no schema_version row") + } + if v != currentSchemaVersion { + t.Fatalf("schema_version = %d, want currentSchemaVersion %d", v, currentSchemaVersion) + } + if s.NeedsRebuild() { + t.Fatal("fresh open reported NeedsRebuild() = true") + } +} + +// The stamped version survives close/reopen (the daemon-restart path, +// which is the whole reason it is persisted), and a reopen neither +// re-migrates nor flags a rebuild. +func TestSchemaVersion_PersistsAcrossReopen(t *testing.T) { + path := filepath.Join(t.TempDir(), "store.lbug") + s1, err := Open(path) + if err != nil { + t.Fatalf("open 1: %v", err) + } + v1, _, _ := readSchemaVersion(s1.conn) + if err := s1.Close(); err != nil { + t.Fatalf("close 1: %v", err) + } + + s2, err := Open(path) + if err != nil { + t.Fatalf("reopen: %v", err) + } + defer func() { _ = s2.Close() }() + v2, ok, err := readSchemaVersion(s2.conn) + if err != nil { + t.Fatalf("read after reopen: %v", err) + } + if !ok || v2 != v1 || v2 != currentSchemaVersion { + t.Fatalf("version after reopen = %d (ok=%v), want %d (== first open %d)", v2, ok, currentSchemaVersion, v1) + } + if s2.NeedsRebuild() { + t.Fatal("reopen reported NeedsRebuild() = true") + } +} + +// An additive ALTER step runs and the version advances; re-running is a +// no-op (the version gate skips already-applied steps). +func TestMigrateSchema_AdditiveStepThenGate(t *testing.T) { + s := openMigrateTestStore(t) // starts at version 1 + + steps := []migrationStep{ + {to: 2, apply: addCol("ALTER TABLE Node ADD IF NOT EXISTS probe_owner STRING")}, + } + rebuild, err := migrateSchema(s.conn, 2, steps) + if err != nil { + t.Fatalf("migrate to v2: %v", err) + } + if rebuild { + t.Fatal("additive step reported needsRebuild = true") + } + if v, _, _ := readSchemaVersion(s.conn); v != 2 { + t.Fatalf("after migrate, version = %d, want 2", v) + } + // The column must now exist (referencing it must not error). + if res, err := s.conn.Query("MATCH (n:Node) RETURN n.probe_owner LIMIT 1"); err != nil { + t.Fatalf("new column probe_owner not queryable: %v", err) + } else { + res.Close() + } + + // Re-run at the same target with a step whose apply MUST NOT fire — + // stored (2) is not < to (2), so the gate skips it. + gate := []migrationStep{ + {to: 2, apply: func(*lbug.Connection) error { + t.Error("already-applied step re-ran (version gate failed)") + return nil + }}, + } + if _, err := migrateSchema(s.conn, 2, gate); err != nil { + t.Fatalf("gate re-run: %v", err) + } +} + +// A pre-versioning DB (no schema_version row) that has only SIDECAR data +// — an empty Node table but a populated FileMtime — must be classed as the +// v1 baseline, not as fresh/current, so a v1->v2 rebuild step still fires. +// Guards against probing Node alone (FileMtime has an independent write +// path and can outlive Node). +func TestMigrateSchema_PreVersioningSidecarOnly(t *testing.T) { + s := openMigrateTestStore(t) + // Sidecar row present, Node empty, schema_version row removed → + // indistinguishable from a real pre-SchemaMeta database. + mustExec(t, s.conn, "MERGE (m:FileMtime {file_id: 'f1'}) SET m.mtime_ns = 1") + mustExec(t, s.conn, "MATCH (m:SchemaMeta) DELETE m") + + rebuild, err := migrateSchema(s.conn, 2, []migrationStep{ + {to: 1, apply: failIfCalled(t)}, // to <= stored(1) → must be skipped + {to: 2, rebuild: true}, // to > stored(1) → must fire + }) + if err != nil { + t.Fatalf("migrate: %v", err) + } + if !rebuild { + t.Fatal("sidecar-only pre-versioning DB misclassified as fresh; the v2 rebuild step was skipped") + } + if v, _, _ := readSchemaVersion(s.conn); v != 2 { + t.Fatalf("version = %d, want 2", v) + } +} + +// A genuinely fresh/empty DB (no schema_version row, no data in any table) +// is born at the current version, so a rebuild step must NOT fire. +func TestMigrateSchema_FreshEmptyDBSkipsRebuild(t *testing.T) { + s := openMigrateTestStore(t) + mustExec(t, s.conn, "MATCH (m:SchemaMeta) DELETE m") // simulate no version row; all data tables empty + + rebuild, err := migrateSchema(s.conn, 2, []migrationStep{{to: 2, rebuild: true}}) + if err != nil { + t.Fatalf("migrate: %v", err) + } + if rebuild { + t.Fatal("fresh empty DB wrongly fired a rebuild step (should be born at current version)") + } + if v, _, _ := readSchemaVersion(s.conn); v != 2 { + t.Fatalf("version = %d, want 2", v) + } +} + +// A rebuild step sets needsRebuild and still advances the version, while a +// preceding additive step on the same ladder run also applies. +func TestMigrateSchema_RebuildStep(t *testing.T) { + s := openMigrateTestStore(t) // version 1 + + steps := []migrationStep{ + {to: 2, apply: addCol("ALTER TABLE Node ADD IF NOT EXISTS probe_x STRING")}, + {to: 3, rebuild: true}, + } + rebuild, err := migrateSchema(s.conn, 3, steps) + if err != nil { + t.Fatalf("migrate to v3: %v", err) + } + if !rebuild { + t.Fatal("rebuild step did not set needsRebuild") + } + if v, _, _ := readSchemaVersion(s.conn); v != 3 { + t.Fatalf("after migrate, version = %d, want 3", v) + } +} diff --git a/internal/graph/store_ladybug/name_index.go b/internal/graph/store_ladybug/name_index.go new file mode 100644 index 00000000..71377c68 --- /dev/null +++ b/internal/graph/store_ladybug/name_index.go @@ -0,0 +1,258 @@ +package store_ladybug + +import ( + "strings" + "sync" + "sync/atomic" + + "github.com/zzet/gortex/internal/graph" +) + +// nameIndex is a denormalised lookup from lowercased Node.Name → +// []*graph.Node. +// +// The codedb playbook calls this the "flat symbol map": a single +// hash hit replaces a graph walk + a BM25 round-trip. For Gortex it +// serves two hot paths: +// +// 1. SearchSymbols tier-0 — identifier queries return exact matches +// in O(1), skipping FTS entirely. Multi-word queries fall through +// to FTS with no recall loss. +// 2. FindNodesByName / FindNodesByNameInRepo — the resolver's name- +// to-candidates lookup. Pre-cache, every per-edge resolver pass +// paid a Cypher round-trip; on a 100k-edge multi-repo graph that +// was the warmup bottleneck. The cache is on the hot path of +// every resolveMethodCall / resolveFunctionCall, so it must +// deliver a full Node slice without a follow-up cgo fetch. +// +// Population is incremental: AddNode / addNodesUnwindLocked / +// copyBulkLocked all funnel through addNode / addNodes so a steady- +// state per-file update keeps the cache fresh. A lazy bootstrap +// runs on the first lookup if the store opened with disk-resident +// rows the live process never observed — typical after a daemon +// restart. +// +// Maintenance is best-effort: removeByPrefix runs on per-repo +// SymbolFTS wipes so a re-indexed repo's stale entries don't leak +// into tier-0. +type nameIndex struct { + mu sync.RWMutex + byN map[string][]*graph.Node // lower(name) → nodes + + bootstrapped atomic.Bool + bootstrapMu sync.Mutex +} + +// newNameIndex returns an empty index. Bootstrap fires lazily on +// the first lookup. +func newNameIndex() *nameIndex { + return &nameIndex{byN: make(map[string][]*graph.Node, 1024)} +} + +// addNode is the single-node entry point used by upsertNodeLocked. +// Skips low-value kinds so per-file updates don't flood the cache +// with locals/params. +func (idx *nameIndex) addNode(n *graph.Node) { + if idx == nil || n == nil || n.Name == "" || n.ID == "" { + return + } + if isLowValueForNameLookup(n.Kind) { + return + } + key := strings.ToLower(n.Name) + idx.mu.Lock() + defer idx.mu.Unlock() + existing := idx.byN[key] + for _, e := range existing { + if e.ID == n.ID { + return + } + } + idx.byN[key] = append(existing, n) +} + +// addNodes batches addNode calls so callers iterating a node slice +// (AddBatch, copyBulkLocked) don't pay the per-call lock acquire +// cost. +func (idx *nameIndex) addNodes(nodes []*graph.Node) { + if idx == nil || len(nodes) == 0 { + return + } + idx.mu.Lock() + defer idx.mu.Unlock() + for _, n := range nodes { + if n == nil || n.Name == "" || n.ID == "" { + continue + } + if isLowValueForNameLookup(n.Kind) { + continue + } + key := strings.ToLower(n.Name) + existing := idx.byN[key] + dup := false + for _, e := range existing { + if e.ID == n.ID { + dup = true + break + } + } + if !dup { + idx.byN[key] = append(existing, n) + } + } +} + +// isLowValueForNameLookup reports whether a node kind has so many +// identical-name occurrences per repo that adding them to the flat +// name index would balloon memory and slow tier-0 lookups without +// giving the resolver useful symbol-binding targets. +func isLowValueForNameLookup(k graph.NodeKind) bool { + switch k { + case graph.KindLocal, graph.KindParam, graph.KindFile, + graph.KindImport, graph.KindGenericParam, graph.KindBuiltin, + graph.KindClosure: + return true + } + return false +} + +// removeByPrefix drops every (name → node) entry whose Node.ID +// matches prefix. Called from the per-repo wipe paths so a re- +// indexed repo's stale entries don't leak into the tier-0 fast +// path. Iterating the entire map is acceptable because removeByPrefix +// runs only on repo-level reset (e.g. before BulkUpsertSymbolFTS's +// per-repo wipe), not on the steady-state hot path. +func (idx *nameIndex) removeByPrefix(prefix string) { + if idx == nil || prefix == "" { + return + } + idx.mu.Lock() + defer idx.mu.Unlock() + for key, nodes := range idx.byN { + kept := nodes[:0] + for _, n := range nodes { + if !strings.HasPrefix(n.ID, prefix) { + kept = append(kept, n) + } + } + if len(kept) == 0 { + delete(idx.byN, key) + } else { + idx.byN[key] = kept + } + } +} + +// lookupNodes returns the nodes whose lowercased Name equals +// strings.ToLower(name). Returns nil on miss. Caller must NOT +// mutate the returned slice's nodes — they are the live cache +// entries shared with the rest of the daemon. +func (idx *nameIndex) lookupNodes(name string) []*graph.Node { + if idx == nil || name == "" { + return nil + } + key := strings.ToLower(name) + idx.mu.RLock() + defer idx.mu.RUnlock() + nodes := idx.byN[key] + if len(nodes) == 0 { + return nil + } + out := make([]*graph.Node, len(nodes)) + copy(out, nodes) + return out +} + +// lookup retains the original ID-slice contract for the +// SearchSymbols path that only wants IDs (it builds graph.SymbolHit +// records keyed by ID). Returns a defensive copy. +func (idx *nameIndex) lookup(name string) []string { + nodes := idx.lookupNodes(name) + if len(nodes) == 0 { + return nil + } + out := make([]string, 0, len(nodes)) + for _, n := range nodes { + out = append(out, n.ID) + } + return out +} + +// isIdentifierQuery reports whether a query looks like a literal +// symbol name (no whitespace, no path separators, no dots, no +// colons). Tier-0 fast path engages only on such queries; multi- +// token / path / qualified queries always go to FTS. +func isIdentifierQuery(q string) bool { + if q == "" { + return false + } + for _, r := range q { + switch r { + case ' ', '\t', '\n', '/', '.', ':', ',': + return false + } + } + return true +} + +// bootstrap populates the index from a single Cypher scan of the +// Node table, fetching the full row so callers don't need a follow- +// up GetNodesByIDs. Filters out low-value kinds at the engine to +// skip the cgo round-trip cost on locals/params (millions of rows +// in a large multi-repo workspace). +// +// Runs once per Store lifetime on the first lookup that finds an +// empty map — typical after a daemon restart against a warm on-disk +// store where nodes exist but the live process hasn't routed any +// through AddNode/AddBatch yet. +// +// Errors during scan are non-fatal: the index stays empty and +// callers fall through to the Cypher path. +func (idx *nameIndex) bootstrap(s *Store) { + if idx == nil { + return + } + if idx.bootstrapped.Load() { + return + } + idx.bootstrapMu.Lock() + defer idx.bootstrapMu.Unlock() + if idx.bootstrapped.Load() { + return + } + // Fetch full Node rows so the bootstrap-restored cache matches + // what addNodes builds incrementally. Each row pays the cgo + + // rowToNode cost once; subsequent lookups are O(1) in-memory. + // + // The kind filter is pushed into Cypher so locals (typically + // 70%+ of all nodes) never cross the cgo boundary. On a 600k- + // node Linux-scale graph this drops bootstrap time from + // 6-10 s to < 1 s. + const q = `MATCH (n:Node) WHERE n.name <> '' AND n.kind IN ['function','method','type','interface','contract','constant','variable','field','module','package','enum_member','table','column','config_key','flag','event','migration','fixture','todo','team','license','release','doc'] RETURN ` + nodeReturnCols + rows, err := querySelectSafe(s, q, nil) + if err != nil || len(rows) == 0 { + idx.bootstrapped.Store(true) + return + } + idx.mu.Lock() + defer idx.mu.Unlock() + for _, r := range rows { + n := rowToNode(r) + if n == nil || n.Name == "" || n.ID == "" { + continue + } + key := strings.ToLower(n.Name) + existing := idx.byN[key] + dup := false + for _, e := range existing { + if e.ID == n.ID { + dup = true + break + } + } + if !dup { + idx.byN[key] = append(existing, n) + } + } + idx.bootstrapped.Store(true) +} diff --git a/internal/graph/store_ladybug/resolver_pushdown.go b/internal/graph/store_ladybug/resolver_pushdown.go new file mode 100644 index 00000000..2e1327f5 --- /dev/null +++ b/internal/graph/store_ladybug/resolver_pushdown.go @@ -0,0 +1,170 @@ +package store_ladybug + +import ( + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertions: *Store satisfies the resolver-side +// pushdown capabilities used by the global graph passes +// (InferImplements, InferOverrides, DetectCrossRepoEdges). A drift +// in any signature fails the build here instead of silently dropping +// to the Go-loop fallback. +var ( + _ graph.MemberMethodsByType = (*Store)(nil) + _ graph.StructuralParentEdges = (*Store)(nil) + _ graph.CrossRepoCandidates = (*Store)(nil) +) + +// MemberMethodsByType returns the typeID → []MemberMethodInfo +// projection of every EdgeMemberOf edge whose source is a KindMethod +// node, in one Cypher round-trip. Replaces the resolver's +// EdgesByKind(EdgeMemberOf) + per-edge GetNode(e.From) loop — each +// per-edge GetNode pulled ~10 string columns + a Meta blob over cgo +// just to read five scalar fields. The capability ships only the +// (type_id, method_id, method_name, file_path, start_line, +// repo_prefix) tuple. +// +// Per-type rows are deduplicated by MethodID — a method that appears +// twice in the EdgeMemberOf bucket (e.g. emitted from a re-index) +// yields a single info row. +func (s *Store) MemberMethodsByType() map[string][]graph.MemberMethodInfo { + const q = ` +MATCH (m:Node)-[e:Edge {kind: 'member_of'}]->(t:Node) +WHERE m.kind = 'method' +RETURN t.id, m.id, m.name, m.file_path, m.start_line, m.repo_prefix` + rows := s.querySelect(q, nil) + if len(rows) == 0 { + return nil + } + if len(rows) >= mallocTrimRowThreshold { + mallocTrim() + } + out := make(map[string][]graph.MemberMethodInfo) + seen := make(map[string]map[string]struct{}) + for _, r := range rows { + if len(r) < 6 { + continue + } + typeID, _ := r[0].(string) + methodID, _ := r[1].(string) + methodName, _ := r[2].(string) + filePath, _ := r[3].(string) + startLine := int(asInt64(r[4])) + repoPrefix, _ := r[5].(string) + if typeID == "" || methodID == "" { + continue + } + dedup := seen[typeID] + if dedup == nil { + dedup = make(map[string]struct{}) + seen[typeID] = dedup + } + if _, ok := dedup[methodID]; ok { + continue + } + dedup[methodID] = struct{}{} + out[typeID] = append(out[typeID], graph.MemberMethodInfo{ + MethodID: methodID, + Name: methodName, + FilePath: filePath, + StartLine: startLine, + RepoPrefix: repoPrefix, + }) + } + if len(out) == 0 { + return nil + } + return out +} + +// StructuralParentEdges returns every EdgeExtends / EdgeImplements / +// EdgeComposes edge whose endpoints are both KindType / KindInterface, +// projected as (FromID, ToID, FromKind, ToKind, Origin) in one Cypher +// round-trip. Replaces the InferOverrides AllEdges + per-edge +// GetNode(e.From) + GetNode(e.To) loop — on the gortex workspace the +// AllEdges scan materialised ~286k edges over cgo just to filter down +// to a few hundred type-to-type rows. +func (s *Store) StructuralParentEdges() []graph.StructuralParentEdgeRow { + const q = ` +MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE e.kind IN ['extends', 'implements', 'composes'] + AND a.kind IN ['type', 'interface'] + AND b.kind IN ['type', 'interface'] +RETURN a.id, b.id, a.kind, b.kind, e.origin` + rows := s.querySelect(q, nil) + if len(rows) == 0 { + return nil + } + if len(rows) >= mallocTrimRowThreshold { + mallocTrim() + } + out := make([]graph.StructuralParentEdgeRow, 0, len(rows)) + for _, r := range rows { + if len(r) < 5 { + continue + } + fromID, _ := r[0].(string) + toID, _ := r[1].(string) + if fromID == "" || toID == "" { + continue + } + fromKind, _ := r[2].(string) + toKind, _ := r[3].(string) + origin, _ := r[4].(string) + out = append(out, graph.StructuralParentEdgeRow{ + FromID: fromID, + ToID: toID, + FromKind: graph.NodeKind(fromKind), + ToKind: graph.NodeKind(toKind), + Origin: origin, + }) + } + return out +} + +// CrossRepoCandidates returns every edge whose Kind is in baseKinds +// AND whose endpoints carry two distinct, non-empty RepoPrefix +// values, projected with the underlying edge plus the two repo +// prefixes. Replaces the DetectCrossRepoEdges AllEdges + per-edge +// GetNode(e.From) + GetNode(e.To) loop — the in-memory scan ships +// every edge over cgo plus issues two GetNode round-trips per +// surviving row, while typical cross-repo rows are a small fraction +// of the edge table. +func (s *Store) CrossRepoCandidates(baseKinds []graph.EdgeKind) []graph.CrossRepoCandidateRow { + uniq := dedupeEdgeKinds(baseKinds) + if len(uniq) == 0 { + return nil + } + const q = ` +MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE e.kind IN $kinds + AND a.repo_prefix <> '' + AND b.repo_prefix <> '' + AND a.repo_prefix <> b.repo_prefix +RETURN ` + edgeReturnCols + `, a.repo_prefix, b.repo_prefix` + rows := s.querySelect(q, map[string]any{"kinds": edgeKindSliceToAny(uniq)}) + if len(rows) == 0 { + return nil + } + if len(rows) >= mallocTrimRowThreshold { + mallocTrim() + } + out := make([]graph.CrossRepoCandidateRow, 0, len(rows)) + for _, r := range rows { + if len(r) < 13 { + continue + } + e := rowToEdge(r[:11]) + if e == nil { + continue + } + fromRepo, _ := r[11].(string) + toRepo, _ := r[12].(string) + out = append(out, graph.CrossRepoCandidateRow{ + Edge: e, + FromRepo: fromRepo, + ToRepo: toRepo, + }) + } + return out +} diff --git a/internal/graph/store_ladybug/schema.go b/internal/graph/store_ladybug/schema.go new file mode 100644 index 00000000..17eb705f --- /dev/null +++ b/internal/graph/store_ladybug/schema.go @@ -0,0 +1,111 @@ +// Package store_ladybug is the KuzuDB-backed implementation of +// graph.Store. KuzuDB is an embedded property-graph database with a +// Cypher front-end and a columnar storage engine. The Go binding +// (github.com/LadybugDB/go-ladybug) wraps the C API and bundles +// liblbug.dylib / liblbug.so for the host platform. +// +// Schema design — one Node table and one Edge rel table parameterised +// by the `kind` column. We deliberately do not spread the ~50 edge +// kinds across 50 rel tables: every kind would need its own DDL, +// every schema query would multiplex across them, and KuzuDB rel +// tables do not share an identity column. A single Edge table keeps +// the schema small enough to evolve incrementally. +// +// Meta payloads are gob-encoded and base64-encoded, then stored as a +// STRING column. The native BLOB type is technically supported by the +// engine, but the Go binding reads a BLOB by calling strlen() on the +// returned C pointer, which truncates at the first NUL byte — gob +// frames contain arbitrary binary including NUL, so a BLOB column +// would silently lose data. base64 sidesteps both the strlen issue +// and the missing `[]byte → BLOB` parameter coercion (a raw `[]byte` +// is currently bound as `UINT8[]`, which the binder rejects against a +// BLOB column). +package store_ladybug + +// schemaDDL is the list of Cypher statements applied on every Open +// call. CREATE … IF NOT EXISTS makes the DDL idempotent so an +// existing on-disk database opens cleanly. +// +// PRIMARY KEY on Node(id) gives us the AddNode-by-id idempotency +// contract for free — a duplicate INSERT would raise a runtime +// uniqueness violation, so writes go through MERGE … SET … which +// upserts in one shot. KuzuDB rel tables do not allow a primary key, +// so Edge dedup is enforced at the Go layer (MERGE on the +// (from, to, kind, file_path, line) tuple). +var schemaDDL = []string{ + `CREATE NODE TABLE IF NOT EXISTS Node( + id STRING, + kind STRING, + name STRING, + qual_name STRING, + file_path STRING, + start_line INT64, + end_line INT64, + language STRING, + repo_prefix STRING, + workspace_id STRING, + project_id STRING, + meta STRING, + PRIMARY KEY(id) + )`, + `CREATE REL TABLE IF NOT EXISTS Edge( + FROM Node TO Node, + kind STRING, + file_path STRING, + line INT64, + confidence DOUBLE, + confidence_label STRING, + origin STRING, + tier STRING, + cross_repo INT64, + meta STRING + )`, + // SymbolFTS is the sidecar table the native FTS index is built + // against. Kept separate from Node so we don't have to touch + // every read/write path on the main schema, and so the + // search-side tokenisation (camelCase / snake_case / path-segment + // splits — see internal/search/tokenizer.go) lives in a clearly + // search-shaped column instead of polluting Node. + // + // id is the foreign anchor back to Node.id; tokens is the + // space-separated pre-tokenised text that the FTS index + // matches against. PRIMARY KEY on id makes the per-node + // UpsertSymbolFTS MERGE call idempotent (re-indexing a file + // during incremental updates replaces the prior row in place). + `CREATE NODE TABLE IF NOT EXISTS SymbolFTS( + id STRING, + tokens STRING, + PRIMARY KEY(id) + )`, + // FileMtime persists the per-file modification time the indexer + // uses for incremental re-index decisions. Moving this off the + // daemon's gob+gzip snapshot and into the store makes warm + // restarts read it through the same backend the graph already + // lives in (no second persistence surface to keep coherent), and + // is the first step toward dropping the metadata-only snapshot + // altogether for the ladybug backend. + // + // repo_prefix is column-stamped (not derived from the file_id + // prefix) so a single Cypher SELECT can slice mtimes by repo + // without parsing the id string. PRIMARY KEY on file_id makes + // the per-file upsert idempotent under MERGE. + `CREATE NODE TABLE IF NOT EXISTS FileMtime( + file_id STRING, + repo_prefix STRING, + mtime_ns INT64, + PRIMARY KEY(file_id) + )`, + // SchemaMeta is the single source of truth for the on-disk schema + // version (and any future single-scalar store metadata). The + // migration ladder in migrate.go reads `schema_version` from here at + // Open and stamps it after applying any pending step. KuzuDB has no + // PRAGMA user_version, so the version lives in a normal node table, + // the same way FileMtime / SymbolFTS persist their sidecar state. The + // k STRING primary key means one table covers every scalar without + // per-key DDL. See migrate.go for the read/upsert Cypher. + `CREATE NODE TABLE IF NOT EXISTS SchemaMeta( + k STRING, + v INT64, + PRIMARY KEY(k) + )`, +} diff --git a/internal/graph/store_ladybug/store.go b/internal/graph/store_ladybug/store.go new file mode 100644 index 00000000..e9e59f53 --- /dev/null +++ b/internal/graph/store_ladybug/store.go @@ -0,0 +1,254 @@ +package store_ladybug + +import ( + "fmt" + "sync" + "sync/atomic" + + lbug "github.com/LadybugDB/go-ladybug" + + "github.com/zzet/gortex/internal/graph" +) + +// Store is the KuzuDB-backed graph.Store implementation. +type Store struct { + db *lbug.Database + conn *lbug.Connection // setup connection — DDL + extension installs + pool *connPool // per-Store fan-out for query traffic + + // writeMu serialises every mutation AND excludes reads for the + // duration of a write. It is an RWMutex: writes take the exclusive + // Lock (one writer at a time, no concurrent readers), reads take the + // shared RLock (any number of concurrent readers, none while a write + // is in flight). + // + // The read-exclusion is load-bearing, not just for logical + // consistency: ladybug's bulk COPY extends the .lbug file in place, + // and a read issued on a *different* pooled connection while that + // COPY is mid-flight lands in a half-written buffer page. The benign + // outcome is an "IO exception: Cannot read N bytes at position M" + // (degraded to an empty result on the read path); the malign outcome + // is a SIGSEGV inside lbug_connection_query as the COPY's own CGo + // call trips over the concurrently-mutated buffer-pool state. Holding + // the writer side across every COPY/MERGE/DELETE and the reader side + // across every query makes the two mutually exclusive, which is the + // only contract this ladybug revision actually honours under + // concurrency. Concurrent reads still parallelise via RLock, so the + // steady-state fan-out the conformance suite exercises is preserved. + writeMu sync.RWMutex + + // resolveMu is the resolver-coordination mutex returned by + // ResolveMutex. Held by cross-repo / temporal / external resolver + // passes to keep their edge mutations from interleaving. Separate + // from writeMu so the resolver can hold it across multiple writes + // without blocking unrelated steady-state mutations. + resolveMu sync.Mutex + + edgeIdentityRevs atomic.Int64 + + // writeGen monotonically advances on every successful graph + // mutation. Cheap, lock-free, and consumed by the algo + // projection cache to invalidate a stale CALL PROJECT_GRAPH + // declaration when the underlying graph has changed. Reads + // must NOT bump it — only paths that hit disk via COPY / + // MERGE / CREATE / DELETE / SET on Node or Edge. + writeGen atomic.Uint64 + + // Bulk-load fast path. When the indexer brackets its parse loop + // with BeginBulkLoad/FlushBulk, AddBatch routes incoming rows + // into these slices instead of round-tripping through Cypher per + // call. FlushBulk dedupes the buffers and commits via Kuzu's + // COPY FROM CSV — one INSERT-only statement per table, no MERGE + // cost, no per-row Cypher parse/plan. See BeginBulkLoad doc. + // bulkSlot serialises BeginBulkLoad ↔ FlushBulk against the + // per-Store buffer. Concurrent per-repo Indexers each call + // BeginBulkLoad on the shared Store at drain time; without this + // mutex they would race on bulkActive and the second caller + // would observe bulkActive==true. Holding the slot for the full + // Begin→Flush window means concurrent drains serialise — the + // second drain blocks at BeginBulkLoad until the first flush + // returns the slot. + bulkSlot sync.Mutex + bulkMu sync.Mutex + bulkActive bool + bulkNodes []*graph.Node + bulkEdges []*graph.Edge + + // fts tracks whether the native FTS extension is loaded and + // whether the symbol FTS index has been built. See fts.go for + // the SymbolSearcher implementation. + fts ftsState + + // vec tracks the native VECTOR extension load + the per-dim + // SymbolVec schema declaration + index-build sentinel. See + // vector.go for the VectorSearcher implementation. + vec vectorState + + // algo tracks the native ALGO extension load + the per-call + // projection-name serialisation mutex. See algo.go for the + // PageRanker / CommunityDetector / ComponentFinder / KCorer + // implementations. + algo algoState + + // fileIDs accelerates per-file lookups (GetFileSubGraph, + // GetFileNodes …) by sidestepping the Node-table full scan Kuzu + // would otherwise need. Maintained on every node mutation; see + // file_index.go. + fileIDs *fileIDIndex + + // nameIdx is the tier-0 fast path for SearchSymbols: a + // denormalised lower(name) → []NodeID map maintained alongside + // every Node write. Identifier-shape queries skip the FTS + // round-trip when this hits. See name_index.go. + nameIdx *nameIndex + + // needsRebuild is set at Open when the migration ladder crossed a + // rung that ALTER could not satisfy (a Meta-payload reshape, a table + // restructure). The caller surfaces it via NeedsRebuild() and treats + // the on-disk graph as stale — a full re-index into the fresh schema. + // Always false on a fresh open and after purely additive migrations. + // See migrate.go. + needsRebuild bool +} + +// Compile-time assertion: *Store satisfies graph.Store. +var _ graph.Store = (*Store)(nil) + +// connPoolSize is the per-Store connection-pool fan-out. +// MultiIndexer runs one parse goroutine per repo; with 4 active +// repos and per-repo shadow drains, 8 gives ample headroom for +// concurrent reads + drains without queue contention. ladybug's +// C engine handles its own internal threadpool per query, so +// over-sizing the pool here mostly burns memory without buying +// extra parallelism. +const connPoolSize = 8 + +// DefaultBufferPoolMB is the buffer-pool cap applied when the caller +// passes Options{} (zero value). Ladybug's own default is 80% of +// system RAM, which on a 16 GiB laptop reserves ~12.8 GiB before a +// single row is inserted; clamping to a fixed 4 GiB keeps the +// daemon's resident set predictable across machine sizes. +const DefaultBufferPoolMB = 4096 + +// Options configures the embedded Ladybug instance. The zero value +// applies DefaultBufferPoolMB; callers override fields as needed. +type Options struct { + // BufferPoolMB caps the engine's page cache in MiB. Zero falls + // back to DefaultBufferPoolMB. + BufferPoolMB uint64 +} + +// Open is the zero-config entry point. Equivalent to +// OpenWithOptions(path, Options{}). +func Open(path string) (*Store, error) { + return OpenWithOptions(path, Options{}) +} + +// OpenWithOptions opens (or creates) a Ladybug database at path and +// applies the schema. The path is a directory Ladybug owns end-to-end; +// an empty directory is initialised on first open and reused on every +// subsequent open. +// +// Opens one "setup" connection for DDL + extension installs, then +// a pool of additional connections for parallel query traffic. +// MultiIndexer's per-repo goroutines each borrow their own pool +// connection so concurrent reads + drains don't serialise on a +// single Connection handle (the Go binding races in cgo without +// a per-connection serialisation point). +func OpenWithOptions(path string, opts Options) (*Store, error) { + cfg := lbug.DefaultSystemConfig() + bufMB := opts.BufferPoolMB + if bufMB == 0 { + bufMB = DefaultBufferPoolMB + } + cfg.BufferPoolSize = bufMB * 1024 * 1024 + db, err := lbug.OpenDatabase(path, cfg) + if err != nil { + return nil, fmt.Errorf("store_ladybug: open %q: %w", path, err) + } + conn, err := lbug.OpenConnection(db) + if err != nil { + db.Close() + return nil, fmt.Errorf("store_ladybug: open connection: %w", err) + } + for _, stmt := range schemaDDL { + res, err := conn.Query(stmt) + if err != nil { + conn.Close() + db.Close() + return nil, fmt.Errorf("store_ladybug: schema %q: %w", firstLine(stmt), err) + } + res.Close() + } + // Bring the on-disk schema up to currentSchemaVersion before any + // query traffic. Runs on the raw setup conn (no pool yet, no + // writeMu) — see migrate.go. needsRebuild is true only if a ladder + // step required a full re-index (ALTER could not express it). + needsRebuild, err := applyLadybugMigrations(conn) + if err != nil { + conn.Close() + db.Close() + return nil, fmt.Errorf("store_ladybug: migrate schema: %w", err) + } + pool, err := newConnPool(db, connPoolSize) + if err != nil { + conn.Close() + db.Close() + return nil, fmt.Errorf("store_ladybug: init conn pool: %w", err) + } + st := &Store{db: db, conn: conn, pool: pool, needsRebuild: needsRebuild, fileIDs: newFileIDIndex(), nameIdx: newNameIndex()} + // Populate the file→id accelerator from any data already on disk + // (daemon restart, ladybug snapshot reload). A fresh DB returns 0 + // rows and this is a cheap no-op; an existing DB pays one + // sequential Node scan in exchange for sub-millisecond file + // lookups for the rest of the process lifetime. + if err := st.populateFileIDIndexLocked(); err != nil { + conn.Close() + db.Close() + return nil, fmt.Errorf("store_ladybug: populate file-id index: %w", err) + } + return st, nil +} + +// populateFileIDIndexLocked seeds the fileIDs accelerator from the +// on-disk Node table. Runs once at Open. Streaming the (id, file_path) +// projection keeps the working set small — we don't materialise the +// full node rows for this. +func (s *Store) populateFileIDIndexLocked() error { + if s.fileIDs == nil { + s.fileIDs = newFileIDIndex() + } + const q = `MATCH (n:Node) WHERE n.file_path <> '' RETURN n.id, n.file_path` + rows := s.querySelect(q, nil) + for _, r := range rows { + if len(r) < 2 { + continue + } + id, _ := r[0].(string) + fp, _ := r[1].(string) + s.fileIDs.add(fp, id) + } + return nil +} + +// Close closes the underlying connection and database. Drops any +// cached PROJECT_GRAPH declaration first so the engine's catalog +// isn't left holding a dangling projection across the teardown — +// the algo extension's catalog state would otherwise be +// rehydrated on the next Open. +func (s *Store) Close() error { + s.dropCachedProjection() + if s.pool != nil { + s.pool.close() + } + if s.conn != nil { + s.conn.Close() + } + if s.db != nil { + s.db.Close() + } + return nil +} + +// ResolveMutex returns the resolver-coordination mutex. +func (s *Store) ResolveMutex() *sync.Mutex { return &s.resolveMu } diff --git a/internal/graph/store_ladybug/store_bulk.go b/internal/graph/store_ladybug/store_bulk.go new file mode 100644 index 00000000..21547557 --- /dev/null +++ b/internal/graph/store_ladybug/store_bulk.go @@ -0,0 +1,469 @@ +package store_ladybug + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// Compile-time assertion: *Store satisfies graph.BulkLoader, so the +// indexer's BulkLoader probe picks up the COPY-FROM-CSV fast path +// instead of falling through to per-batch UNWIND. +var _ graph.BulkLoader = (*Store)(nil) + +// BeginBulkLoad enters buffer-mode write. Subsequent AddBatch calls +// append into in-memory slices without round-tripping to Kuzu; the +// buffer is committed via Kuzu's COPY FROM primitive when FlushBulk +// is called. +// +// When two callers race (concurrent per-repo Indexers draining their +// shadows into the same Store), the second blocks on bulkSlot until +// the first FlushBulk releases it — drains serialise instead of +// panicking. The matching FlushBulk MUST run on the same goroutine +// (the IndexCtx defer pattern guarantees this). +func (s *Store) BeginBulkLoad() { + s.bulkSlot.Lock() + s.bulkMu.Lock() + defer s.bulkMu.Unlock() + s.bulkActive = true +} + +// FlushBulk commits the accumulated bulk buffer via Kuzu's COPY FROM +// CSV path — one INSERT-only statement per table, no MERGE cost, no +// per-row Cypher parse/plan. After FlushBulk, AddBatch returns to its +// regular per-call UNWIND path. +// +// Dedup contract: nodes are deduped by ID (last write wins, matching +// the in-memory store's AddBatch semantics); edges are deduped by the +// identity tuple (from, to, kind, file_path, line). Edge endpoints +// not present in the node buffer are auto-stubbed so the rel-table +// foreign-key constraint is satisfied (mirrors the per-call +// mergeStubNodeLocked path). +func (s *Store) FlushBulk() error { + s.bulkMu.Lock() + if !s.bulkActive { + s.bulkMu.Unlock() + return fmt.Errorf("store_ladybug: FlushBulk without BeginBulkLoad") + } + nodes := s.bulkNodes + edges := s.bulkEdges + s.bulkNodes = nil + s.bulkEdges = nil + s.bulkActive = false + s.bulkMu.Unlock() + // Release the per-Store bulk slot so the next concurrent drain + // (a different per-repo Indexer waiting in BeginBulkLoad) can + // take it. Held across the COPY below in the original design; + // releasing here lets the next caller start staging rows into + // its own buffer while this one's COPY is still in flight. The + // underlying COPY queries themselves still serialise on + // writeMu via runCopyPooled — that's where Ladybug's + // single-writer constraint actually bites — so unblocking the + // staging window is pure latency win, not a concurrency + // hazard. + s.bulkSlot.Unlock() + + // Always take the COPY path. The prior fallback to per-row + // upsertNodeLocked when the store was non-empty existed to + // dodge PRIMARY KEY conflicts between concurrent FlushBulks + // (and between streaming-flush chunks within a single + // IndexCtx). With per-repo-prefixed stubs (internal/graph/stub.go) + // no two per-repo Indexers can emit the same Node ID, so the + // fallback is now dead weight — it forced the gortex repo + // onto 190k per-row MERGEs holding writeMu for minutes while + // every other repo's FlushBulk queued behind it. + // + // copyBulkLocked itself runs its COPY queries through the + // connection pool, so two concurrent FlushBulks parallelise + // instead of serialising on a single Connection handle. + if err := s.copyBulkLocked(nodes, edges); err != nil { + return err + } + if len(nodes) > 0 || len(edges) > 0 { + s.writeGen.Add(1) + } + if len(nodes)+len(edges) >= mallocTrimRowThreshold { + mallocTrim() + } + return nil +} + +// copyBulkLocked dedupes the bulk buffers, writes them to temp CSV +// files, and runs COPY FROM for each table. Must be called with +// s.writeMu held. +// +// Multi-repo wrinkle: extractors emit `unresolved::` targets +// before the resolver runs. Most are resolved in the per-repo +// shadow, but a residue always remains (truly unresolved symbols, +// or names the language extractor can't bind without semantic +// context). Across repos those `unresolved::*` ids collide on the +// COPY's PRIMARY KEY. Rewrite them to `::unresolved::*` +// using the repo prefix taken from any node in the batch (one +// per-repo Indexer's drain carries nodes from a single repo). +func (s *Store) copyBulkLocked(nodes []*graph.Node, edges []*graph.Edge) error { + repoPrefix := "" + for _, n := range nodes { + if n != nil && n.RepoPrefix != "" { + repoPrefix = n.RepoPrefix + break + } + } + if repoPrefix != "" { + const unresolvedTag = "unresolved::" + // Encoding: prepend the repo prefix to the bare + // `unresolved::Name` form so cross-repo emitters don't + // collide on the COPY PK. Result: `::unresolved::`. + // The Go-level per-edge resolver's EdgesWithUnresolvedTarget + // uses a literal `STARTS WITH 'unresolved::'` scan, which + // intentionally MISSES these multi-repo stubs — the Cypher + // backend resolver runs a batched pass that handles every + // form via kind/name normalisation, so we save the per-edge + // Cypher round-trip cost on the Go side and let the engine + // resolve the whole population in one shot. + rewrite := func(id string) string { + if id == "" || !strings.HasPrefix(id, unresolvedTag) { + return id + } + return repoPrefix + "::" + id + } + for _, e := range edges { + if e == nil { + continue + } + e.From = rewrite(e.From) + e.To = rewrite(e.To) + } + for _, n := range nodes { + if n == nil { + continue + } + n.ID = rewrite(n.ID) + } + } + // Dedup nodes by SANITIZED ID (last write wins). The TSV writer + // strips tab/CR/LF — so two raw IDs that differ only in those + // characters (e.g. extractor output with embedded newlines in an + // inline TypeScript object-type literal: `unresolved::{ foo: + // X[]\n bar: () => Y }`) collapse to the same column-0 value at + // COPY time, and Kuzu rejects the run with "duplicated primary + // key value". Using the sanitized form here keeps the dedup map's + // view of "same node" aligned with what the COPY parser sees. We + // also normalize n.ID to the sanitized form so the auto-stub and + // edge endpoints match, and so the eventual writeNodesTSV / + // writeEdgesTSV pair emit identical strings on both sides of the + // rel-table FK. + // + // The in-memory store's AddBatch overwrites on duplicate ID; this + // preserves the same semantics modulo the sanitization mapping. + nodePos := make(map[string]int, len(nodes)) + dedupedNodes := nodes[:0] + for _, n := range nodes { + if n == nil || n.ID == "" { + continue + } + san := sanitizeTSV(n.ID) + if san != n.ID { + n.ID = san + } + if pos, ok := nodePos[n.ID]; ok { + dedupedNodes[pos] = n + } else { + nodePos[n.ID] = len(dedupedNodes) + dedupedNodes = append(dedupedNodes, n) + } + } + nodes = dedupedNodes + // Feed the file→id accelerator from the deduped buffer. Done here + // (before COPY) so we don't have to re-scan after the write — the + // COPY appends every row anyway, success-or-failure handling + // upstream already rolls writeGen back on a fatal error. + if s.fileIDs != nil { + s.fileIDs.addNodes(nodes) + } + if s.nameIdx != nil { + s.nameIdx.addNodes(nodes) + } + + // Dedup edges by identity tuple (last write wins). Same rationale + // as the in-memory store's MERGE semantics. Endpoints are + // sanitized to match the node-ID sanitization above — otherwise + // an edge pointing at `unresolved::Writer\n}` references a node + // the CSV writer collapses to `unresolved::Writer }`, and Kuzu's + // COPY Edge fails with "unable to find primary key value". + type edgeKey struct { + from, to, kind, file string + line int + } + edgePos := make(map[edgeKey]int, len(edges)) + dedupedEdges := edges[:0] + for _, e := range edges { + if e == nil { + continue + } + if san := sanitizeTSV(e.From); san != e.From { + e.From = san + } + if san := sanitizeTSV(e.To); san != e.To { + e.To = san + } + k := edgeKey{e.From, e.To, string(e.Kind), e.FilePath, e.Line} + if pos, ok := edgePos[k]; ok { + dedupedEdges[pos] = e + } else { + edgePos[k] = len(dedupedEdges) + dedupedEdges = append(dedupedEdges, e) + } + } + edges = dedupedEdges + + // Auto-stub endpoints not in the node buffer. The rel-table + // foreign-key constraint requires both endpoints to exist in the + // node table; per-call AddEdge handles this via + // mergeStubNodeLocked. For COPY there's no per-row hook, so we + // pre-stub here. + for _, e := range edges { + if e.From != "" { + if _, ok := nodePos[e.From]; !ok { + nodePos[e.From] = len(nodes) + nodes = append(nodes, &graph.Node{ID: e.From}) + } + } + if e.To != "" { + if _, ok := nodePos[e.To]; !ok { + nodePos[e.To] = len(nodes) + nodes = append(nodes, &graph.Node{ID: e.To}) + } + } + } + // NOTE: an earlier revision pre-filtered nodes against the live + // Node table here via a `MATCH (n:Node) WHERE n.id IN $ids` probe + // to make COPY idempotent against duplicate primary keys. That + // query crashed the daemon with `IO exception: Cannot read from + // file ... position: ` because it issued a read on the + // same .lbug file that a concurrent COPY (from a sibling + // per-repo IndexCtx whose FlushBulk had already released + // bulkSlot but still held writeMu inside runCopyPooled) was + // extending — Kuzu's MVCC can't serve a buffer-pool read while + // the file is being grown by another transaction in the same + // process. The sanitize-aware dedup above is the cheaper and + // safer fix for the duplicate-PK class this filter was meant to + // catch; cross-bulk collisions are now rare enough that the + // per-COPY error message (handled by the caller's retry) is + // acceptable when they happen. + + if len(nodes) == 0 && len(edges) == 0 { + return nil + } + + // Write CSV files to a per-flush temp dir. Cleaned up regardless + // of COPY success/failure. + dir, err := os.MkdirTemp("", "kuzu-bulk-") + if err != nil { + return fmt.Errorf("mkdir bulk tmp: %w", err) + } + defer func() { _ = os.RemoveAll(dir) }() + + if len(nodes) > 0 { + nodesPath := filepath.Join(dir, "nodes.csv") + if err := writeNodesTSV(nodesPath, nodes); err != nil { + return fmt.Errorf("write nodes tsv: %w", err) + } + // HEADER=false maps columns by position (no chance of a + // header-name mismatch silently dropping rows). DELIM='\t' + // because Kuzu's CSV parser does not handle RFC-4180-style + // quoted strings containing commas — it splits on the + // delimiter naively. Code identifiers and names never contain + // tabs, so TSV sidesteps the quoting problem entirely. + copyQ := fmt.Sprintf("COPY Node FROM '%s' (HEADER=false, DELIM='\t')", escapeCypherStringLit(nodesPath)) + if err := s.runCopyPooled(copyQ); err != nil { + return fmt.Errorf("copy nodes: %w", err) + } + } + + if len(edges) > 0 { + edgesPath := filepath.Join(dir, "edges.csv") + if err := writeEdgesTSV(edgesPath, edges); err != nil { + return fmt.Errorf("write edges tsv: %w", err) + } + copyQ := fmt.Sprintf("COPY Edge FROM '%s' (HEADER=false, DELIM='\t')", escapeCypherStringLit(edgesPath)) + if err := s.runCopyPooled(copyQ); err != nil { + return fmt.Errorf("copy edges: %w", err) + } + } + + return nil +} + +// runCopyPooled runs a parameter-less COPY query. Holds writeMu +// for the duration: Ladybug only allows ONE write transaction +// at a time per database; concurrent COPYs from different +// connections fail with "Cannot start a new write transaction +// in the system". The pool still parallelises READS (querySelect +// no longer locks), but writes serialise here at the Go layer +// to match ladybug's MVCC contract. +// +// The COPY query itself is parameter-less so we go straight +// through conn.Query on a pooled connection. +func (s *Store) runCopyPooled(copyQ string) error { + s.writeMu.Lock() + defer s.writeMu.Unlock() + res, release, err := s.executeOrQuery(copyQ, nil) + if err != nil { + return err + } + if res != nil { + res.Close() + } + release() + return nil +} + +// writeNodesTSV writes nodes to a tab-separated values file in +// schema-column order. Kuzu's COPY FROM parser does not honour +// RFC-4180 quoted-string escaping (a quoted field with embedded +// commas is naively split on the delimiter), so TSV with a sanitised +// payload is the safe transport for arbitrary user data. Tabs in +// any text column are replaced with a single space; newlines with a +// space — these characters never appear in code identifiers, +// qualified names, or file paths, and base64-encoded meta is +// tab-/newline-free by construction. +func writeNodesTSV(path string, nodes []*graph.Node) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer func() { _ = f.Close() }() + bw := bufio.NewWriterSize(f, 1<<20) + defer func() { _ = bw.Flush() }() + + for _, n := range nodes { + metaStr := "" + if len(n.Meta) > 0 { + s, err := encodeMeta(n.Meta) + if err != nil { + return fmt.Errorf("encode meta for %q: %w", n.ID, err) + } + metaStr = s + } + fields := [12]string{ + sanitizeTSV(n.ID), + sanitizeTSV(string(n.Kind)), + sanitizeTSV(n.Name), + sanitizeTSV(n.QualName), + sanitizeTSV(n.FilePath), + strconv.Itoa(n.StartLine), + strconv.Itoa(n.EndLine), + sanitizeTSV(n.Language), + sanitizeTSV(n.RepoPrefix), + sanitizeTSV(n.WorkspaceID), + sanitizeTSV(n.ProjectID), + metaStr, + } + for i, f := range fields { + if i > 0 { + if err := bw.WriteByte('\t'); err != nil { + return err + } + } + if _, err := bw.WriteString(f); err != nil { + return err + } + } + if err := bw.WriteByte('\n'); err != nil { + return err + } + } + return nil +} + +// writeEdgesTSV writes edges to a TSV file with FROM/TO ids in the +// first two columns (matching Kuzu's REL CSV convention) followed by +// the rel-table property columns in schema order. +func writeEdgesTSV(path string, edges []*graph.Edge) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer func() { _ = f.Close() }() + bw := bufio.NewWriterSize(f, 1<<20) + defer func() { _ = bw.Flush() }() + + for _, e := range edges { + metaStr := "" + if len(e.Meta) > 0 { + s, err := encodeMeta(e.Meta) + if err != nil { + return fmt.Errorf("encode meta for edge %q→%q: %w", e.From, e.To, err) + } + metaStr = s + } + crossRepo := "0" + if e.CrossRepo { + crossRepo = "1" + } + fields := [11]string{ + sanitizeTSV(e.From), + sanitizeTSV(e.To), + sanitizeTSV(string(e.Kind)), + sanitizeTSV(e.FilePath), + strconv.Itoa(e.Line), + strconv.FormatFloat(e.Confidence, 'g', -1, 64), + sanitizeTSV(e.ConfidenceLabel), + sanitizeTSV(e.Origin), + sanitizeTSV(e.Tier), + crossRepo, + metaStr, + } + for i, f := range fields { + if i > 0 { + if err := bw.WriteByte('\t'); err != nil { + return err + } + } + if _, err := bw.WriteString(f); err != nil { + return err + } + } + if err := bw.WriteByte('\n'); err != nil { + return err + } + } + return nil +} + +// sanitizeTSV strips bytes that would corrupt a tab-separated record — +// tabs become spaces, CR/LF become spaces. Code identifiers, qualified +// names, file paths, and base64-encoded meta strings never contain +// these in practice; the sanitiser exists to guarantee a malformed +// extractor output can't break the cold-load path. +func sanitizeTSV(s string) string { + if !strings.ContainsAny(s, "\t\r\n") { + return s + } + b := make([]byte, 0, len(s)) + for i := 0; i < len(s); i++ { + c := s[i] + switch c { + case '\t', '\r', '\n': + b = append(b, ' ') + default: + b = append(b, c) + } + } + return string(b) +} + +// escapeCypherStringLit escapes a string for safe use inside a Cypher +// single-quoted literal — turns ' into \' and \ into \\. Used for +// COPY FROM paths, which are templated into the Cypher query (no +// parameter binding for COPY paths in the current Kuzu binding). +func escapeCypherStringLit(s string) string { + s = strings.ReplaceAll(s, `\`, `\\`) + s = strings.ReplaceAll(s, `'`, `\'`) + return s +} diff --git a/internal/graph/store_ladybug/store_meta.go b/internal/graph/store_ladybug/store_meta.go new file mode 100644 index 00000000..7713f2fc --- /dev/null +++ b/internal/graph/store_ladybug/store_meta.go @@ -0,0 +1,42 @@ +package store_ladybug + +import ( + "bytes" + "encoding/base64" + "encoding/gob" +) + +// encodeMeta serialises a Meta map to a base64-encoded gob frame. +// Empty / nil maps become the empty string so the common case stays +// cheap to store. base64 is required because the Go binding reads +// BLOB columns through strlen(), which would truncate at the first +// NUL byte that gob encoding routinely emits. +func encodeMeta(m map[string]any) (string, error) { + if len(m) == 0 { + return "", nil + } + var buf bytes.Buffer + if err := gob.NewEncoder(&buf).Encode(m); err != nil { + return "", err + } + return base64.StdEncoding.EncodeToString(buf.Bytes()), nil +} + +// decodeMeta is the inverse of encodeMeta. +func decodeMeta(s string) (map[string]any, error) { + if s == "" { + return nil, nil + } + raw, err := base64.StdEncoding.DecodeString(s) + if err != nil { + return nil, err + } + if len(raw) == 0 { + return nil, nil + } + var m map[string]any + if err := gob.NewDecoder(bytes.NewReader(raw)).Decode(&m); err != nil { + return nil, err + } + return m, nil +} diff --git a/internal/graph/store_ladybug/store_query.go b/internal/graph/store_ladybug/store_query.go new file mode 100644 index 00000000..03eba1c3 --- /dev/null +++ b/internal/graph/store_ladybug/store_query.go @@ -0,0 +1,180 @@ +package store_ladybug + +import ( + "fmt" + "os" + "strings" + + lbug "github.com/LadybugDB/go-ladybug" +) + +// runWriteLocked executes a write-shaped Cypher statement under the +// caller-held writeMu. Panics on a genuine engine error (closed +// connection / schema mismatch / disk-full) — graph.Store has no +// error channel and the in-memory store can't fail either, so a +// fatal storage failure cannot be ignored. +func (s *Store) runWriteLocked(query string, args map[string]any) { + res, release, err := s.executeOrQuery(query, args) + if err != nil { + panicOnFatal(err) + return + } + res.Close() + release() +} + +// querySelect runs a read-shaped Cypher statement and materialises +// every row before returning. The connection pool gives each +// caller its own private connection so concurrent reads no longer +// need a serialisation mutex — every per-repo Indexer's +// NodeCount / shadow-swap probe runs in parallel. +// +// We still consume the iterator before releasing the connection +// to the pool — open iterators hold the kuzu_query handle and +// the connection isn't safe to reuse until the result is closed. +func (s *Store) querySelect(query string, args map[string]any) [][]any { + // RLock excludes the read from the window any writer (COPY / MERGE / + // DELETE) holds the exclusive Lock — a read on a sibling pooled + // connection while a COPY extends the .lbug file is the source of + // both the "Cannot read N bytes" IO exceptions and the harder + // lbug_connection_query SIGSEGV. Concurrent reads still run in + // parallel; only a write blocks them. Callers that already hold the + // write Lock must route through querySelectLocked, which skips this + // acquisition (an RWMutex is not reentrant). + s.writeMu.RLock() + defer s.writeMu.RUnlock() + return s.querySelectInner(query, args) +} + +// querySelectInner is the unlocked body shared between querySelect +// (locks) and querySelectLocked (caller already holds writeMu). +// +// Engine errors on the read path are logged + the partial-or-empty +// row buffer is returned instead of panicking. A read failure here +// is almost always a transient Kuzu IO exception (e.g. a buffer-pool +// read landing in the middle of a concurrent COPY's file extension — +// "Cannot read N bytes at position M") and used to kill the daemon +// via panicOnFatal. The graph.Store interface still has no error +// channel so we can't bubble it up; degrading to an empty result on +// reads gives the caller a recoverable "looks like the symbol has +// no edges right now" path while the daemon stays up. Write paths +// (runWriteLocked) keep panic semantics because a write failure +// means the graph is now inconsistent and continuing would corrupt +// subsequent state. +func (s *Store) querySelectInner(query string, args map[string]any) [][]any { + res, release, err := s.executeOrQuery(query, args) + if err != nil { + readPathLogf("executeOrQuery: %v (query=%q)", err, firstLine(query)) + return nil + } + defer release() + defer res.Close() + var rows [][]any + for res.HasNext() { + tup, err := res.Next() + if err != nil { + readPathLogf("Next: %v (query=%q rows=%d)", err, firstLine(query), len(rows)) + return rows + } + vals, err := tup.GetAsSlice() + if err != nil { + tup.Close() + readPathLogf("GetAsSlice: %v (query=%q rows=%d)", err, firstLine(query), len(rows)) + return rows + } + rows = append(rows, vals) + tup.Close() + } + return rows +} + +// readPathLogf emits a degraded-read warning to stderr (which the +// daemon redirects to its log file). Format: a single line prefixed +// with `store_ladybug: read degraded:` so log scrapers can find these +// without parsing JSON. We deliberately avoid the structured zap +// logger here — the Store has no logger reference and threading one +// through every callsite would be a much larger change than this +// hot-path fix is meant to be. +func readPathLogf(format string, args ...any) { + msg := fmt.Sprintf(format, args...) + _, _ = fmt.Fprintf(os.Stderr, "store_ladybug: read degraded: %s\n", msg) +} + +// querySelectLocked is querySelect for callers that already hold +// writeMu. Routes to the same unlocked body querySelect uses +// (re-acquiring writeMu would deadlock). +func (s *Store) querySelectLocked(query string, args map[string]any) [][]any { + return s.querySelectInner(query, args) +} + +// executeOrQuery hides the prepared-vs-direct distinction. KuzuDB +// requires the Prepare → Execute path for parameterised statements; +// a bare Query with `$arg` placeholders is rejected. Statements +// without parameters fall through to a direct Query for clarity. +// +// Borrows a connection from s.pool so concurrent calls don't race +// in cgo. Returns a release function the caller MUST defer — the +// connection cannot return to the pool until the QueryResult has +// been fully consumed (open iterators hold the kuzu_query handle +// on the borrowed connection). Falls back to the setup s.conn if +// the pool isn't ready (test fixtures that construct Store{} +// directly); release() is a no-op in that case. +func (s *Store) executeOrQuery(query string, args map[string]any) (*lbug.QueryResult, func(), error) { + conn := s.conn + release := func() {} + // discard pulls a connection OUT of circulation on error instead of + // recycling it — a connection that errored mid-statement (a failed + // COPY in particular) can be left poisoned, and reusing it makes a + // later Prepare on an unrelated goroutine panic with "mutex lock + // failed: Invalid argument". Falls back to a no-op for the + // non-pooled setup connection (test fixtures) where there's nothing + // to replace. + discard := func() {} + if s.pool != nil { + conn = s.pool.get() + release = func() { s.pool.put(conn) } + discard = func() { s.pool.discard(conn) } + } + if len(args) == 0 { + res, err := conn.Query(query) + if err != nil { + discard() + return nil, func() {}, err + } + return res, release, nil + } + stmt, err := conn.Prepare(query) + if err != nil { + discard() + return nil, func() {}, fmt.Errorf("prepare: %w", err) + } + defer stmt.Close() + res, err := conn.Execute(stmt, args) + if err != nil { + discard() + return nil, func() {}, err + } + return res, release, nil +} + +// panicOnFatal turns a non-nil engine error into a panic so callers +// see catastrophic failures. The graph.Store interface deliberately +// does not surface errors — it mirrors the in-memory store's +// "everything succeeds" contract — so a fatal storage failure +// cannot be silently dropped. +func panicOnFatal(err error) { + if err == nil { + return + } + panic(fmt.Errorf("store_ladybug: %w", err)) +} + +// firstLine is a small helper for trimming a multi-line Cypher +// statement to its first non-empty line for use in error messages. +func firstLine(s string) string { + s = strings.TrimSpace(s) + if i := strings.IndexByte(s, '\n'); i >= 0 { + return strings.TrimSpace(s[:i]) + } + return s +} diff --git a/internal/graph/store_ladybug/store_read.go b/internal/graph/store_ladybug/store_read.go new file mode 100644 index 00000000..527f725b --- /dev/null +++ b/internal/graph/store_ladybug/store_read.go @@ -0,0 +1,460 @@ +package store_ladybug + +import ( + "iter" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// GetNode returns the node with the given id, or nil if absent. +// +// Uses the WHERE form on the PK to match the rest of the read +// surface (GetInEdges, FindNodesByName, GetFileSubGraph etc.) — +// the inline `{id: $id}` shape has been observed to return empty +// under concurrent writers when the planner picks a plan that +// doesn't survive a buffer-pool refresh. +func (s *Store) GetNode(id string) *graph.Node { + const q = `MATCH (n:Node) WHERE n.id = $id RETURN ` + nodeReturnCols + ` LIMIT 1` + rows := s.querySelect(q, map[string]any{"id": id}) + if len(rows) == 0 { + return nil + } + return rowToNode(rows[0]) +} + +// GetNodeByQualName returns the first node whose qual_name matches, +// or nil if absent / empty. +func (s *Store) GetNodeByQualName(qualName string) *graph.Node { + if qualName == "" { + return nil + } + const q = `MATCH (n:Node) WHERE n.qual_name = $q RETURN ` + nodeReturnCols + ` LIMIT 1` + rows := s.querySelect(q, map[string]any{"q": qualName}) + if len(rows) == 0 { + return nil + } + return rowToNode(rows[0]) +} + +// FindNodesByName returns every node whose Name matches. +// +// The predicate is expressed as an outer `WHERE n.name = $name` +// instead of an inline `(n:Node {name: $name})`. Same shape as the +// GetInEdges fix elsewhere in this file: the inline-property form on +// a non-PK column has been observed to return empty rows under +// concurrent writers (the planner picks a plan that doesn't survive +// a buffer-pool refresh), while the WHERE form goes through the +// straightforward filter scan and stays correct. Both forms hit the +// same name index on Kuzu's side, so there is no measurable cost +// difference — only the correctness gap. +// +// This is the inbound-lookup the resolver's resolveMethodCall path +// uses via FindNodesByNameInRepo; an empty result there leaves the +// caller→method edge as `unresolved::Foo`, which is why +// `find_usages` on `Graph.AddNode` returned zero callers despite +// dozens of `g.AddNode(...)` call sites. +func (s *Store) FindNodesByName(name string) []*graph.Node { + // Note: an earlier revision routed this through s.nameIdx with a + // lazy bootstrap that ran a full Cypher scan. Under the parallel + // warmup's per-repo IndexCtx pressure, the bootstrap Cypher + // running concurrently with other Cypher writers tickled a + // liblbug-side semasleep panic that crashed the daemon + // mid-warmup. Keeping FindNodesByName on the engine path + // preserves the correctness contract — the resolver's per-edge + // lookup still hits Kuzu's secondary name index — and SearchSymbols + // continues to consult s.nameIdx directly via lookupNodes for its + // tier-0 fast path. + const q = `MATCH (n:Node) WHERE n.name = $name RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"name": name}) + return rowsToNodes(rows) +} + +// FindNodesByNameInRepo restricts FindNodesByName to one repo prefix. +// Same WHERE-clause rationale as FindNodesByName above — the inline +// two-property `{name: ..., repo_prefix: ...}` form was the resolver's +// primary call-edge lookup and the most likely culprit behind +// "method has obvious callers in source but find_usages returns 0". +func (s *Store) FindNodesByNameInRepo(name, repoPrefix string) []*graph.Node { + const q = `MATCH (n:Node) WHERE n.name = $name AND n.repo_prefix = $repo RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"name": name, "repo": repoPrefix}) + return rowsToNodes(rows) +} + +// FindNodesByNameContaining pushes the case-insensitive substring +// filter into a single Cypher MATCH so only matching rows cross the +// cgo boundary. Replaces the pre-existing search-substring fallback +// pattern of AllNodes()-then-filter (which materialised the entire +// node table per call — 68k rows for gortex's own graph; orders of +// magnitude more on Linux-kernel-sized indexes). +// +// Ladybug's CONTAINS is not backed by an index here, so the cost is +// still a server-side scan — but the row count crossing cgo is bound +// to the matching subset rather than every node in the graph, and the +// scan happens inside the engine's hot path rather than over a Go +// for-loop. limit caps the result; 0 means "no limit". +func (s *Store) FindNodesByNameContaining(substr string, limit int) []*graph.Node { + if substr == "" { + return nil + } + // LOWER(...) on both sides keeps the match case-insensitive; the + // graph treats `Login` / `login` as distinct names but a substring + // fallback wants to surface both. ToLower in Go before the bind so + // the engine never has to call LOWER on the literal. + needle := strings.ToLower(substr) + if limit > 0 { + const q = `MATCH (n:Node) WHERE LOWER(n.name) CONTAINS $q RETURN ` + nodeReturnCols + ` LIMIT $k` + rows := s.querySelect(q, map[string]any{"q": needle, "k": int64(limit)}) + return rowsToNodes(rows) + } + const q = `MATCH (n:Node) WHERE LOWER(n.name) CONTAINS $q RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"q": needle}) + return rowsToNodes(rows) +} + +// GetFileNodes returns every node anchored to filePath. +func (s *Store) GetFileNodes(filePath string) []*graph.Node { + // Fast path via the Go-side file→id accelerator: hand the ids + // straight to a primary-key MATCH so Kuzu uses the HASH PK + // index instead of full-scanning Node to find a missing + // file_path secondary index. + if s.fileIDs != nil { + ids := s.fileIDs.idsFor(filePath) + if len(ids) == 0 { + return nil + } + const q = `MATCH (n:Node) WHERE n.id IN $ids RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"ids": stringSliceToAny(ids)}) + return rowsToNodes(rows) + } + const q = `MATCH (n:Node) WHERE n.file_path = $f RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"f": filePath}) + return rowsToNodes(rows) +} + +// GetRepoNodes returns every node in the given repo prefix. +func (s *Store) GetRepoNodes(repoPrefix string) []*graph.Node { + const q = `MATCH (n:Node) WHERE n.repo_prefix = $r RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"r": repoPrefix}) + return rowsToNodes(rows) +} + +// GetOutEdges returns every edge whose From matches nodeID. Uses +// WHERE-form on the PK to match the GetInEdges / GetNode contract — +// the inline `{id: $id}` shape has been observed to return empty +// rows under concurrent writers. +func (s *Store) GetOutEdges(nodeID string) []*graph.Edge { + const q = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE a.id = $id RETURN ` + edgeReturnCols + rows := s.querySelect(q, map[string]any{"id": nodeID}) + return rowsToEdges(rows) +} + +// GetRepoEdges returns every edge whose source node has the given +// RepoPrefix. Implemented as one Cypher MATCH over the (Node)-[Edge]-> +// pattern with a source-side repo_prefix filter — equivalent to the +// GetRepoNodes × GetOutEdges nested walk callers used before, but +// drives the join inside the engine. Eliminates the per-source-node +// query round-trip that dominates Ladybug warmup on multi-repo +// workspaces (one extractor call against gortex's ~68k repo nodes +// previously fired ~68k Cypher queries). +func (s *Store) GetRepoEdges(repoPrefix string) []*graph.Edge { + if repoPrefix == "" { + return nil + } + const q = `MATCH (a:Node {repo_prefix: $r})-[e:Edge]->(b:Node) RETURN ` + edgeReturnCols + rows := s.querySelect(q, map[string]any{"r": repoPrefix}) + return rowsToEdges(rows) +} + +// GetInEdges returns every edge whose To matches nodeID. +// +// The target predicate is expressed as `WHERE b.id = $id`, not an +// inline `(b:Node {id: $id})` property match on the arrow target. +// On a populated workspace the inline form silently returns zero rows +// — the Kuzu planner skips the primary-key probe on the rel-table +// target side and the join collapses to empty. Find_usages / +// get_callers / analyze[cycles] / suggest_pattern all funnel through +// this single primitive, so the empty result cascades into a +// false-positive "no incoming references" verdict across the agent +// surface. Aligning the shape with GetInEdgesByNodeIDs' working +// `WHERE b.id IN $ids` keeps the planner on the same code path that +// the batched sibling exercises (and that the conformance suite +// covers). +func (s *Store) GetInEdges(nodeID string) []*graph.Edge { + const q = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE b.id = $id RETURN ` + edgeReturnCols + rows := s.querySelect(q, map[string]any{"id": nodeID}) + return rowsToEdges(rows) +} + +// GetOutEdgesByNodeIDs returns a map id→outgoing edges for every input +// id. One Cypher round-trip drives a `WHERE a.id IN $ids` match — the +// rerank hot path collapses ~30 per-candidate GetOutEdges calls into +// this single batched query (15ms cgo round-trip × 30 = ~450ms saved +// per search_symbols on ladybug). Missing nodes are absent from the +// returned map; empty input returns nil. +func (s *Store) GetOutEdgesByNodeIDs(ids []string) map[string][]*graph.Edge { + if len(ids) == 0 { + return nil + } + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + const q = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE a.id IN $ids RETURN ` + edgeReturnCols + rows := s.querySelect(q, map[string]any{"ids": stringSliceToAny(uniq)}) + out := make(map[string][]*graph.Edge, len(uniq)) + for _, r := range rows { + e := rowToEdge(r) + if e == nil { + continue + } + out[e.From] = append(out[e.From], e) + } + return out +} + +// GetInEdgesByNodeIDs is the inbound sibling of GetOutEdgesByNodeIDs. +// See that doc-comment for the contract. +func (s *Store) GetInEdgesByNodeIDs(ids []string) map[string][]*graph.Edge { + if len(ids) == 0 { + return nil + } + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + const q = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE b.id IN $ids RETURN ` + edgeReturnCols + rows := s.querySelect(q, map[string]any{"ids": stringSliceToAny(uniq)}) + out := make(map[string][]*graph.Edge, len(uniq)) + for _, r := range rows { + e := rowToEdge(r) + if e == nil { + continue + } + out[e.To] = append(out[e.To], e) + } + return out +} + +// AllNodes materialises every node into a slice. +func (s *Store) AllNodes() []*graph.Node { + const q = `MATCH (n:Node) RETURN ` + nodeReturnCols + rows := s.querySelect(q, nil) + return rowsToNodes(rows) +} + +// AllEdges materialises every edge into a slice. +func (s *Store) AllEdges() []*graph.Edge { + const q = `MATCH (a:Node)-[e:Edge]->(b:Node) RETURN ` + edgeReturnCols + rows := s.querySelect(q, nil) + return rowsToEdges(rows) +} + +// EdgesByKind yields every edge whose Kind matches. The query +// materialises into a slice before yielding so the caller's body is +// free to make re-entrant store calls (the connection is held +// exclusively by an open kuzu_query_result and a re-entrant write +// would deadlock). +func (s *Store) EdgesByKind(kind graph.EdgeKind) iter.Seq[*graph.Edge] { + return func(yield func(*graph.Edge) bool) { + const q = `MATCH (a:Node)-[e:Edge {kind: $kind}]->(b:Node) RETURN ` + edgeReturnCols + rows := s.querySelect(q, map[string]any{"kind": string(kind)}) + for _, r := range rows { + e := rowToEdge(r) + if e == nil { + continue + } + if !yield(e) { + return + } + } + } +} + +// EdgesByKinds yields every edge whose Kind is in the supplied set, +// in a single backend round-trip. One Cypher query with a kind IN-list +// replaces the N independent EdgesByKind queries the edge-driven +// analyzers (channel_ops, pubsub, k8s_resources, kustomize, …) +// otherwise need when they care about 2-5 kinds at once. Materialises +// the row set before yielding for the same reentrancy reason as +// EdgesByKind. +// +// Empty kinds yields nothing — matches the in-memory reference and +// avoids handing Kuzu's planner an empty IN-list (which it tolerates +// but plans badly). +func (s *Store) EdgesByKinds(kinds []graph.EdgeKind) iter.Seq[*graph.Edge] { + return func(yield func(*graph.Edge) bool) { + uniq := dedupeEdgeKinds(kinds) + if len(uniq) == 0 { + return + } + const q = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE e.kind IN $kinds RETURN ` + edgeReturnCols + rows := s.querySelect(q, map[string]any{"kinds": edgeKindSliceToAny(uniq)}) + for _, r := range rows { + e := rowToEdge(r) + if e == nil { + continue + } + if !yield(e) { + return + } + } + } +} + +// NodesByKind yields every node whose Kind matches. +func (s *Store) NodesByKind(kind graph.NodeKind) iter.Seq[*graph.Node] { + return func(yield func(*graph.Node) bool) { + const q = `MATCH (n:Node) WHERE n.kind = $kind RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"kind": string(kind)}) + for _, r := range rows { + n := rowToNode(r) + if n == nil { + continue + } + if !yield(n) { + return + } + } + } +} + +// EdgesWithUnresolvedTarget yields every edge whose To begins with +// "unresolved::". The COPY-time rewrite in copyBulkLocked preserves +// this prefix in the multi-repo form (`unresolved::::`), +// so a single STARTS WITH still catches every form without paying +// for an index-killing CONTAINS scan. +func (s *Store) EdgesWithUnresolvedTarget() iter.Seq[*graph.Edge] { + return func(yield func(*graph.Edge) bool) { + const q = `MATCH (a:Node)-[e:Edge]->(b:Node) WHERE b.id STARTS WITH 'unresolved::' RETURN ` + edgeReturnCols + rows := s.querySelect(q, nil) + for _, r := range rows { + e := rowToEdge(r) + if e == nil { + continue + } + if !yield(e) { + return + } + } + } +} + +// GetNodesByIDs returns a map id→*Node for every input ID present. +// IDs not in the store are absent from the returned map. +func (s *Store) GetNodesByIDs(ids []string) map[string]*graph.Node { + if len(ids) == 0 { + return nil + } + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + // IN $ids on the indexed PK collapses N point lookups into one + // Cypher statement. + const q = `MATCH (n:Node) WHERE n.id IN $ids RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"ids": stringSliceToAny(uniq)}) + out := make(map[string]*graph.Node, len(uniq)) + for _, r := range rows { + n := rowToNode(r) + if n == nil { + continue + } + out[n.ID] = n + } + return out +} + +// frontierRowCap bounds the adjacency rows ExpandFrontier materialises +// per call, derived from the caller's node limit with a generous fan +// multiplier: a normal node's full adjacency is never truncated, while a +// routing hub (precisely what a natural-language "architecture" query +// selects) can no longer stall the daemon by dragging its entire fan-out +// across the cgo boundary. ORDER BY id in the query makes any truncation +// deterministic, so a smart_context manifest pack-root stays stable. +func frontierRowCap(limit int) int { + const fanMultiple, floor, ceil = 8, 256, 4096 + switch { + case limit <= 0: + return ceil + case limit*fanMultiple < floor: + return floor + case limit*fanMultiple > ceil: + return ceil + default: + return limit * fanMultiple + } +} + +// frontierOutQuery / frontierInQuery return, in one round-trip, every +// adjacent edge of the frontier (of the given kinds) plus the neighbour +// node's columns — unresolved/external targets filtered server-side +// (both id encodings, see graph.IsUnresolvedTarget), ordered for +// deterministic truncation, meta omitted. +const frontierOutQuery = `MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE a.id IN $ids AND e.kind IN $kinds + AND NOT (b.id STARTS WITH 'unresolved::' OR b.id CONTAINS '::unresolved::' OR b.id STARTS WITH 'external::') +RETURN ` + frontierEdgeCols + `, b.kind, b.name, b.qual_name, b.file_path, b.start_line, b.end_line, b.language, b.repo_prefix, b.workspace_id, b.project_id +ORDER BY b.id LIMIT $k` + +const frontierInQuery = `MATCH (a:Node)-[e:Edge]->(b:Node) +WHERE b.id IN $ids AND e.kind IN $kinds + AND NOT (a.id STARTS WITH 'unresolved::' OR a.id CONTAINS '::unresolved::' OR a.id STARTS WITH 'external::') +RETURN ` + frontierEdgeCols + `, a.kind, a.name, a.qual_name, a.file_path, a.start_line, a.end_line, a.language, a.repo_prefix, a.workspace_id, a.project_id +ORDER BY a.id LIMIT $k` + +// ExpandFrontier implements graph.FrontierExpander: one Cypher +// round-trip returns the frontier's edges of the given kinds plus the +// neighbour node columns, so the caller needs no GetNode per edge. +func (s *Store) ExpandFrontier(ids []string, forward bool, kinds []graph.EdgeKind, limit int) []graph.FrontierHop { + if len(ids) == 0 || len(kinds) == 0 { + return nil + } + uniq := dedupeNonEmpty(ids) + if len(uniq) == 0 { + return nil + } + kindAny := make([]any, 0, len(kinds)) + for _, k := range kinds { + kindAny = append(kindAny, string(k)) + } + q := frontierOutQuery + if !forward { + q = frontierInQuery + } + rows := s.querySelect(q, map[string]any{ + "ids": stringSliceToAny(uniq), + "kinds": kindAny, + "k": int64(frontierRowCap(limit)), + }) + hops := make([]graph.FrontierHop, 0, len(rows)) + for _, r := range rows { + if h, ok := frontierHopFromRow(r, forward); ok { + hops = append(hops, h) + } + } + return hops +} + +// FindNodesByNames returns a map name→[]*Node for every input name. +// Names that match no node are absent from the returned map. +func (s *Store) FindNodesByNames(names []string) map[string][]*graph.Node { + if len(names) == 0 { + return nil + } + uniq := dedupeNonEmpty(names) + if len(uniq) == 0 { + return nil + } + const q = `MATCH (n:Node) WHERE n.name IN $names RETURN ` + nodeReturnCols + rows := s.querySelect(q, map[string]any{"names": stringSliceToAny(uniq)}) + out := make(map[string][]*graph.Node, len(uniq)) + for _, r := range rows { + n := rowToNode(r) + if n == nil { + continue + } + out[n.Name] = append(out[n.Name], n) + } + return out +} diff --git a/internal/graph/store_ladybug/store_rows.go b/internal/graph/store_ladybug/store_rows.go new file mode 100644 index 00000000..a6bc279c --- /dev/null +++ b/internal/graph/store_ladybug/store_rows.go @@ -0,0 +1,199 @@ +package store_ladybug + +import "github.com/zzet/gortex/internal/graph" + +// nodeReturnCols is the canonical projection for Node rows, ordered +// to match rowToNode's index reads. +const nodeReturnCols = `n.id, n.kind, n.name, n.qual_name, n.file_path, n.start_line, n.end_line, n.language, n.repo_prefix, n.workspace_id, n.project_id, n.meta` + +// edgeReturnCols is the canonical projection for Edge rows, ordered +// to match rowToEdge's index reads. +const edgeReturnCols = `a.id, b.id, e.kind, e.file_path, e.line, e.confidence, e.confidence_label, e.origin, e.tier, e.cross_repo, e.meta` + +// frontierEdgeCols is edgeReturnCols without e.meta — bfs / get_callers / +// get_callchain never read Edge.Meta, and gob-decoding it per row is what +// makes a wide fan-out expensive. Index order matches frontierHopFromRow. +const frontierEdgeCols = `a.id, b.id, e.kind, e.file_path, e.line, e.confidence, e.confidence_label, e.origin, e.tier, e.cross_repo` + +func rowToNode(row []any) *graph.Node { + if len(row) < 12 { + return nil + } + n := &graph.Node{} + n.ID, _ = row[0].(string) + kind, _ := row[1].(string) + n.Kind = graph.NodeKind(kind) + n.Name, _ = row[2].(string) + n.QualName, _ = row[3].(string) + n.FilePath, _ = row[4].(string) + n.StartLine = int(asInt64(row[5])) + n.EndLine = int(asInt64(row[6])) + n.Language, _ = row[7].(string) + n.RepoPrefix, _ = row[8].(string) + n.WorkspaceID, _ = row[9].(string) + n.ProjectID, _ = row[10].(string) + metaStr, _ := row[11].(string) + if metaStr != "" { + m, err := decodeMeta(metaStr) + if err == nil { + n.Meta = m + } + } + return n +} + +func rowsToNodes(rows [][]any) []*graph.Node { + out := make([]*graph.Node, 0, len(rows)) + for _, r := range rows { + if n := rowToNode(r); n != nil { + out = append(out, n) + } + } + return out +} + +func rowToEdge(row []any) *graph.Edge { + if len(row) < 11 { + return nil + } + e := &graph.Edge{} + e.From, _ = row[0].(string) + e.To, _ = row[1].(string) + kind, _ := row[2].(string) + e.Kind = graph.EdgeKind(kind) + e.FilePath, _ = row[3].(string) + e.Line = int(asInt64(row[4])) + if v, ok := row[5].(float64); ok { + e.Confidence = v + } + e.ConfidenceLabel, _ = row[6].(string) + e.Origin, _ = row[7].(string) + e.Tier, _ = row[8].(string) + e.CrossRepo = asInt64(row[9]) != 0 + metaStr, _ := row[10].(string) + if metaStr != "" { + m, err := decodeMeta(metaStr) + if err == nil { + e.Meta = m + } + } + return e +} + +func rowsToEdges(rows [][]any) []*graph.Edge { + out := make([]*graph.Edge, 0, len(rows)) + for _, r := range rows { + if e := rowToEdge(r); e != nil { + out = append(out, e) + } + } + return out +} + +// frontierHopFromRow decodes one ExpandFrontier row: cols 0..9 are the +// edge (frontierEdgeCols, no meta), cols 10..19 the neighbour node's +// columns (kind, name, qual_name, file_path, start_line, end_line, +// language, repo_prefix, workspace_id, project_id — no meta). The +// neighbour id is the far end of the stored edge: To for an outgoing +// (forward) hop, From for incoming. +func frontierHopFromRow(row []any, forward bool) (graph.FrontierHop, bool) { + if len(row) < 20 { + return graph.FrontierHop{}, false + } + e := &graph.Edge{} + e.From, _ = row[0].(string) + e.To, _ = row[1].(string) + kind, _ := row[2].(string) + e.Kind = graph.EdgeKind(kind) + e.FilePath, _ = row[3].(string) + e.Line = int(asInt64(row[4])) + if v, ok := row[5].(float64); ok { + e.Confidence = v + } + e.ConfidenceLabel, _ = row[6].(string) + e.Origin, _ = row[7].(string) + e.Tier, _ = row[8].(string) + e.CrossRepo = asInt64(row[9]) != 0 + + n := &graph.Node{} + if forward { + n.ID = e.To + } else { + n.ID = e.From + } + knd, _ := row[10].(string) + n.Kind = graph.NodeKind(knd) + n.Name, _ = row[11].(string) + n.QualName, _ = row[12].(string) + n.FilePath, _ = row[13].(string) + n.StartLine = int(asInt64(row[14])) + n.EndLine = int(asInt64(row[15])) + n.Language, _ = row[16].(string) + n.RepoPrefix, _ = row[17].(string) + n.WorkspaceID, _ = row[18].(string) + n.ProjectID, _ = row[19].(string) + return graph.FrontierHop{Edge: e, Neighbor: n}, true +} + +// asInt64 normalises every integer-shaped value the KuzuDB binding +// might hand back (int8, int16, int32, int64, plus their unsigned +// counterparts and the plain `int`). The rel/node columns we read +// were all declared as INT64 in schema.go, but the binding +// occasionally returns smaller widths for results coming out of +// count() aggregates so we cover the full set. +func asInt64(v any) int64 { + switch t := v.(type) { + case int64: + return t + case int32: + return int64(t) + case int16: + return int64(t) + case int8: + return int64(t) + case int: + return int64(t) + case uint64: + return int64(t) + case uint32: + return int64(t) + case uint16: + return int64(t) + case uint8: + return int64(t) + case uint: + return int64(t) + case float64: + return int64(t) + default: + return 0 + } +} + +func dedupeNonEmpty(in []string) []string { + seen := make(map[string]struct{}, len(in)) + out := make([]string, 0, len(in)) + for _, s := range in { + if s == "" { + continue + } + if _, ok := seen[s]; ok { + continue + } + seen[s] = struct{}{} + out = append(out, s) + } + return out +} + +// stringSliceToAny converts a typed string slice into the []any form +// the KuzuDB Go binding expects when binding a Cypher list +// parameter (the binding cannot infer a list type from a strongly +// typed slice — it walks each element through goValueToKuzuValue). +func stringSliceToAny(in []string) []any { + out := make([]any, len(in)) + for i, s := range in { + out[i] = s + } + return out +} diff --git a/internal/graph/store_ladybug/store_stats.go b/internal/graph/store_ladybug/store_stats.go new file mode 100644 index 00000000..cfd350ad --- /dev/null +++ b/internal/graph/store_ladybug/store_stats.go @@ -0,0 +1,172 @@ +package store_ladybug + +import "github.com/zzet/gortex/internal/graph" + +func (s *Store) NodeCount() int { + rows := s.querySelect(`MATCH (n:Node) RETURN count(n)`, nil) + if len(rows) == 0 { + return 0 + } + n, _ := rows[0][0].(int64) + return int(n) +} + +func (s *Store) EdgeCount() int { + rows := s.querySelect(`MATCH ()-[e:Edge]->() RETURN count(e)`, nil) + if len(rows) == 0 { + return 0 + } + n, _ := rows[0][0].(int64) + return int(n) +} + +func (s *Store) Stats() graph.GraphStats { + st := graph.GraphStats{ + ByKind: map[string]int{}, + ByLanguage: map[string]int{}, + } + st.TotalNodes = s.NodeCount() + st.TotalEdges = s.EdgeCount() + + rows := s.querySelect(`MATCH (n:Node) RETURN n.kind, count(n)`, nil) + for _, r := range rows { + kind, _ := r[0].(string) + n, _ := r[1].(int64) + if kind == "" { + continue + } + st.ByKind[kind] = int(n) + } + rows = s.querySelect(`MATCH (n:Node) RETURN n.language, count(n)`, nil) + for _, r := range rows { + lang, _ := r[0].(string) + n, _ := r[1].(int64) + if lang == "" { + continue + } + st.ByLanguage[lang] = int(n) + } + return st +} + +func (s *Store) RepoStats() map[string]graph.GraphStats { + out := map[string]graph.GraphStats{} + rows := s.querySelect(`MATCH (n:Node) WHERE n.repo_prefix <> '' RETURN n.repo_prefix, n.kind, n.language, count(n)`, nil) + for _, r := range rows { + repo, _ := r[0].(string) + kind, _ := r[1].(string) + lang, _ := r[2].(string) + n, _ := r[3].(int64) + if repo == "" { + continue + } + st, ok := out[repo] + if !ok { + st = graph.GraphStats{ByKind: map[string]int{}, ByLanguage: map[string]int{}} + } + st.TotalNodes += int(n) + st.ByKind[kind] += int(n) + st.ByLanguage[lang] += int(n) + out[repo] = st + } + rows = s.querySelect(` +MATCH (a:Node)-[e:Edge]->(:Node) +WHERE a.repo_prefix <> '' +RETURN a.repo_prefix, count(e)`, nil) + for _, r := range rows { + repo, _ := r[0].(string) + n, _ := r[1].(int64) + if repo == "" { + continue + } + st, ok := out[repo] + if !ok { + st = graph.GraphStats{ByKind: map[string]int{}, ByLanguage: map[string]int{}} + } + st.TotalEdges = int(n) + out[repo] = st + } + return out +} + +func (s *Store) RepoPrefixes() []string { + rows := s.querySelect(`MATCH (n:Node) WHERE n.repo_prefix <> '' RETURN DISTINCT n.repo_prefix`, nil) + out := make([]string, 0, len(rows)) + for _, r := range rows { + p, _ := r[0].(string) + if p == "" { + continue + } + out = append(out, p) + } + return out +} + +func (s *Store) EdgeIdentityRevisions() int { + return int(s.edgeIdentityRevs.Load()) +} + +// VerifyEdgeIdentities is a no-op for the KuzuDB backend: there is a +// single canonical row per edge in the rel table, so the "same +// pointer in both adjacency views" invariant the in-memory store +// upholds is trivially satisfied here — no walk can find a +// divergence to report. +func (s *Store) VerifyEdgeIdentities() error { return nil } + +const ( + perNodeByteEstimate = 256 + perEdgeByteEstimate = 128 +) + +func (s *Store) RepoMemoryEstimate(repoPrefix string) graph.RepoMemoryEstimate { + var est graph.RepoMemoryEstimate + rows := s.querySelect(`MATCH (n:Node) WHERE n.repo_prefix = $r RETURN count(n)`, map[string]any{"r": repoPrefix}) + if len(rows) == 0 { + return est + } + n, _ := rows[0][0].(int64) + rows = s.querySelect(` +MATCH (a:Node {repo_prefix: $r})-[e:Edge]->(:Node) +RETURN count(e)`, map[string]any{"r": repoPrefix}) + var e int64 + if len(rows) > 0 { + e, _ = rows[0][0].(int64) + } + est.NodeCount = int(n) + est.EdgeCount = int(e) + est.NodeBytes = uint64(n) * perNodeByteEstimate + est.EdgeBytes = uint64(e) * perEdgeByteEstimate + return est +} + +func (s *Store) AllRepoMemoryEstimates() map[string]graph.RepoMemoryEstimate { + out := map[string]graph.RepoMemoryEstimate{} + rows := s.querySelect(`MATCH (n:Node) WHERE n.repo_prefix <> '' RETURN n.repo_prefix, count(n)`, nil) + for _, r := range rows { + repo, _ := r[0].(string) + n, _ := r[1].(int64) + if repo == "" { + continue + } + est := out[repo] + est.NodeCount = int(n) + est.NodeBytes = uint64(n) * perNodeByteEstimate + out[repo] = est + } + rows = s.querySelect(` +MATCH (a:Node)-[e:Edge]->(:Node) +WHERE a.repo_prefix <> '' +RETURN a.repo_prefix, count(e)`, nil) + for _, r := range rows { + repo, _ := r[0].(string) + n, _ := r[1].(int64) + if repo == "" { + continue + } + est := out[repo] + est.EdgeCount = int(n) + est.EdgeBytes = uint64(n) * perEdgeByteEstimate + out[repo] = est + } + return out +} diff --git a/internal/graph/store_ladybug/store_test.go b/internal/graph/store_ladybug/store_test.go new file mode 100644 index 00000000..e1a9a338 --- /dev/null +++ b/internal/graph/store_ladybug/store_test.go @@ -0,0 +1,34 @@ +package store_ladybug_test + +import ( + "path/filepath" + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_ladybug" + "github.com/zzet/gortex/internal/graph/storetest" +) + +func TestLadybugStoreConformance(t *testing.T) { + storetest.RunConformance(t, func(t *testing.T) graph.Store { + dir := t.TempDir() + s, err := store_ladybug.Open(filepath.Join(dir, "test.kuzu")) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + return s + }) +} + +func TestLadybugBackendResolverConformance(t *testing.T) { + storetest.RunBackendResolverConformance(t, func(t *testing.T) graph.Store { + dir := t.TempDir() + s, err := store_ladybug.Open(filepath.Join(dir, "test.kuzu")) + if err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + return s + }) +} diff --git a/internal/graph/store_ladybug/store_write.go b/internal/graph/store_ladybug/store_write.go new file mode 100644 index 00000000..7476632a --- /dev/null +++ b/internal/graph/store_ladybug/store_write.go @@ -0,0 +1,653 @@ +package store_ladybug + +import ( + "fmt" + + "github.com/zzet/gortex/internal/graph" +) + +// AddNode inserts (or upserts) a node. Idempotent on the id PK — a +// second AddNode for the same id is a no-op except for any column +// updates the new value carries, matching the in-memory store's +// "last write wins" behaviour. +func (s *Store) AddNode(n *graph.Node) { + if n == nil || n.ID == "" { + return + } + // Bulk-load fast path: if a drain has called BeginBulkLoad, route + // this write into the bulk buffer instead of taking writeMu and + // running an UNWIND-MERGE. Otherwise contracts / clones / DI + // emission paths (commitInlinedContractToGraph and friends) that + // call AddNode directly during the bulk window would slip a live + // Node row in past the bulk's view, the bulk's subsequent COPY + // Node would re-insert the same ID, and Kuzu's COPY rejects the + // duplicate primary key — torpedoing the entire repo's index. + // AddBatch already uses this routing; AddNode/AddEdge needed to + // match. + s.bulkMu.Lock() + if s.bulkActive { + s.bulkNodes = append(s.bulkNodes, n) + s.bulkMu.Unlock() + return + } + s.bulkMu.Unlock() + s.writeMu.Lock() + defer s.writeMu.Unlock() + s.upsertNodeLocked(n) + s.writeGen.Add(1) +} + +func (s *Store) upsertNodeLocked(n *graph.Node) { + metaStr, err := encodeMeta(n.Meta) + if err != nil { + panicOnFatal(fmt.Errorf("encode meta: %w", err)) + return + } + if s.fileIDs != nil { + s.fileIDs.add(n.FilePath, n.ID) + } + if s.nameIdx != nil { + s.nameIdx.addNode(n) + } + // MERGE on id, then SET every column. This is the upsert pattern + // for KuzuDB — a bare CREATE on a duplicate PK raises a + // uniqueness violation; MERGE matches-or-creates without error. + const q = ` +MERGE (n:Node {id: $id}) +SET n.kind = $kind, + n.name = $name, + n.qual_name = $qual_name, + n.file_path = $file_path, + n.start_line = $start_line, + n.end_line = $end_line, + n.language = $language, + n.repo_prefix = $repo_prefix, + n.workspace_id = $workspace_id, + n.project_id = $project_id, + n.meta = $meta` + args := map[string]any{ + "id": n.ID, + "kind": string(n.Kind), + "name": n.Name, + "qual_name": n.QualName, + "file_path": n.FilePath, + "start_line": int64(n.StartLine), + "end_line": int64(n.EndLine), + "language": n.Language, + "repo_prefix": n.RepoPrefix, + "workspace_id": n.WorkspaceID, + "project_id": n.ProjectID, + "meta": metaStr, + } + s.runWriteLocked(q, args) +} + +// AddEdge inserts an edge. Idempotent on the (from, to, kind, +// file_path, line) tuple via MERGE. +func (s *Store) AddEdge(e *graph.Edge) { + if e == nil { + return + } + // Bulk-load fast path: mirror AddNode — during a drain's + // BeginBulkLoad / FlushBulk window, contract / clones / DI emission + // code calls AddEdge directly. Letting those slip through as a live + // MERGE while the bulk buffer still holds a duplicate of the same + // edge would re-trigger the COPY-Edge "duplicate primary key" / + // "unable to find primary key" classes the AddNode fix addresses. + s.bulkMu.Lock() + if s.bulkActive { + s.bulkEdges = append(s.bulkEdges, e) + s.bulkMu.Unlock() + return + } + s.bulkMu.Unlock() + s.writeMu.Lock() + defer s.writeMu.Unlock() + s.upsertEdgeLocked(e) + s.writeGen.Add(1) +} + +func (s *Store) upsertEdgeLocked(e *graph.Edge) { + metaStr, err := encodeMeta(e.Meta) + if err != nil { + panicOnFatal(fmt.Errorf("encode edge meta: %w", err)) + return + } + var crossRepo int64 + if e.CrossRepo { + crossRepo = 1 + } + // The in-memory store happily inserts edges whose endpoints + // haven't been registered with AddNode yet (the resolver writes + // edges to "unresolved::*" stubs that never have a corresponding + // node, and AllEdges is expected to surface them so the resolver + // can iterate them). KuzuDB's rel tables require both endpoints + // to exist in the node table, so we MERGE-stub the endpoints + // first; the MERGE is a no-op for ids the caller has already + // registered via AddNode. The stub nodes carry empty + // kind/name/file_path; if the caller later AddNode's them with + // real metadata, that upsert overwrites the columns in place. + s.mergeStubNodeLocked(e.From) + s.mergeStubNodeLocked(e.To) + // MERGE the rel on the identity tuple (from, to, kind, file_path, + // line). Idempotent — a second AddEdge with the same tuple + // updates the per-edge columns (confidence / origin / tier / + // meta) in place without creating a duplicate row. + const q = ` +MATCH (a:Node {id: $from}), (b:Node {id: $to}) +MERGE (a)-[e:Edge {kind: $kind, file_path: $file_path, line: $line}]->(b) +SET e.confidence = $confidence, + e.confidence_label = $confidence_label, + e.origin = $origin, + e.tier = $tier, + e.cross_repo = $cross_repo, + e.meta = $meta` + args := map[string]any{ + "from": e.From, + "to": e.To, + "kind": string(e.Kind), + "file_path": e.FilePath, + "line": int64(e.Line), + "confidence": e.Confidence, + "confidence_label": e.ConfidenceLabel, + "origin": e.Origin, + "tier": e.Tier, + "cross_repo": crossRepo, + "meta": metaStr, + } + s.runWriteLocked(q, args) +} + +// mergeStubNodeLocked ensures a Node row exists for id without +// overwriting any columns the caller may have set via a previous +// AddNode. We use MERGE … ON CREATE SET so an existing fully- +// populated node keeps its kind / name / file_path / etc., and a +// brand-new stub gets blank defaults the columns the schema +// initialises. +func (s *Store) mergeStubNodeLocked(id string) { + if id == "" { + return + } + const q = ` +MERGE (n:Node {id: $id}) +ON CREATE SET n.kind = '', + n.name = '', + n.qual_name = '', + n.file_path = '', + n.start_line = 0, + n.end_line = 0, + n.language = '', + n.repo_prefix = '', + n.workspace_id = '', + n.project_id = '', + n.meta = ''` + s.runWriteLocked(q, map[string]any{"id": id}) +} + +// AddBatch inserts a batch of nodes and edges. KuzuDB does not expose +// an explicit transaction API through the Go binding, and the +// conformance suite only verifies the post-batch counts — looping +// the per-call mutators is the safe path that satisfies the +// contract. Indexing scale will favour a UNWIND-driven batched +// MERGE once we wire the bench harness up; the per-loop variant +// keeps the conformance suite passing today. +// kuzuBatchChunkSize bounds the row count per UNWIND-driven +// Cypher statement. The Go binding round-trip is ~ms; per-record +// loops at indexer scale (124k+ nodes, 524k+ edges) take tens of +// minutes. UNWIND lets one statement carry a list of rows, so a +// 5000-row chunk amortises one Cypher parse + plan + Execute +// across N MERGEs. +const kuzuBatchChunkSize = 5000 + +// AddBatch fans node and edge inserts into UNWIND-driven Cypher +// statements — one Execute per ≤kuzuBatchChunkSize rows instead of +// one per record. The MERGE semantics match upsertNodeLocked / +// upsertEdgeLocked exactly so the conformance idempotency contract +// is preserved. +func (s *Store) AddBatch(nodes []*graph.Node, edges []*graph.Edge) { + if len(nodes) == 0 && len(edges) == 0 { + return + } + // Bulk-load fast path: buffer in memory, defer Cypher to FlushBulk. + // The buffer lock is held briefly only across the slice append — + // the indexer's parse workers can hammer AddBatch in parallel with + // minimal contention. + s.bulkMu.Lock() + if s.bulkActive { + s.bulkNodes = append(s.bulkNodes, nodes...) + s.bulkEdges = append(s.bulkEdges, edges...) + s.bulkMu.Unlock() + return + } + s.bulkMu.Unlock() + + s.writeMu.Lock() + defer s.writeMu.Unlock() + // Nodes use the UNWIND-MERGE batching path — safe because nodes + // carry no FK references, so the "unordered_map::at: key not + // found" crash that bites edge UNWIND can't fire here. Batching + // turns N upserts into ceil(N/chunk) Cypher calls — meaningful on + // Ladybug where each cgo round-trip costs ~1 ms. + if len(nodes) > 0 { + s.addNodesUnwindLocked(nodes) + } + // Edges stay on the per-call upsertEdgeLocked path: it stubs the + // endpoints with explicit MERGE before MERGEing the edge, which + // dodges the C++ panic the fork raises when UNWIND-MERGE sees an + // edge row whose endpoint id isn't yet in the node table. + for _, e := range edges { + if e == nil { + continue + } + s.upsertEdgeLocked(e) + } + s.writeGen.Add(1) +} + +// addNodesUnwindLocked materialises nodes as a list of structs and +// runs them through one UNWIND + MERGE per chunk. +func (s *Store) addNodesUnwindLocked(nodes []*graph.Node) { + if s.fileIDs != nil { + s.fileIDs.addNodes(nodes) + } + if s.nameIdx != nil { + s.nameIdx.addNodes(nodes) + } + for i := 0; i < len(nodes); i += kuzuBatchChunkSize { + end := i + kuzuBatchChunkSize + if end > len(nodes) { + end = len(nodes) + } + chunk := nodes[i:end] + rows := make([]map[string]any, 0, len(chunk)) + for _, n := range chunk { + if n == nil || n.ID == "" { + continue + } + metaStr, err := encodeMeta(n.Meta) + if err != nil { + panicOnFatal(fmt.Errorf("encode meta: %w", err)) + return + } + rows = append(rows, map[string]any{ + "id": n.ID, + "kind": string(n.Kind), + "name": n.Name, + "qual_name": n.QualName, + "file_path": n.FilePath, + "start_line": int64(n.StartLine), + "end_line": int64(n.EndLine), + "language": n.Language, + "repo_prefix": n.RepoPrefix, + "workspace_id": n.WorkspaceID, + "project_id": n.ProjectID, + "meta": metaStr, + }) + } + if len(rows) == 0 { + continue + } + const q = ` +UNWIND $rows AS row +MERGE (n:Node {id: row.id}) +SET n.kind = row.kind, + n.name = row.name, + n.qual_name = row.qual_name, + n.file_path = row.file_path, + n.start_line = row.start_line, + n.end_line = row.end_line, + n.language = row.language, + n.repo_prefix = row.repo_prefix, + n.workspace_id = row.workspace_id, + n.project_id = row.project_id, + n.meta = row.meta` + s.runWriteLocked(q, map[string]any{"rows": rows}) + } +} + +// SetEdgeProvenance mutates an existing edge's origin in-place and +// bumps the identity-revision counter when the origin actually +// changes. Returns true iff a change was applied. +func (s *Store) SetEdgeProvenance(e *graph.Edge, newOrigin string) bool { + if e == nil { + return false + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + return s.setEdgeProvenanceLocked(e, newOrigin) +} + +func (s *Store) setEdgeProvenanceLocked(e *graph.Edge, newOrigin string) bool { + // Look up the currently stored origin so we can skip the update + // when the value is already at the target tier (the caller- + // supplied *Edge may be a detached copy whose Origin already + // matches even though the row still has the old value). + const sel = ` +MATCH (a:Node {id: $from})-[e:Edge {kind: $kind, file_path: $file_path, line: $line}]->(b:Node {id: $to}) +RETURN e.origin LIMIT 1` + selArgs := map[string]any{ + "from": e.From, + "to": e.To, + "kind": string(e.Kind), + "file_path": e.FilePath, + "line": int64(e.Line), + } + rows := s.querySelectLocked(sel, selArgs) + if len(rows) == 0 { + return false + } + storedOrigin, _ := rows[0][0].(string) + if storedOrigin == newOrigin { + return false + } + newTier := e.Tier + if newTier != "" { + newTier = graph.ResolvedBy(newOrigin) + } + const upd = ` +MATCH (a:Node {id: $from})-[e:Edge {kind: $kind, file_path: $file_path, line: $line}]->(b:Node {id: $to}) +SET e.origin = $origin, e.tier = $tier` + updArgs := map[string]any{ + "from": e.From, + "to": e.To, + "kind": string(e.Kind), + "file_path": e.FilePath, + "line": int64(e.Line), + "origin": newOrigin, + "tier": newTier, + } + s.runWriteLocked(upd, updArgs) + e.Origin = newOrigin + if e.Tier != "" { + e.Tier = newTier + } + s.edgeIdentityRevs.Add(1) + s.writeGen.Add(1) + return true +} + +// SetEdgeProvenanceBatch UNWIND-batches origin promotions. Each +// chunk does one Cypher MATCH-WHERE-SET with a list of (key, new +// origin) rows; the WHERE clause filters down to edges whose +// stored origin actually differs, and the RETURN count gives us +// the changed-row total to bump the revision counter. +func (s *Store) SetEdgeProvenanceBatch(batch []graph.EdgeProvenanceUpdate) int { + if len(batch) == 0 { + return 0 + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + totalChanged := 0 + for i := 0; i < len(batch); i += kuzuBatchChunkSize { + end := i + kuzuBatchChunkSize + if end > len(batch) { + end = len(batch) + } + chunk := batch[i:end] + rows := make([]map[string]any, 0, len(chunk)) + // Maintain a side-index from row position → caller's *Edge so + // we can mirror the in-memory contract (the caller's pointer's + // Origin/Tier field is updated when the row actually changed). + callerEdges := make([]*graph.Edge, 0, len(chunk)) + for _, u := range chunk { + if u.Edge == nil { + continue + } + newTier := u.Edge.Tier + if newTier != "" { + newTier = graph.ResolvedBy(u.NewOrigin) + } + rows = append(rows, map[string]any{ + "from": u.Edge.From, + "to": u.Edge.To, + "kind": string(u.Edge.Kind), + "file_path": u.Edge.FilePath, + "line": int64(u.Edge.Line), + "origin": u.NewOrigin, + "tier": newTier, + }) + callerEdges = append(callerEdges, u.Edge) + } + if len(rows) == 0 { + continue + } + const q = ` +UNWIND $rows AS row +MATCH (a:Node {id: row.from})-[e:Edge {kind: row.kind, file_path: row.file_path, line: row.line}]->(b:Node {id: row.to}) +WHERE e.origin <> row.origin +SET e.origin = row.origin, e.tier = row.tier +RETURN row.from, row.to, row.kind, row.file_path, row.line, row.origin, row.tier` + res := s.querySelectLocked(q, map[string]any{"rows": rows}) + // The SELECT-style result lists every edge the SET actually + // touched (the WHERE filter dropped rows whose origin already + // matched). Mirror the per-call SetEdgeProvenance contract by + // updating the caller's Edge pointer in-place for those rows. + changed := len(res) + // Build a (from|to|kind|file|line) → *Edge map so we can map + // returned rows back to caller-supplied pointers without + // quadratic scanning. + idx := make(map[string]*graph.Edge, len(callerEdges)) + for _, e := range callerEdges { + idx[provKey(e)] = e + } + for _, row := range res { + from, _ := row[0].(string) + to, _ := row[1].(string) + kind, _ := row[2].(string) + file, _ := row[3].(string) + line, _ := row[4].(int64) + origin, _ := row[5].(string) + tier, _ := row[6].(string) + key := from + "\x00" + to + "\x00" + kind + "\x00" + file + "\x00" + strconvI64(line) + if e := idx[key]; e != nil { + e.Origin = origin + if e.Tier != "" { + e.Tier = tier + } + } + } + totalChanged += changed + if changed > 0 { + s.edgeIdentityRevs.Add(int64(changed)) + s.writeGen.Add(1) + } + } + return totalChanged +} + +// provKey builds the (from, to, kind, file, line) identity string +// used to map Cypher RETURN rows back to caller Edge pointers +// inside SetEdgeProvenanceBatch. +func provKey(e *graph.Edge) string { + return e.From + "\x00" + e.To + "\x00" + string(e.Kind) + "\x00" + e.FilePath + "\x00" + strconvI64(int64(e.Line)) +} + +func strconvI64(v int64) string { + return fmt.Sprintf("%d", v) +} + +// ReindexEdge updates the stored row after e.To has been mutated +// from oldTo to e.To. Implemented as delete-old + insert-new under +// the same write lock. A no-op when oldTo == e.To. +func (s *Store) ReindexEdge(e *graph.Edge, oldTo string) { + if e == nil || oldTo == e.To { + return + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + s.reindexEdgeLocked(e, oldTo) + s.writeGen.Add(1) +} + +func (s *Store) reindexEdgeLocked(e *graph.Edge, oldTo string) { + const del = ` +MATCH (a:Node {id: $from})-[e:Edge {kind: $kind, file_path: $file_path, line: $line}]->(b:Node {id: $oldTo}) +DELETE e` + s.runWriteLocked(del, map[string]any{ + "from": e.From, + "oldTo": oldTo, + "kind": string(e.Kind), + "file_path": e.FilePath, + "line": int64(e.Line), + }) + s.upsertEdgeLocked(e) +} + +// ReindexEdges UNWIND-batches the delete-old + insert-new pattern: +// one MATCH-DELETE for the old-To rows, then the standard +// UNWIND-based edge insert for the new-To rows. Both use chunked +// statements so a 10k-row resolver pass fires ~4 Cypher Execs +// instead of ~10k. +func (s *Store) ReindexEdges(batch []graph.EdgeReindex) { + if len(batch) == 0 { + return + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + // Per-call ReindexEdge loop instead of the Kuzu-style UNWIND + // double-pass. Ladybug's UNWIND-MATCH-DELETE-then-UNWIND-MERGE + // pattern triggers the same "unordered_map::at: key not found" + // C++ panic as AddBatch's UNWIND-MERGE. The per-call form's + // explicit DELETE/MATCH/MERGE sequence sidesteps the engine bug. + // Bulk indexing routes through the BulkLoader COPY path so the + // resolver hot path doesn't pay this loop's cost on cold start. + mutated := false + for _, r := range batch { + if r.Edge == nil || r.OldTo == r.Edge.To { + continue + } + s.reindexEdgeLocked(r.Edge, r.OldTo) + mutated = true + } + if mutated { + s.writeGen.Add(1) + } +} + +// RemoveEdge deletes every edge between (from, to) with the given +// kind. Returns true iff at least one row was deleted. +func (s *Store) RemoveEdge(from, to string, kind graph.EdgeKind) bool { + s.writeMu.Lock() + defer s.writeMu.Unlock() + // Count first so we can return the existence boolean — KuzuDB's + // DELETE statement does not return an affected-rows count + // through the Go binding. + const cnt = ` +MATCH (a:Node {id: $from})-[e:Edge {kind: $kind}]->(b:Node {id: $to}) +RETURN count(e)` + rows := s.querySelectLocked(cnt, map[string]any{ + "from": from, + "to": to, + "kind": string(kind), + }) + if len(rows) == 0 { + return false + } + n, _ := rows[0][0].(int64) + if n == 0 { + return false + } + const del = ` +MATCH (a:Node {id: $from})-[e:Edge {kind: $kind}]->(b:Node {id: $to}) +DELETE e` + s.runWriteLocked(del, map[string]any{ + "from": from, + "to": to, + "kind": string(kind), + }) + s.writeGen.Add(1) + return true +} + +// EvictFile removes every node anchored to filePath and every edge +// that touches one of those nodes. DETACH DELETE handles the edge +// cleanup as part of the node delete, so a single Cypher statement +// is enough. +func (s *Store) EvictFile(filePath string) (nodesRemoved, edgesRemoved int) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + n, e := s.evictByScopeLocked("file_path", filePath) + if s.fileIDs != nil { + s.fileIDs.removeFile(filePath) + } + return n, e +} + +// EvictRepo removes every node in repoPrefix and every edge that +// touches one. +func (s *Store) EvictRepo(repoPrefix string) (nodesRemoved, edgesRemoved int) { + s.writeMu.Lock() + defer s.writeMu.Unlock() + // Collect the file paths that will be evicted BEFORE the DELETE, + // so we can drop their entries from the fileIDs accelerator + // without scanning the whole map ourselves. evictByScopeLocked's + // DETACH DELETE wipes the rows, after which the file_path column + // is no longer queryable. + var affectedPaths []string + if s.fileIDs != nil { + const pathsQ = `MATCH (n:Node) WHERE n.repo_prefix = $r AND n.file_path <> '' RETURN DISTINCT n.file_path` + rows := s.querySelectLocked(pathsQ, map[string]any{"r": repoPrefix}) + affectedPaths = make([]string, 0, len(rows)) + for _, r := range rows { + if len(r) == 0 { + continue + } + if p, ok := r[0].(string); ok && p != "" { + affectedPaths = append(affectedPaths, p) + } + } + } + n, e := s.evictByScopeLocked("repo_prefix", repoPrefix) + // ALSO evict nodes whose ID is in this repo's namespace (`/…`) + // but whose repo_prefix column is empty. Edge-endpoint stubs created + // by mergeStubNodeLocked (cross-repo resolution, the global resolve + // pass) are written with repo_prefix='' even when their ID is + // `/unresolved::Name` — so the repo_prefix-scoped delete above + // misses them. They then collide on the INSERT-only bulk COPY when + // this repo is re-tracked (warm-restart reconcile), failing the COPY + // with "duplicated primary key" and — because the repo's real rows + // were already evicted — dropping the whole repo from the graph. The + // trailing slash keeps `gortex/` from matching `gortex-cloud/…`. + // Skipped for the single-repo (empty-prefix) store, where every ID is + // already covered by the repo_prefix='' delete shape. + if repoPrefix != "" { + const delByID = `MATCH (n:Node) WHERE n.id STARTS WITH $idp DETACH DELETE n` + s.runWriteLocked(delByID, map[string]any{"idp": repoPrefix + "/"}) + s.writeGen.Add(1) + } + if s.fileIDs != nil { + s.fileIDs.removeFiles(affectedPaths) + } + return n, e +} + +// evictByScopeLocked is the shared body of EvictFile / EvictRepo. +// We count the affected nodes and edges first so the caller gets +// accurate removal totals (DETACH DELETE does not surface them +// through the Go binding), then issue DETACH DELETE. +func (s *Store) evictByScopeLocked(column, value string) (int, int) { + cntNodes := fmt.Sprintf(`MATCH (n:Node) WHERE n.%s = $v RETURN count(n)`, column) + rows := s.querySelectLocked(cntNodes, map[string]any{"v": value}) + if len(rows) == 0 { + return 0, 0 + } + nNodes, _ := rows[0][0].(int64) + if nNodes == 0 { + return 0, 0 + } + + cntEdges := fmt.Sprintf(` +MATCH (n:Node)-[e:Edge]-(:Node) +WHERE n.%s = $v +RETURN count(DISTINCT e)`, column) + rows = s.querySelectLocked(cntEdges, map[string]any{"v": value}) + var nEdges int64 + if len(rows) > 0 { + nEdges, _ = rows[0][0].(int64) + } + + del := fmt.Sprintf(`MATCH (n:Node) WHERE n.%s = $v DETACH DELETE n`, column) + s.runWriteLocked(del, map[string]any{"v": value}) + s.writeGen.Add(1) + return int(nNodes), int(nEdges) +} diff --git a/internal/graph/store_ladybug/vector.go b/internal/graph/store_ladybug/vector.go new file mode 100644 index 00000000..3e6196d1 --- /dev/null +++ b/internal/graph/store_ladybug/vector.go @@ -0,0 +1,359 @@ +package store_ladybug + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "sync/atomic" + + "github.com/zzet/gortex/internal/graph" +) + +// vecIndexName is the canonical name for the HNSW index built over +// SymbolVec.emb. Hard-coded because the index is internal to the +// store — callers only ever query it through SimilarTo. +const vecIndexName = "idx_symbol_vec_emb" + +// vectorState tracks the per-store vector-side state: extension +// load, schema declaration (deferred until we know the dim), and +// index build sentinel. +type vectorState struct { + extensionLoaded atomic.Bool + dim atomic.Int32 // 0 until the SymbolVec table is created + indexBuilt atomic.Bool +} + +// ensureVectorExtensionLocked loads Ladybug's VECTOR extension into +// the current connection. Same dance as ensureFTSExtensionLocked +// (INSTALL + LOAD EXTENSION); idempotent via the sentinel. +// +// Held under writeMu by the caller so concurrent connections don't +// race the load. +func (s *Store) ensureVectorExtensionLocked() error { + if s.vec.extensionLoaded.Load() { + return nil + } + if err := runCypherSafe(s, `INSTALL VECTOR`); err != nil && + !strings.Contains(err.Error(), "is already installed") { + // Ignore "already installed" — every fresh open re-runs + // this and the soft failure shouldn't abort startup. + _ = err + } + if err := runCypherSafe(s, `LOAD EXTENSION VECTOR`); err != nil { + return fmt.Errorf("load vector extension: %w", err) + } + s.vec.extensionLoaded.Store(true) + return nil +} + +// ensureSymbolVecSchemaLocked lazily creates the SymbolVec table +// once we know the embedding dimension. Ladybug requires a +// fixed-width column (`FLOAT[N]`) declared at table-creation time +// — we can't preallocate the schema in the static DDL because +// the dim is model-dependent and only known when the first +// embedding lands. Re-creating with a different dim drops and +// re-declares the table; existing rows are wiped (a different +// embedding model means the old vectors are meaningless anyway). +// +// Held under writeMu by the caller. +func (s *Store) ensureSymbolVecSchemaLocked(dim int) error { + if dim <= 0 { + return fmt.Errorf("ensureSymbolVecSchema: invalid dim %d", dim) + } + cur := int(s.vec.dim.Load()) + if cur == dim { + return nil + } + if cur != 0 { + // Dim changed (e.g. different embedding model on this + // fresh daemon process). Drop the existing table so the + // FLOAT[N] column gets re-declared at the right width. Drop the + // HNSW index first — DROP TABLE is rejected while an index still + // references the table. + _ = runCypherSafe(s, fmt.Sprintf(`CALL DROP_VECTOR_INDEX('SymbolVec', '%s')`, vecIndexName)) + _ = runCypherSafe(s, `DROP TABLE IF EXISTS SymbolVec`) + s.vec.indexBuilt.Store(false) + } + ddl := fmt.Sprintf( + `CREATE NODE TABLE IF NOT EXISTS SymbolVec(id STRING, emb FLOAT[%d], PRIMARY KEY(id))`, + dim, + ) + if err := runCypherSafe(s, ddl); err != nil { + return fmt.Errorf("create SymbolVec schema (dim=%d): %w", dim, err) + } + s.vec.dim.Store(int32(dim)) + return nil +} + +// UpsertEmbedding writes (or replaces) the embedding for nodeID. +// Mirrors UpsertSymbolFTS shape: per-call MERGE for incremental +// reindex; the cold-start fast path is BulkUpsertEmbeddings. +// +// Auto-creates the SymbolVec table on first call (using +// len(vec) as the declared dim). Subsequent calls with a +// different-length vec error out — callers that change embedding +// model must drop the store first. +func (s *Store) UpsertEmbedding(nodeID string, vec []float32) error { + if nodeID == "" { + return nil + } + if len(vec) == 0 { + return nil + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + if err := s.ensureVectorExtensionLocked(); err != nil { + return err + } + // Per-call upserts must NOT auto-migrate to a new dim — that + // would silently drop the existing corpus when one wrong-dim + // upsert sneaks through. BulkUpsertEmbeddings is the cold-start + // path that's allowed to wipe and re-declare. Here we either + // match the declared dim or refuse. + if cur := int(s.vec.dim.Load()); cur != 0 && cur != len(vec) { + return fmt.Errorf("vector length %d does not match declared dim %d", len(vec), cur) + } + if err := s.ensureSymbolVecSchemaLocked(len(vec)); err != nil { + return err + } + const q = `MERGE (v:SymbolVec {id: $id}) SET v.emb = $emb` + if err := runCypherWithArgs(s, q, map[string]any{ + "id": nodeID, + "emb": vec, + }); err != nil { + return fmt.Errorf("upsert SymbolVec: %w", err) + } + // An upsert invalidates the prior HNSW index — Ladybug does + // auto-update on inserts but a freshly-written vector might + // not be visible to ANN queries until the next index rebuild. + // Mark dirty; SimilarTo lazy-rebuilds. + s.vec.indexBuilt.Store(false) + return nil +} + +// BulkUpsertEmbeddings is the cold-start fast path: write a TSV of +// (id, vec) pairs to a temp file and COPY FROM into SymbolVec in +// one shot. Mirrors BulkUpsertSymbolFTS for the FTS side. +// +// Wipe-and-rewrite semantics: a re-run replaces the prior corpus +// (the indexer always calls this once per IndexCtx after the +// embedding pass completes; incremental updates go through +// UpsertEmbedding which preserves prior rows). +// +// Idempotent under empty input. +func (s *Store) BulkUpsertEmbeddings(items []graph.VectorItem) error { + if len(items) == 0 { + return nil + } + dim := 0 + for _, it := range items { + if len(it.Vec) > 0 { + dim = len(it.Vec) + break + } + } + if dim == 0 { + return nil + } + + s.writeMu.Lock() + defer s.writeMu.Unlock() + if err := s.ensureVectorExtensionLocked(); err != nil { + return err + } + if err := s.ensureSymbolVecSchemaLocked(dim); err != nil { + return err + } + + // Dedup by ID, validate vector dim. Reject rows with the + // wrong width up-front rather than failing the COPY mid-batch. + pos := make(map[string]int, len(items)) + deduped := items[:0] + for _, it := range items { + if it.NodeID == "" || len(it.Vec) == 0 { + continue + } + if len(it.Vec) != dim { + return fmt.Errorf("vector length %d does not match batch dim %d (id %q)", len(it.Vec), dim, it.NodeID) + } + if p, ok := pos[it.NodeID]; ok { + deduped[p] = it + } else { + pos[it.NodeID] = len(deduped) + deduped = append(deduped, it) + } + } + items = deduped + if len(items) == 0 { + return nil + } + + // Drop the HNSW index BEFORE mutating the table. Ladybug cannot + // COPY (or bulk-DELETE) into a table that still carries a vector + // index — the operation hangs/aborts deep in the engine, which on a + // warm restart (where the prior run's index is already present) + // manifests as the whole reconcile worker wedging at 0% CPU and + // never reaching "watching". Dropping first mirrors what + // BuildVectorIndex already does before CREATE_VECTOR_INDEX. Safe + // no-op when no index exists; BuildVectorIndex recreates it after + // the embedding pass. + _ = runCypherSafe(s, fmt.Sprintf(`CALL DROP_VECTOR_INDEX('SymbolVec', '%s')`, vecIndexName)) + s.vec.indexBuilt.Store(false) + // Drop + recreate rather than DELETE: `MATCH (v:SymbolVec) DELETE v` + // empties the rows logically, but the engine still classes the table + // "non-empty" for COPY and rejects it ("COPY into a non-empty + // primary-key node table without a hash index is not supported") + // whenever the PK hash index isn't currently materialised — a state + // that depends on auto-checkpoint timing, so the failure is + // non-deterministic. A freshly recreated table is unconditionally a + // valid COPY target. The DROP_VECTOR_INDEX above must run first: DROP + // TABLE is rejected while the HNSW index still references the table. + if err := runCypherSafe(s, `DROP TABLE IF EXISTS SymbolVec`); err != nil { + return fmt.Errorf("drop SymbolVec before bulk upsert: %w", err) + } + s.vec.dim.Store(0) // force ensureSymbolVecSchemaLocked to recreate, not short-circuit + if err := s.ensureSymbolVecSchemaLocked(dim); err != nil { + return err + } + + dir, err := os.MkdirTemp("", "lbug-vec-bulk-") + if err != nil { + return fmt.Errorf("mkdir bulk tmp: %w", err) + } + defer func() { _ = os.RemoveAll(dir) }() + // Ladybug's COPY parser picks the format from the file + // extension; `.csv` with DELIM='\t' is the convention the + // existing Node/Edge bulk loader uses, and `.tsv` is rejected + // at bind time with "Cannot load from file type tsv". + path := filepath.Join(dir, "symbolvec.csv") + if err := writeSymbolVecTSV(path, items); err != nil { + return fmt.Errorf("write SymbolVec tsv: %w", err) + } + copyQ := fmt.Sprintf("COPY SymbolVec FROM '%s' (HEADER=false, DELIM='\\t')", escapeCypherStringLit(path)) + if err := runCypherSafe(s, copyQ); err != nil { + return fmt.Errorf("copy SymbolVec: %w", err) + } + s.vec.indexBuilt.Store(false) + return nil +} + +// writeSymbolVecTSV writes items to a tab-separated file. The +// FLOAT[N] column is serialised as a Ladybug array literal +// `[v0,v1,...,vN-1]` — no surrounding quotes (the COPY parser +// reads array-shaped tokens directly when DELIM is `\t`). +func writeSymbolVecTSV(path string, items []graph.VectorItem) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer func() { _ = f.Close() }() + var b strings.Builder + for _, it := range items { + b.Reset() + // Sanitize the id (tab / CR / LF -> space) exactly as writeNodesTSV + // does for the Node table: an id carrying a raw tab or newline (e.g. + // a string-literal-derived node) would otherwise split the TSV row + // and abort the whole COPY ("expected 2 values per row, but got 1"). + // Sanitizing identically keeps the SymbolVec id equal to the + // persisted Node id, so the similarity-search join still matches. + b.WriteString(sanitizeTSV(it.NodeID)) + b.WriteByte('\t') + b.WriteByte('[') + for i, v := range it.Vec { + if i > 0 { + b.WriteByte(',') + } + b.WriteString(strconv.FormatFloat(float64(v), 'g', -1, 32)) + } + b.WriteByte(']') + b.WriteByte('\n') + if _, err := f.WriteString(b.String()); err != nil { + return err + } + } + return nil +} + +// BuildVectorIndex creates the HNSW index over SymbolVec.emb. The +// dim arg must match the FLOAT[N] column the table was declared +// with; if the table doesn't exist yet, this call lazily creates +// it. +// +// Idempotent: the second call with the same dim is a no-op via +// the indexBuilt sentinel. A dim change drops and re-creates the +// schema (and invalidates the sentinel). +func (s *Store) BuildVectorIndex(dim int) error { + if dim <= 0 { + return fmt.Errorf("BuildVectorIndex: invalid dim %d", dim) + } + s.writeMu.Lock() + defer s.writeMu.Unlock() + if err := s.ensureVectorExtensionLocked(); err != nil { + return err + } + if err := s.ensureSymbolVecSchemaLocked(dim); err != nil { + return err + } + if s.vec.indexBuilt.Load() && int(s.vec.dim.Load()) == dim { + return nil + } + // Drop-and-recreate: CREATE_VECTOR_INDEX is fatal if the + // index already exists (same pattern as the FTS path). + _ = runCypherSafe(s, fmt.Sprintf(`CALL DROP_VECTOR_INDEX('SymbolVec', '%s')`, vecIndexName)) + if err := runCypherSafe(s, fmt.Sprintf(`CALL CREATE_VECTOR_INDEX('SymbolVec', '%s', 'emb')`, vecIndexName)); err != nil { + return fmt.Errorf("create vector index: %w", err) + } + s.vec.indexBuilt.Store(true) + return nil +} + +// SimilarTo runs a k-NN ANN query against the SymbolVec HNSW +// index. Returns hits in ascending distance order (lower = +// closer under cosine distance). +// +// If the index hasn't been built yet, this lazy-builds it using +// the query vector's length as the dim — saves callers from +// having to call BuildVectorIndex explicitly when the embedder +// has already populated SymbolVec via per-call upserts. +func (s *Store) SimilarTo(vec []float32, limit int) ([]graph.VectorHit, error) { + if len(vec) == 0 { + return nil, nil + } + if limit <= 0 { + limit = 20 + } + if !s.vec.indexBuilt.Load() { + if err := s.BuildVectorIndex(len(vec)); err != nil { + return nil, err + } + } + if want := int(s.vec.dim.Load()); want != len(vec) { + return nil, fmt.Errorf("query vector length %d does not match index dim %d", len(vec), want) + } + const cypher = ` +CALL QUERY_VECTOR_INDEX('SymbolVec', '` + vecIndexName + `', $vec, $k) +RETURN node.id AS id, distance +ORDER BY distance ASC` + rows, err := querySelectSafe(s, cypher, map[string]any{ + "vec": vec, + "k": int64(limit), + }) + if err != nil { + return nil, fmt.Errorf("query vector: %w", err) + } + hits := make([]graph.VectorHit, 0, len(rows)) + for _, row := range rows { + if len(row) < 2 { + continue + } + id, _ := row[0].(string) + if id == "" { + continue + } + d, _ := row[1].(float64) + hits = append(hits, graph.VectorHit{NodeID: id, Distance: d}) + } + return hits, nil +} diff --git a/internal/graph/store_ladybug/vector_escape_test.go b/internal/graph/store_ladybug/vector_escape_test.go new file mode 100644 index 00000000..380274a3 --- /dev/null +++ b/internal/graph/store_ladybug/vector_escape_test.go @@ -0,0 +1,50 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// TestVectorSearcher_BulkUpsertSanitizesDirtyID guards the SymbolVec +// bulk COPY against node IDs containing a tab or newline (e.g. +// string-literal-derived nodes). Unescaped, such an ID split the TSV +// row and aborted the whole COPY with "expected 2 values per row, but +// got 1". The ID is sanitized the same way writeNodesTSV sanitizes the +// Node table, so the SymbolVec id stays consistent with the persisted +// Node id (the join key). +func TestVectorSearcher_BulkUpsertSanitizesDirtyID(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-vec-dirty-") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + const dirtyID = "pkg/x.go::str\twith\ttab\nand\nnewline" + items := []graph.VectorItem{ + {NodeID: dirtyID, Vec: []float32{1, 0, 0, 0}}, + {NodeID: "clean", Vec: []float32{0, 1, 0, 0}}, + } + // Pre-fix this returned: copy SymbolVec: ... expected 2 values per + // row, but got 1. + require.NoError(t, s.BulkUpsertEmbeddings(items), "a dirty id must not abort the bulk COPY") + require.NoError(t, s.BuildVectorIndex(4)) + + hits, err := s.SimilarTo([]float32{1, 0, 0, 0}, 2) + require.NoError(t, err) + require.NotEmpty(t, hits) + // The row is retrievable under the sanitized id (tab/newline -> space), + // matching how the Node table stores the same id. + want := sanitizeTSV(dirtyID) + assert.Equal(t, want, hits[0].NodeID, "top hit must be the (sanitized) dirty id") + assert.NotContains(t, hits[0].NodeID, "\t", "stored id must not contain a tab") + assert.NotContains(t, hits[0].NodeID, "\n", "stored id must not contain a newline") +} diff --git a/internal/graph/store_ladybug/vector_probe_test.go b/internal/graph/store_ladybug/vector_probe_test.go new file mode 100644 index 00000000..a3fcf77f --- /dev/null +++ b/internal/graph/store_ladybug/vector_probe_test.go @@ -0,0 +1,126 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" +) + +// TestVector_Probe mirrors fts_probe_test.go for the vector +// extension. Confirms the CALL syntax and the auto-update +// semantics the production wiring will rely on: +// +// 1. INSTALL VECTOR + LOAD EXTENSION VECTOR (matches the FTS dance) +// 2. CREATE NODE TABLE with a FLOAT[N] column for the embedding +// 3. CALL CREATE_VECTOR_INDEX(table, name, column[, metric]) +// 4. CALL QUERY_VECTOR_INDEX(table, name, queryVec, k) — find signature +// 5. Auto-update on later AddNode +// +// Liberal logging (instead of strict assertions) so the probe +// surfaces what works regardless of where Ladybug 0.13 lands on +// the syntax-versioning curve — we'll then encode the discovered +// shape into production. +func TestVector_Probe(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-vec-probe-*") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + + s, err := Open(filepath.Join(dir, "store.lbug")) + if err != nil { + t.Fatalf("open: %v", err) + } + t.Cleanup(func() { _ = s.Close() }) + + // Step 1: install + load the vector extension. Mirrors the FTS + // dance — Ladybug ships the extension compiled in but requires + // explicit load before the CREATE_VECTOR_INDEX function appears + // in the catalog. + for _, q := range []string{`INSTALL VECTOR`, `LOAD EXTENSION VECTOR`} { + if err := tryRunCypher(s, q); err != nil { + t.Logf("%s: %v", q, err) + } else { + t.Logf("%s: ok", q) + } + } + + // Step 2: probe FLOAT[N] column support. Try the spec-style + // `FLOAT[4]` first, fall back to `ARRAY[FLOAT,4]` if needed. + for _, ddl := range []string{ + `CREATE NODE TABLE IF NOT EXISTS VecProbe(id STRING, emb FLOAT[4], PRIMARY KEY(id))`, + `CREATE NODE TABLE IF NOT EXISTS VecProbe2(id STRING, emb ARRAY[FLOAT,4], PRIMARY KEY(id))`, + } { + if err := tryRunCypher(s, ddl); err != nil { + t.Logf("CREATE %q: %v", ddl, err) + } else { + t.Logf("CREATE %q: ok", ddl) + } + } + + // Step 3: seed a few rows so the index has something to build over. + for i, vec := range [][]float32{ + {1.0, 0.0, 0.0, 0.0}, + {0.9, 0.1, 0.0, 0.0}, + {0.0, 0.0, 0.0, 1.0}, + } { + id := []string{"alpha", "alpha_neighbor", "far"}[i] + err := tryRunCypherArgs(s, `MERGE (n:VecProbe {id: $id}) SET n.emb = $emb`, map[string]any{ + "id": id, + "emb": vec, + }) + if err != nil { + t.Logf("insert %s: %v", id, err) + } + } + + // Step 4: try every CREATE_VECTOR_INDEX shape we know of. + for _, ddl := range []string{ + `CALL CREATE_VECTOR_INDEX('VecProbe', 'idx_emb_v', 'emb')`, + `CALL CREATE_VECTOR_INDEX('VecProbe', 'idx_emb_v', 'emb', 'cosine')`, + `CALL CREATE_VECTOR_INDEX('VecProbe', 'idx_emb_v', 'emb', 4, 'cosine')`, + } { + if err := tryRunCypher(s, ddl); err != nil { + t.Logf("CREATE_VECTOR_INDEX %q: %v", ddl, err) + } else { + t.Logf("CREATE_VECTOR_INDEX %q: ok", ddl) + break + } + } + + // Step 5: try QUERY_VECTOR_INDEX with both 3-arg and 4-arg shapes. + for _, probe := range []struct { + q string + args map[string]any + }{ + {`CALL QUERY_VECTOR_INDEX('VecProbe', 'idx_emb_v', $vec, 5) RETURN node.id, distance`, + map[string]any{"vec": []float32{1.0, 0.0, 0.0, 0.0}}}, + {`CALL QUERY_VECTOR_INDEX('VecProbe', 'idx_emb_v', $vec) RETURN node.id, distance LIMIT 5`, + map[string]any{"vec": []float32{1.0, 0.0, 0.0, 0.0}}}, + } { + rows, err := tryQueryCypher(s, probe.q, probe.args) + if err != nil { + t.Logf("QUERY_VECTOR_INDEX %q: %v", probe.q, err) + continue + } + t.Logf("QUERY_VECTOR_INDEX %q → %d rows", probe.q, len(rows)) + for _, r := range rows { + t.Logf(" %v", r) + } + } +} + +// tryRunCypherArgs invokes runWriteLocked with parameters, capturing +// any panic the binding raises (extension-not-loaded, wrong-types, +// etc.) as a normal Go error so the probe can react. +func tryRunCypherArgs(s *Store, q string, args map[string]any) (err error) { + defer func() { + if r := recover(); r != nil { + err = recoverErr(r) + } + }() + s.runWriteLocked(q, args) + return nil +} diff --git a/internal/graph/store_ladybug/vector_recopy_test.go b/internal/graph/store_ladybug/vector_recopy_test.go new file mode 100644 index 00000000..5da4268b --- /dev/null +++ b/internal/graph/store_ladybug/vector_recopy_test.go @@ -0,0 +1,49 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// TestVectorSearcher_RepeatedBulkReplaceIsDeterministic hammers the +// wipe-and-rewrite path (bulk -> BuildVectorIndex -> bulk -> ...) in a +// single store. Pre-fix the 2nd+ BulkUpsertEmbeddings non-deterministically +// failed with "COPY into a non-empty primary-key node table without a hash +// index is not supported": DELETE empties the rows logically but leaves the +// table non-empty for COPY, and whether the PK hash index is materialized at +// COPY time depended on auto-checkpoint timing. The fix drops + recreates the +// table so every COPY targets a fresh empty table. The in-process loop makes +// the formerly-racy failure reliably reproducible. +func TestVectorSearcher_RepeatedBulkReplaceIsDeterministic(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-vec-recopy-") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + require.NoError(t, s.BulkUpsertEmbeddings([]graph.VectorItem{ + {NodeID: "a", Vec: []float32{1, 0, 0, 0}}, + {NodeID: "b", Vec: []float32{0, 1, 0, 0}}, + })) + require.NoError(t, s.BuildVectorIndex(4)) + + for i := 0; i < 30; i++ { + require.NoErrorf(t, s.BulkUpsertEmbeddings([]graph.VectorItem{ + {NodeID: "z", Vec: []float32{1, 1, 0, 0}}, + }), "re-bulk iteration %d hit the COPY-into-non-empty rejection", i) + require.NoErrorf(t, s.BuildVectorIndex(4), "BuildVectorIndex iteration %d", i) + hits, err := s.SimilarTo([]float32{1, 0, 0, 0}, 10) + require.NoErrorf(t, err, "SimilarTo iteration %d", i) + require.Lenf(t, hits, 1, "wipe-and-rewrite must leave exactly 1 row (iteration %d)", i) + assert.Equal(t, "z", hits[0].NodeID) + } +} diff --git a/internal/graph/store_ladybug/vector_test.go b/internal/graph/store_ladybug/vector_test.go new file mode 100644 index 00000000..f3267abd --- /dev/null +++ b/internal/graph/store_ladybug/vector_test.go @@ -0,0 +1,114 @@ +//go:build ladybug + +package store_ladybug + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +func TestVectorSearcher_BulkAndQuery(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-vec-bulk-") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + items := []graph.VectorItem{ + {NodeID: "alpha", Vec: []float32{1, 0, 0, 0}}, + {NodeID: "alpha_neighbor", Vec: []float32{0.95, 0.05, 0, 0}}, + {NodeID: "orthogonal", Vec: []float32{0, 1, 0, 0}}, + {NodeID: "opposite", Vec: []float32{-1, 0, 0, 0}}, + } + require.NoError(t, s.BulkUpsertEmbeddings(items)) + require.NoError(t, s.BuildVectorIndex(4)) + + hits, err := s.SimilarTo([]float32{1, 0, 0, 0}, 3) + require.NoError(t, err) + require.Len(t, hits, 3, "k=3 must return 3 hits") + // alpha (identical) should rank first; alpha_neighbor second; + // orthogonal third (cosine distance 1.0 > opposite's 2.0? — let + // the engine decide ordering, but assert that alpha and + // alpha_neighbor are the first two regardless of orientation). + topIDs := map[string]bool{hits[0].NodeID: true, hits[1].NodeID: true} + assert.True(t, topIDs["alpha"], "exact match must be in the top two; got hits=%v", hits) + assert.True(t, topIDs["alpha_neighbor"], "near neighbour must be in the top two; got hits=%v", hits) + assert.InDelta(t, 0.0, hits[0].Distance, 0.001, "top hit distance must be near zero for the exact-match query") +} + +func TestVectorSearcher_PerCallUpsert(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-vec-per-") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + require.NoError(t, s.UpsertEmbedding("a", []float32{1, 0, 0, 0})) + require.NoError(t, s.UpsertEmbedding("b", []float32{0, 1, 0, 0})) + + hits, err := s.SimilarTo([]float32{1, 0, 0, 0}, 2) + require.NoError(t, err) + require.Len(t, hits, 2) + assert.Equal(t, "a", hits[0].NodeID) +} + +// TestVectorSearcher_DimRejectsMismatch guards the index dim +// contract — every Upsert / Bulk must match the declared +// FLOAT[N] column width. +func TestVectorSearcher_DimRejectsMismatch(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-vec-dim-") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + require.NoError(t, s.UpsertEmbedding("a", []float32{1, 0, 0, 0})) + + // Second upsert with the wrong dim must error rather than + // silently truncate / pad. + err = s.UpsertEmbedding("b", []float32{1, 0, 0}) + require.Error(t, err) +} + +// TestVectorSearcher_BulkReplacesPriorCorpus confirms the bulk +// path's wipe-and-rewrite semantics — re-running with a smaller +// set drops the prior rows. +func TestVectorSearcher_BulkReplacesPriorCorpus(t *testing.T) { + dir, err := os.MkdirTemp("", "lbug-vec-replace-") + require.NoError(t, err) + t.Cleanup(func() { _ = os.RemoveAll(dir) }) + s, err := Open(filepath.Join(dir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = s.Close() }) + + require.NoError(t, s.BulkUpsertEmbeddings([]graph.VectorItem{ + {NodeID: "a", Vec: []float32{1, 0, 0, 0}}, + {NodeID: "b", Vec: []float32{0, 1, 0, 0}}, + {NodeID: "c", Vec: []float32{0, 0, 1, 0}}, + })) + require.NoError(t, s.BuildVectorIndex(4)) + + hits, err := s.SimilarTo([]float32{1, 0, 0, 0}, 10) + require.NoError(t, err) + require.Len(t, hits, 3, "initial bulk should land 3 rows") + + // Second bulk with one row only. + require.NoError(t, s.BulkUpsertEmbeddings([]graph.VectorItem{ + {NodeID: "z", Vec: []float32{1, 1, 0, 0}}, + })) + require.NoError(t, s.BuildVectorIndex(4)) + + hits, err = s.SimilarTo([]float32{1, 0, 0, 0}, 10) + require.NoError(t, err) + require.Len(t, hits, 1, "wipe-and-rewrite must drop prior rows; got %v", hits) + assert.Equal(t, "z", hits[0].NodeID) +} diff --git a/internal/graph/storetest/backend_resolver.go b/internal/graph/storetest/backend_resolver.go new file mode 100644 index 00000000..2400de99 --- /dev/null +++ b/internal/graph/storetest/backend_resolver.go @@ -0,0 +1,272 @@ +package storetest + +import ( + "testing" + + "github.com/zzet/gortex/internal/graph" +) + +// RunBackendResolverConformance exercises every method of the +// graph.BackendResolver interface against a Factory that produces a +// store implementing both graph.Store and graph.BackendResolver. The +// shape mirrors RunConformance (the main Store contract): a known +// fixture graph, run the rule, assert the post-state matches the +// expected resolution. +// +// Backends that haven't implemented a rule yet ship the Phase 1 stub +// that returns (0, nil); those subtests pass trivially because the +// fixture also asserts zero-progress doesn't break correctness. +func RunBackendResolverConformance(t *testing.T, factory Factory) { + t.Helper() + t.Run("BackendResolver_SameFile", func(t *testing.T) { testBRSameFile(t, factory) }) + t.Run("BackendResolver_SamePackage", func(t *testing.T) { testBRSamePackage(t, factory) }) + t.Run("BackendResolver_ImportAware", func(t *testing.T) { testBRImportAware(t, factory) }) + t.Run("BackendResolver_RelativeImports", func(t *testing.T) { testBRRelativeImports(t, factory) }) + t.Run("BackendResolver_CrossRepo", func(t *testing.T) { testBRCrossRepo(t, factory) }) + t.Run("BackendResolver_UniqueNames", func(t *testing.T) { testBRUniqueNames(t, factory) }) + t.Run("BackendResolver_ExternalCallStubs", func(t *testing.T) { testBRExternalCallStubs(t, factory) }) + t.Run("BackendResolver_AllBulk", func(t *testing.T) { testBRAllBulk(t, factory) }) +} + +func asBackendResolver(t *testing.T, s graph.Store) graph.BackendResolver { + t.Helper() + br, ok := s.(graph.BackendResolver) + if !ok { + t.Skip("store does not implement graph.BackendResolver") + } + return br +} + +func testBRSameFile(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // caller and target in same file — unambiguous match + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddEdge(&graph.Edge{ + From: "a.go::Foo", To: "unresolved::Bar", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveSameFile() + if err != nil { + t.Fatalf("ResolveSameFile: %v", err) + } + if n == 0 { + // stub backend — skip the post-state assertions + return + } + if n != 1 { + t.Fatalf("ResolveSameFile resolved %d, want 1", n) + } + // edge should now point at a.go::Bar with origin ast_resolved + got := s.GetOutEdges("a.go::Foo") + if len(got) != 1 || got[0].To != "a.go::Bar" || got[0].Origin != graph.OriginASTResolved { + t.Fatalf("ResolveSameFile post-state: edges=%+v", got) + } +} + +func testBRSamePackage(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // caller in pkg/a.go, target in pkg/b.go — same directory + s.AddNode(mkRepoNode("pkg/a.go::Caller", "Caller", "pkg/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("pkg/b.go::Target", "Target", "pkg/b.go", "r1", graph.KindFunction)) + s.AddEdge(&graph.Edge{ + From: "pkg/a.go::Caller", To: "unresolved::Target", Kind: graph.EdgeCalls, + FilePath: "pkg/a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveSamePackage() + if err != nil { + t.Fatalf("ResolveSamePackage: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveSamePackage resolved %d, want 1", n) + } + got := s.GetOutEdges("pkg/a.go::Caller") + if len(got) != 1 || got[0].To != "pkg/b.go::Target" || got[0].Origin != graph.OriginASTResolved { + t.Fatalf("ResolveSamePackage post-state: edges=%+v", got) + } +} + +func testBRImportAware(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // caller.go imports lib.go which exports Target + s.AddNode(mkNode("caller.go", "caller.go", "caller.go", graph.KindFile)) + s.AddNode(mkNode("lib.go", "lib.go", "lib.go", graph.KindFile)) + s.AddNode(mkNode("caller.go::Caller", "Caller", "caller.go", graph.KindFunction)) + s.AddNode(mkNode("lib.go::Target", "Target", "lib.go", graph.KindFunction)) + // the imports edge + s.AddEdge(&graph.Edge{ + From: "caller.go", To: "lib.go", Kind: graph.EdgeImports, + FilePath: "caller.go", Line: 1, Origin: graph.OriginASTResolved, + }) + // the unresolved call + s.AddEdge(&graph.Edge{ + From: "caller.go::Caller", To: "unresolved::Target", Kind: graph.EdgeCalls, + FilePath: "caller.go", Line: 5, Origin: "", + }) + n, err := br.ResolveImportAware() + if err != nil { + t.Fatalf("ResolveImportAware: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveImportAware resolved %d, want 1", n) + } + got := s.GetOutEdges("caller.go::Caller") + var found bool + for _, e := range got { + if e.To == "lib.go::Target" { + found = true + } + } + if !found { + t.Fatalf("ResolveImportAware post-state: edges=%+v, want one to lib.go::Target", got) + } +} + +func testBRRelativeImports(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // python relative-import stub + s.AddNode(mkNode("app/util.py", "app/util.py", "app/util.py", graph.KindFile)) + s.AddNode(mkNode("app/main.py", "app/main.py", "app/main.py", graph.KindFile)) + s.AddEdge(&graph.Edge{ + From: "app/main.py", To: "unresolved::pyrel::app/util", Kind: graph.EdgeImports, + FilePath: "app/main.py", Line: 1, Origin: "", + }) + n, err := br.ResolveRelativeImports("python") + if err != nil { + t.Fatalf("ResolveRelativeImports: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveRelativeImports resolved %d, want 1", n) + } + got := s.GetOutEdges("app/main.py") + var found bool + for _, e := range got { + if e.To == "app/util.py" { + found = true + } + } + if !found { + t.Fatalf("ResolveRelativeImports post-state: edges=%+v, want one to app/util.py", got) + } +} + +func testBRCrossRepo(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + s.AddNode(mkRepoNode("r1/a.go::Caller", "Caller", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Target", "Target", "r2/x.go", "r2", graph.KindFunction)) + s.AddEdge(&graph.Edge{ + From: "r1/a.go::Caller", To: "unresolved::Target", Kind: graph.EdgeCalls, + FilePath: "r1/a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveCrossRepo() + if err != nil { + t.Fatalf("ResolveCrossRepo: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveCrossRepo resolved %d, want 1", n) + } + got := s.GetOutEdges("r1/a.go::Caller") + if len(got) != 1 || got[0].To != "r2/x.go::Target" || !got[0].CrossRepo { + t.Fatalf("ResolveCrossRepo post-state: edges=%+v", got) + } +} + +func testBRUniqueNames(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // One unique-name candidate in the graph. + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Target", "Target", "b.go", graph.KindFunction)) + s.AddEdge(&graph.Edge{ + From: "a.go::Foo", To: "unresolved::Target", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveUniqueNames() + if err != nil { + t.Fatalf("ResolveUniqueNames: %v", err) + } + if n == 0 { + return + } + if n != 1 { + t.Fatalf("ResolveUniqueNames resolved %d, want 1", n) + } + got := s.GetOutEdges("a.go::Foo") + if len(got) != 1 || got[0].To != "b.go::Target" { + t.Fatalf("ResolveUniqueNames post-state: edges=%+v", got) + } +} + +func testBRExternalCallStubs(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + s.AddNode(mkNode("a.go::Caller", "Caller", "a.go", graph.KindFunction)) + // edge to external::npm/foo::bar with no stub node + s.AddEdge(&graph.Edge{ + From: "a.go::Caller", To: "external::npm/foo::bar", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 1, Origin: "", + }) + n, err := br.ResolveExternalCallStubs() + if err != nil { + t.Fatalf("ResolveExternalCallStubs: %v", err) + } + if n == 0 { + return + } + if n < 1 { + t.Fatalf("ResolveExternalCallStubs resolved %d, want >= 1", n) + } + // stub node must now exist + if s.GetNode("external::npm/foo::bar") == nil { + t.Fatalf("external stub node not created") + } +} + +func testBRAllBulk(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + br := asBackendResolver(t, s) + // Mix of resolvable + stub cases. + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Unique", "Unique", "b.go", graph.KindFunction)) + // same-file + s.AddEdge(&graph.Edge{ + From: "a.go::Foo", To: "unresolved::Bar", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 1, Origin: "", + }) + // unique-name + s.AddEdge(&graph.Edge{ + From: "a.go::Foo", To: "unresolved::Unique", Kind: graph.EdgeCalls, + FilePath: "a.go", Line: 2, Origin: "", + }) + n, err := br.ResolveAllBulk() + if err != nil { + t.Fatalf("ResolveAllBulk: %v", err) + } + _ = n // 0 on stub backends, >0 on real +} diff --git a/internal/graph/storetest/memory_conformance_test.go b/internal/graph/storetest/memory_conformance_test.go new file mode 100644 index 00000000..29537241 --- /dev/null +++ b/internal/graph/storetest/memory_conformance_test.go @@ -0,0 +1,18 @@ +package storetest_test + +import ( + "testing" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/storetest" +) + +// TestMemoryStoreConformance proves the in-memory *graph.Graph (the +// only Store impl that exists today) satisfies the conformance suite. +// This is the canonical baseline; new backends must pass the same +// battery. +func TestMemoryStoreConformance(t *testing.T) { + storetest.RunConformance(t, func(t *testing.T) graph.Store { + return graph.New() + }) +} diff --git a/internal/graph/storetest/storetest.go b/internal/graph/storetest/storetest.go new file mode 100644 index 00000000..92980730 --- /dev/null +++ b/internal/graph/storetest/storetest.go @@ -0,0 +1,3238 @@ +// Package storetest provides a conformance test suite that every +// graph.Store implementation MUST pass. Each backend (in-memory, +// bbolt-on-disk, SQLite-on-disk, remote-network-client) has a thin +// _test.go that calls RunConformance(t, factory) and inherits the +// full battery. +// +// The contract this package encodes is the union of behaviour the +// rest of gortex depends on from *graph.Graph today. New Store +// implementations are expected to satisfy every test before they can +// be considered a drop-in replacement. +package storetest + +import ( + "fmt" + "sort" + "strings" + "sync" + "testing" + + "github.com/zzet/gortex/internal/graph" +) + +// Factory constructs a fresh, empty Store. RunConformance calls it +// many times across subtests; each invocation must yield an +// independent store with no leakage from previous runs. Backends with +// on-disk state should use t.TempDir() internally to isolate. +type Factory func(t *testing.T) graph.Store + +// RunConformance runs the full conformance suite against the Store +// produced by factory. Backends invoke it from a _test.go in their +// own package. +func RunConformance(t *testing.T, factory Factory) { + t.Helper() + t.Run("AddGetNode", func(t *testing.T) { testAddGetNode(t, factory) }) + t.Run("AddGetEdge", func(t *testing.T) { testAddGetEdge(t, factory) }) + t.Run("AddNodeIdempotent", func(t *testing.T) { testAddNodeIdempotent(t, factory) }) + t.Run("AddEdgeIdempotent", func(t *testing.T) { testAddEdgeIdempotent(t, factory) }) + t.Run("AddEdgeLineDisambiguates", func(t *testing.T) { testAddEdgeLineDisambiguates(t, factory) }) + t.Run("AddBatch", func(t *testing.T) { testAddBatch(t, factory) }) + t.Run("RemoveEdge", func(t *testing.T) { testRemoveEdge(t, factory) }) + t.Run("EvictFile", func(t *testing.T) { testEvictFile(t, factory) }) + t.Run("EvictFile_NoNodes", func(t *testing.T) { testEvictFileNoNodes(t, factory) }) + t.Run("EvictRepo", func(t *testing.T) { testEvictRepo(t, factory) }) + t.Run("EvictRepo_NoNodes", func(t *testing.T) { testEvictRepoNoNodes(t, factory) }) + t.Run("NodeAndEdgeCount", func(t *testing.T) { testNodeAndEdgeCount(t, factory) }) + t.Run("AllNodesAndEdges", func(t *testing.T) { testAllNodesAndEdges(t, factory) }) + t.Run("FindNodesByName", func(t *testing.T) { testFindNodesByName(t, factory) }) + t.Run("FindNodesByNameInRepo", func(t *testing.T) { testFindNodesByNameInRepo(t, factory) }) + t.Run("FindNodesByNameContaining", func(t *testing.T) { testFindNodesByNameContaining(t, factory) }) + t.Run("GetFileNodes", func(t *testing.T) { testGetFileNodes(t, factory) }) + t.Run("GetRepoNodes", func(t *testing.T) { testGetRepoNodes(t, factory) }) + t.Run("GetRepoEdges", func(t *testing.T) { testGetRepoEdges(t, factory) }) + t.Run("GetNodeByQualName", func(t *testing.T) { testGetNodeByQualName(t, factory) }) + t.Run("Stats", func(t *testing.T) { testStats(t, factory) }) + t.Run("RepoStats", func(t *testing.T) { testRepoStats(t, factory) }) + t.Run("RepoPrefixes", func(t *testing.T) { testRepoPrefixes(t, factory) }) + t.Run("SetEdgeProvenance", func(t *testing.T) { testSetEdgeProvenance(t, factory) }) + t.Run("SetEdgeProvenanceBatch", func(t *testing.T) { testSetEdgeProvenanceBatch(t, factory) }) + t.Run("ReindexEdge", func(t *testing.T) { testReindexEdge(t, factory) }) + t.Run("ReindexEdges", func(t *testing.T) { testReindexEdges(t, factory) }) + t.Run("Concurrency", func(t *testing.T) { testConcurrency(t, factory) }) + t.Run("EdgeIdentityRevisions", func(t *testing.T) { testEdgeIdentityRevisions(t, factory) }) + t.Run("VerifyEdgeIdentities", func(t *testing.T) { testVerifyEdgeIdentities(t, factory) }) + t.Run("RepoMemoryEstimate", func(t *testing.T) { testRepoMemoryEstimate(t, factory) }) + t.Run("AllRepoMemoryEstimates", func(t *testing.T) { testAllRepoMemoryEstimates(t, factory) }) + t.Run("MetaPreserved", func(t *testing.T) { testMetaPreserved(t, factory) }) + t.Run("EmptyStore", func(t *testing.T) { testEmptyStore(t, factory) }) + t.Run("EdgesByKind", func(t *testing.T) { testEdgesByKind(t, factory) }) + t.Run("NodesByKind", func(t *testing.T) { testNodesByKind(t, factory) }) + t.Run("EdgesWithUnresolvedTarget", func(t *testing.T) { testEdgesWithUnresolvedTarget(t, factory) }) + t.Run("GetNodesByIDs", func(t *testing.T) { testGetNodesByIDs(t, factory) }) + t.Run("FindNodesByNames", func(t *testing.T) { testFindNodesByNames(t, factory) }) + t.Run("GetEdgesByNodeIDs", func(t *testing.T) { testGetEdgesByNodeIDs(t, factory) }) + t.Run("SymbolBundleSearcher", func(t *testing.T) { testSymbolBundleSearcher(t, factory) }) + t.Run("DeadCodeCandidator", func(t *testing.T) { testDeadCodeCandidator(t, factory) }) + t.Run("IfaceImplementsScanner", func(t *testing.T) { testIfaceImplementsScanner(t, factory) }) + t.Run("NodeDegreeAggregator", func(t *testing.T) { testNodeDegreeAggregator(t, factory) }) + t.Run("NodeFanAggregator", func(t *testing.T) { testNodeFanAggregator(t, factory) }) + t.Run("FileImporters", func(t *testing.T) { testFileImporters(t, factory) }) + t.Run("InEdgeCounter", func(t *testing.T) { testInEdgeCounter(t, factory) }) + t.Run("NodesInFilesByKindFinder", func(t *testing.T) { testNodesInFilesByKindFinder(t, factory) }) + t.Run("EdgesByKindsScanner", func(t *testing.T) { testEdgesByKindsScanner(t, factory) }) + t.Run("NodesByKindsScanner", func(t *testing.T) { testNodesByKindsScanner(t, factory) }) + t.Run("EdgeKindCounter", func(t *testing.T) { testEdgeKindCounter(t, factory) }) + t.Run("CrossRepoEdgeAggregator", func(t *testing.T) { testCrossRepoEdgeAggregator(t, factory) }) + t.Run("FileImportAggregator", func(t *testing.T) { testFileImportAggregator(t, factory) }) + t.Run("InDegreeForNodes", func(t *testing.T) { testInDegreeForNodes(t, factory) }) + t.Run("ReachableForwardByKinds", func(t *testing.T) { testReachableForwardByKinds(t, factory) }) + t.Run("ThrowerErrorSurfacer", func(t *testing.T) { testThrowerErrorSurfacer(t, factory) }) + t.Run("EdgeAdjacencyForKinds", func(t *testing.T) { testEdgeAdjacencyForKinds(t, factory) }) + t.Run("CommunityCrossingsByKind", func(t *testing.T) { testCommunityCrossingsByKind(t, factory) }) + t.Run("NodeIDsByKinds", func(t *testing.T) { testNodeIDsByKinds(t, factory) }) + t.Run("MemberMethodsByType", func(t *testing.T) { testMemberMethodsByType(t, factory) }) + t.Run("StructuralParentEdges", func(t *testing.T) { testStructuralParentEdges(t, factory) }) + t.Run("CrossRepoCandidates", func(t *testing.T) { testCrossRepoCandidates(t, factory) }) + t.Run("ExtractCandidates", func(t *testing.T) { testExtractCandidates(t, factory) }) + t.Run("FileSymbolNamesByPaths", func(t *testing.T) { testFileSymbolNamesByPaths(t, factory) }) + t.Run("ClassHierarchyTraverser", func(t *testing.T) { testClassHierarchyTraverser(t, factory) }) + t.Run("FileEditingContext", func(t *testing.T) { testFileEditingContext(t, factory) }) + t.Run("NodeDegreeByKinds", func(t *testing.T) { testNodeDegreeByKinds(t, factory) }) +} + +// -- fixture helpers --------------------------------------------------- + +func mkNode(id, name, file string, kind graph.NodeKind) *graph.Node { + return &graph.Node{ + ID: id, + Kind: kind, + Name: name, + FilePath: file, + StartLine: 1, + EndLine: 10, + Language: "go", + } +} + +func mkRepoNode(id, name, file, repo string, kind graph.NodeKind) *graph.Node { + n := mkNode(id, name, file, kind) + n.RepoPrefix = repo + return n +} + +func mkEdge(from, to string, kind graph.EdgeKind) *graph.Edge { + return &graph.Edge{ + From: from, To: to, Kind: kind, + FilePath: "test.go", Line: 1, + Confidence: 1.0, + Origin: graph.OriginASTResolved, + } +} + +func sortNodeIDs(nodes []*graph.Node) []string { + ids := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil { + ids = append(ids, n.ID) + } + } + sort.Strings(ids) + return ids +} + +func sortEdgeKeys(edges []*graph.Edge) []string { + keys := make([]string, 0, len(edges)) + for _, e := range edges { + if e != nil { + keys = append(keys, fmt.Sprintf("%s|%s|%s|%d", e.From, e.To, e.Kind, e.Line)) + } + } + sort.Strings(keys) + return keys +} + +// -- individual subtests ---------------------------------------------- + +func testAddGetNode(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + n := mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction) + s.AddNode(n) + got := s.GetNode("a.go::Foo") + if got == nil { + t.Fatalf("GetNode returned nil for inserted node") + } + if got.Name != "Foo" || got.FilePath != "a.go" || got.Kind != graph.KindFunction { + t.Fatalf("round-trip mismatch: %+v", got) + } + if s.GetNode("missing") != nil { + t.Fatalf("GetNode should return nil for missing key") + } +} + +func testAddGetEdge(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddEdge(mkEdge("a", "b", graph.EdgeCalls)) + + out := s.GetOutEdges("a") + if len(out) != 1 || out[0].To != "b" { + t.Fatalf("GetOutEdges(a) = %+v, want one edge to b", out) + } + in := s.GetInEdges("b") + if len(in) != 1 || in[0].From != "a" { + t.Fatalf("GetInEdges(b) = %+v, want one edge from a", in) + } +} + +func testAddNodeIdempotent(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + n := mkNode("dup", "Dup", "x.go", graph.KindFunction) + s.AddNode(n) + s.AddNode(n) + s.AddNode(n) + if s.NodeCount() != 1 { + t.Fatalf("NodeCount after 3x add = %d, want 1", s.NodeCount()) + } +} + +func testAddEdgeIdempotent(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + e := mkEdge("a", "b", graph.EdgeCalls) + s.AddEdge(e) + s.AddEdge(e) + s.AddEdge(e) + if got := len(s.GetOutEdges("a")); got != 1 { + t.Fatalf("OutEdges after 3x add = %d, want 1", got) + } +} + +func testAddEdgeLineDisambiguates(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("a", "b", graph.EdgeCalls) + e2.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + if got := len(s.GetOutEdges("a")); got != 2 { + t.Fatalf("OutEdges with different lines = %d, want 2", got) + } +} + +func testAddBatch(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + nodes := []*graph.Node{ + mkNode("a", "A", "x.go", graph.KindFunction), + mkNode("b", "B", "x.go", graph.KindFunction), + mkNode("c", "C", "y.go", graph.KindType), + } + edges := []*graph.Edge{ + mkEdge("a", "b", graph.EdgeCalls), + mkEdge("b", "c", graph.EdgeReferences), + } + s.AddBatch(nodes, edges) + if s.NodeCount() != 3 { + t.Fatalf("NodeCount after AddBatch = %d, want 3", s.NodeCount()) + } + if s.EdgeCount() != 2 { + t.Fatalf("EdgeCount after AddBatch = %d, want 2", s.EdgeCount()) + } +} + +func testRemoveEdge(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + e := mkEdge("a", "b", graph.EdgeCalls) + s.AddEdge(e) + if !s.RemoveEdge("a", "b", graph.EdgeCalls) { + t.Fatalf("RemoveEdge returned false for existing edge") + } + if len(s.GetOutEdges("a")) != 0 { + t.Fatalf("OutEdges after RemoveEdge = nonzero") + } + if len(s.GetInEdges("b")) != 0 { + t.Fatalf("InEdges after RemoveEdge = nonzero") + } + // Removing non-existent should report false but not panic. + if s.RemoveEdge("a", "b", graph.EdgeCalls) { + t.Fatalf("RemoveEdge returned true for missing edge") + } +} + +func testEvictFile(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Baz", "Baz", "b.go", graph.KindFunction)) + s.AddEdge(mkEdge("a.go::Foo", "a.go::Bar", graph.EdgeCalls)) + s.AddEdge(mkEdge("a.go::Bar", "b.go::Baz", graph.EdgeCalls)) + + nodesRemoved, edgesRemoved := s.EvictFile("a.go") + if nodesRemoved != 2 { + t.Fatalf("EvictFile nodesRemoved = %d, want 2", nodesRemoved) + } + if edgesRemoved == 0 { + t.Fatalf("EvictFile edgesRemoved should be > 0") + } + if s.GetNode("a.go::Foo") != nil { + t.Fatalf("evicted node still present") + } + if s.GetNode("b.go::Baz") == nil { + t.Fatalf("unrelated node was evicted") + } +} + +func testEvictFileNoNodes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + n, e := s.EvictFile("nonexistent.go") + if n != 0 || e != 0 { + t.Fatalf("EvictFile on empty file returned (%d, %d), want (0, 0)", n, e) + } +} + +func testEvictRepo(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r1/b.go::Bar", "Bar", "r1/b.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindFunction)) + s.AddEdge(mkEdge("r1/a.go::Foo", "r1/b.go::Bar", graph.EdgeCalls)) + + nodesRemoved, edgesRemoved := s.EvictRepo("r1") + if nodesRemoved != 2 { + t.Fatalf("EvictRepo nodesRemoved = %d, want 2", nodesRemoved) + } + if edgesRemoved == 0 { + t.Fatalf("EvictRepo edgesRemoved should be > 0") + } + if s.GetNode("r1/a.go::Foo") != nil { + t.Fatalf("r1 node still present") + } + if s.GetNode("r2/x.go::Baz") == nil { + t.Fatalf("r2 node was evicted") + } +} + +func testEvictRepoNoNodes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + n, e := s.EvictRepo("nonexistent-repo") + if n != 0 || e != 0 { + t.Fatalf("EvictRepo on missing repo returned (%d, %d), want (0, 0)", n, e) + } +} + +func testNodeAndEdgeCount(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + if s.NodeCount() != 0 || s.EdgeCount() != 0 { + t.Fatalf("empty store reports nonzero counts") + } + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddEdge(mkEdge("a", "b", graph.EdgeCalls)) + if s.NodeCount() != 2 { + t.Fatalf("NodeCount = %d, want 2", s.NodeCount()) + } + if s.EdgeCount() != 1 { + t.Fatalf("EdgeCount = %d, want 1", s.EdgeCount()) + } +} + +func testAllNodesAndEdges(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "y.go", graph.KindType)) + s.AddEdge(mkEdge("a", "b", graph.EdgeReferences)) + + gotN := sortNodeIDs(s.AllNodes()) + wantN := []string{"a", "b"} + if fmt.Sprint(gotN) != fmt.Sprint(wantN) { + t.Fatalf("AllNodes = %v, want %v", gotN, wantN) + } + gotE := sortEdgeKeys(s.AllEdges()) + if len(gotE) != 1 { + t.Fatalf("AllEdges = %v, want one entry", gotE) + } +} + +func testFindNodesByName(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Foo", "Foo", "b.go", graph.KindFunction)) + s.AddNode(mkNode("c.go::Bar", "Bar", "c.go", graph.KindFunction)) + got := sortNodeIDs(s.FindNodesByName("Foo")) + want := []string{"a.go::Foo", "b.go::Foo"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("FindNodesByName(Foo) = %v, want %v", got, want) + } + if len(s.FindNodesByName("MissingName")) != 0 { + t.Fatalf("FindNodesByName for missing name should be empty") + } +} + +func testFindNodesByNameInRepo(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/a.go::Foo", "Foo", "r2/a.go", "r2", graph.KindFunction)) + got := sortNodeIDs(s.FindNodesByNameInRepo("Foo", "r1")) + want := []string{"r1/a.go::Foo"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("FindNodesByNameInRepo(Foo, r1) = %v, want %v", got, want) + } +} + +func testFindNodesByNameContaining(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + // Three "log"-containing names + one unrelated. + s.AddNode(mkNode("a.go::Login", "Login", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::LoginHandler", "LoginHandler", "b.go", graph.KindFunction)) + s.AddNode(mkNode("c.go::Logout", "Logout", "c.go", graph.KindFunction)) + s.AddNode(mkNode("d.go::Unrelated", "Unrelated", "d.go", graph.KindFunction)) + + // Case-insensitive substring match should return exactly the 3 + // "log"-bearing nodes. + got := sortNodeIDs(s.FindNodesByNameContaining("log", 10)) + want := []string{"a.go::Login", "b.go::LoginHandler", "c.go::Logout"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("FindNodesByNameContaining(log, 10) = %v, want %v", got, want) + } + + // Mixed-case query — must still match (case-insensitive). + gotUpper := sortNodeIDs(s.FindNodesByNameContaining("LOG", 10)) + if fmt.Sprint(gotUpper) != fmt.Sprint(want) { + t.Fatalf("FindNodesByNameContaining(LOG, 10) = %v, want %v", gotUpper, want) + } + + // Limit is honoured. Asking for 2 must return at most 2. + gotLimited := s.FindNodesByNameContaining("log", 2) + if len(gotLimited) != 2 { + t.Fatalf("FindNodesByNameContaining(log, 2) returned %d, want 2", len(gotLimited)) + } + + // Empty needle returns nothing — never the whole graph. + if got := s.FindNodesByNameContaining("", 10); len(got) != 0 { + t.Fatalf("FindNodesByNameContaining(\"\") returned %d, want 0", len(got)) + } + + // No match — empty slice. + if got := s.FindNodesByNameContaining("nonexistent_substring_xyz", 10); len(got) != 0 { + t.Fatalf("FindNodesByNameContaining(no-match) returned %d, want 0", len(got)) + } +} + +func testGetFileNodes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Baz", "Baz", "b.go", graph.KindFunction)) + got := sortNodeIDs(s.GetFileNodes("a.go")) + want := []string{"a.go::Bar", "a.go::Foo"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("GetFileNodes(a.go) = %v, want %v", got, want) + } +} + +func testGetRepoNodes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r1/b.go::Bar", "Bar", "r1/b.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindFunction)) + got := sortNodeIDs(s.GetRepoNodes("r1")) + want := []string{"r1/a.go::Foo", "r1/b.go::Bar"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("GetRepoNodes(r1) = %v, want %v", got, want) + } +} + +// testGetRepoEdges asserts that GetRepoEdges returns every edge whose +// SOURCE node carries the requested RepoPrefix, regardless of where +// the target lives — same-repo intra edges, cross-repo edges (source +// in r1 → target in r2), AND unresolved::* targets all count. Edges +// whose source is in a different repo (or unscoped) MUST NOT appear. +// Empty prefix returns nil so callers don't accidentally fall through +// to a full-graph scan. +func testGetRepoEdges(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + // r1 has two nodes that originate outgoing edges; r2 has a target + // node and one of its own source nodes. + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r1/b.go::Bar", "Bar", "r1/b.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/y.go::Qux", "Qux", "r2/y.go", "r2", graph.KindFunction)) + + // r1-intra (Foo → Bar) — same repo. + s.AddEdge(mkEdge("r1/a.go::Foo", "r1/b.go::Bar", graph.EdgeCalls)) + // r1 → r2 cross-repo (Foo → Baz). + s.AddEdge(mkEdge("r1/a.go::Foo", "r2/x.go::Baz", graph.EdgeCalls)) + // r1 → unresolved (Bar → unresolved::Missing) — counts because + // source is in r1. + s.AddEdge(mkEdge("r1/b.go::Bar", "unresolved::Missing", graph.EdgeCalls)) + // r2-intra (Qux → Baz) — MUST NOT appear in r1's slice. + s.AddEdge(mkEdge("r2/y.go::Qux", "r2/x.go::Baz", graph.EdgeCalls)) + // r2 → r1 cross-repo (Qux → Foo) — MUST NOT appear in r1's slice + // because the source is in r2. + s.AddEdge(mkEdge("r2/y.go::Qux", "r1/a.go::Foo", graph.EdgeCalls)) + + gotR1 := sortEdgeKeys(s.GetRepoEdges("r1")) + wantR1 := sortEdgeKeys([]*graph.Edge{ + mkEdge("r1/a.go::Foo", "r1/b.go::Bar", graph.EdgeCalls), + mkEdge("r1/a.go::Foo", "r2/x.go::Baz", graph.EdgeCalls), + mkEdge("r1/b.go::Bar", "unresolved::Missing", graph.EdgeCalls), + }) + if fmt.Sprint(gotR1) != fmt.Sprint(wantR1) { + t.Fatalf("GetRepoEdges(r1) =\n %v\nwant\n %v", gotR1, wantR1) + } + + gotR2 := sortEdgeKeys(s.GetRepoEdges("r2")) + wantR2 := sortEdgeKeys([]*graph.Edge{ + mkEdge("r2/y.go::Qux", "r2/x.go::Baz", graph.EdgeCalls), + mkEdge("r2/y.go::Qux", "r1/a.go::Foo", graph.EdgeCalls), + }) + if fmt.Sprint(gotR2) != fmt.Sprint(wantR2) { + t.Fatalf("GetRepoEdges(r2) =\n %v\nwant\n %v", gotR2, wantR2) + } + + // Empty prefix MUST return nothing (use AllEdges for the global + // view). Disk backends must not fall through to a full scan. + if got := s.GetRepoEdges(""); len(got) != 0 { + t.Fatalf("GetRepoEdges(\"\") = %d edges, want 0", len(got)) + } + + // Unknown prefix MUST return empty (no panic, no fallthrough). + if got := s.GetRepoEdges("nope"); len(got) != 0 { + t.Fatalf("GetRepoEdges(nope) = %d edges, want 0", len(got)) + } +} + +func testGetNodeByQualName(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + n := mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction) + n.QualName = "pkg.Foo" + s.AddNode(n) + got := s.GetNodeByQualName("pkg.Foo") + if got == nil || got.ID != "a.go::Foo" { + t.Fatalf("GetNodeByQualName(pkg.Foo) = %v, want a.go::Foo", got) + } + if s.GetNodeByQualName("missing.Qual") != nil { + t.Fatalf("GetNodeByQualName missing should be nil") + } +} + +func testStats(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "y.go", graph.KindType)) + s.AddEdge(mkEdge("a", "b", graph.EdgeReferences)) + st := s.Stats() + if st.TotalNodes != 2 || st.TotalEdges != 1 { + t.Fatalf("Stats = %+v, want TotalNodes=2, TotalEdges=1", st) + } + if st.ByKind[string(graph.KindFunction)] != 1 || st.ByKind[string(graph.KindType)] != 1 { + t.Fatalf("Stats.ByKind = %v, want one each", st.ByKind) + } +} + +func testRepoStats(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindType)) + st := s.RepoStats() + if len(st) != 2 { + t.Fatalf("RepoStats has %d entries, want 2", len(st)) + } + if st["r1"].TotalNodes != 1 { + t.Fatalf("RepoStats[r1].TotalNodes = %d, want 1", st["r1"].TotalNodes) + } +} + +func testRepoPrefixes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindType)) + got := s.RepoPrefixes() + sort.Strings(got) + want := []string{"r1", "r2"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("RepoPrefixes = %v, want %v", got, want) + } +} + +func testSetEdgeProvenance(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + e := mkEdge("a", "b", graph.EdgeCalls) + e.Origin = graph.OriginTextMatched + s.AddEdge(e) + + bumped := s.SetEdgeProvenance(e, graph.OriginLSPResolved) + if !bumped { + t.Fatalf("SetEdgeProvenance returned false for real upgrade") + } + out := s.GetOutEdges("a") + if len(out) != 1 || out[0].Origin != graph.OriginLSPResolved { + t.Fatalf("Origin did not propagate: %+v", out) + } +} + +func testReindexEdges(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + // Build a small graph with three out-edges from "a" pointing at + // three different targets, then re-bind all three to a fourth + // target in one batched call. + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "x.go", graph.KindFunction)) + s.AddNode(mkNode("d", "D", "x.go", graph.KindFunction)) + s.AddNode(mkNode("z", "Z", "x.go", graph.KindFunction)) + + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("a", "c", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("a", "d", graph.EdgeCalls) + e3.Line = 3 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + + // Mutate each edge's To, then hand the batch over. After the + // call, all three edges must show as in-edges of z; none of the + // originals must remain. + e1.To, e2.To, e3.To = "z", "z", "z" + s.ReindexEdges([]graph.EdgeReindex{ + {Edge: e1, OldTo: "b"}, + {Edge: e2, OldTo: "c"}, + {Edge: e3, OldTo: "d"}, + }) + + for _, oldID := range []string{"b", "c", "d"} { + if got := len(s.GetInEdges(oldID)); got != 0 { + t.Fatalf("GetInEdges(%q) after batch reindex = %d, want 0", oldID, got) + } + } + if got := len(s.GetInEdges("z")); got != 3 { + t.Fatalf("GetInEdges(z) after batch reindex = %d, want 3", got) + } + if got := len(s.GetOutEdges("a")); got != 3 { + t.Fatalf("GetOutEdges(a) after batch reindex = %d, want 3", got) + } + + // Empty batch is a no-op. + s.ReindexEdges(nil) + s.ReindexEdges([]graph.EdgeReindex{}) +} + +func testSetEdgeProvenanceBatch(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e1.Origin = graph.OriginTextMatched + e2 := mkEdge("a", "b", graph.EdgeCalls) + e2.Line = 2 + e2.Origin = graph.OriginTextMatched + e3 := mkEdge("a", "b", graph.EdgeCalls) + e3.Line = 3 + e3.Origin = graph.OriginLSPResolved // already at target tier — should be no-op + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + + changed := s.SetEdgeProvenanceBatch([]graph.EdgeProvenanceUpdate{ + {Edge: e1, NewOrigin: graph.OriginLSPResolved}, + {Edge: e2, NewOrigin: graph.OriginLSPResolved}, + {Edge: e3, NewOrigin: graph.OriginLSPResolved}, + }) + if changed != 2 { + t.Fatalf("SetEdgeProvenanceBatch reported %d changed, want 2 (one was already at target tier)", changed) + } + // Verify both promotions landed in the persisted edges. + out := s.GetOutEdges("a") + if len(out) != 3 { + t.Fatalf("GetOutEdges(a) = %d, want 3", len(out)) + } + for _, e := range out { + if e.Origin != graph.OriginLSPResolved { + t.Fatalf("edge %s->%s Origin = %q, want lsp_resolved", e.From, e.To, e.Origin) + } + } + + // Empty batch is a no-op and returns 0. + if got := s.SetEdgeProvenanceBatch(nil); got != 0 { + t.Fatalf("empty batch returned %d, want 0", got) + } + if got := s.SetEdgeProvenanceBatch([]graph.EdgeProvenanceUpdate{}); got != 0 { + t.Fatalf("empty batch returned %d, want 0", got) + } +} + +func testReindexEdge(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("old", "Old", "x.go", graph.KindFunction)) + s.AddNode(mkNode("new", "New", "x.go", graph.KindFunction)) + e := mkEdge("a", "old", graph.EdgeCalls) + s.AddEdge(e) + + e.To = "new" + s.ReindexEdge(e, "old") + + if got := len(s.GetInEdges("old")); got != 0 { + t.Fatalf("InEdges(old) after reindex = %d, want 0", got) + } + in := s.GetInEdges("new") + if len(in) != 1 || in[0].From != "a" { + t.Fatalf("InEdges(new) = %+v, want one edge from a", in) + } +} + +func testConcurrency(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + const workers = 8 + const perWorker = 50 + var wg sync.WaitGroup + for w := range workers { + wg.Add(1) + go func(w int) { + defer wg.Done() + for i := range perWorker { + id := fmt.Sprintf("w%d/n%d", w, i) + s.AddNode(mkNode(id, fmt.Sprintf("N%d", i), fmt.Sprintf("f%d.go", w), graph.KindFunction)) + } + }(w) + } + wg.Wait() + if got, want := s.NodeCount(), workers*perWorker; got != want { + t.Fatalf("concurrent NodeCount = %d, want %d", got, want) + } +} + +func testEdgeIdentityRevisions(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + // Just ensure the method exists and returns a non-negative int. + // The semantic invariant ("bumps on origin change") is + // implementation-defined; backends may return 0 if they don't + // track this. + if got := s.EdgeIdentityRevisions(); got < 0 { + t.Fatalf("EdgeIdentityRevisions negative: %d", got) + } +} + +func testVerifyEdgeIdentities(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddEdge(mkEdge("a", "b", graph.EdgeCalls)) + if err := s.VerifyEdgeIdentities(); err != nil { + t.Fatalf("VerifyEdgeIdentities on consistent store: %v", err) + } +} + +func testRepoMemoryEstimate(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + // Backends may return zero (disk/remote) or a real estimate + // (in-memory). The contract is that the call succeeds and + // NodeCount matches what we inserted. + est := s.RepoMemoryEstimate("r1") + if est.NodeCount != 1 { + t.Fatalf("RepoMemoryEstimate NodeCount = %d, want 1", est.NodeCount) + } +} + +func testAllRepoMemoryEstimates(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkRepoNode("r1/a.go::Foo", "Foo", "r1/a.go", "r1", graph.KindFunction)) + s.AddNode(mkRepoNode("r2/x.go::Baz", "Baz", "r2/x.go", "r2", graph.KindFunction)) + all := s.AllRepoMemoryEstimates() + if len(all) != 2 { + t.Fatalf("AllRepoMemoryEstimates len = %d, want 2", len(all)) + } +} + +func testMetaPreserved(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + n := mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction) + n.Meta = map[string]any{ + "signature": "func Foo(x int) error", + "visibility": "public", + } + s.AddNode(n) + got := s.GetNode("a.go::Foo") + if got == nil { + t.Fatalf("GetNode returned nil") + } + if got.Meta == nil { + t.Fatalf("Meta not preserved") + } + if got.Meta["signature"] != "func Foo(x int) error" { + t.Fatalf("Meta[signature] = %v", got.Meta["signature"]) + } + if got.Meta["visibility"] != "public" { + t.Fatalf("Meta[visibility] = %v", got.Meta["visibility"]) + } +} + +func testEdgesByKind(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "y.go", graph.KindType)) + + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("a", "b", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("a", "c", graph.EdgeReferences) + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + + var calls []*graph.Edge + for e := range s.EdgesByKind(graph.EdgeCalls) { + calls = append(calls, e) + } + if len(calls) != 2 { + t.Fatalf("EdgesByKind(EdgeCalls) yielded %d, want 2", len(calls)) + } + for _, e := range calls { + if e.Kind != graph.EdgeCalls { + t.Fatalf("yielded edge has wrong kind: %s", e.Kind) + } + } + + var refs []*graph.Edge + for e := range s.EdgesByKind(graph.EdgeReferences) { + refs = append(refs, e) + } + if len(refs) != 1 { + t.Fatalf("EdgesByKind(EdgeReferences) yielded %d, want 1", len(refs)) + } + + // Unknown kind yields nothing. + count := 0 + for range s.EdgesByKind(graph.EdgeKind("nonexistent")) { + count++ + } + if count != 0 { + t.Fatalf("EdgesByKind(nonexistent) yielded %d, want 0", count) + } + + // Early stop honours the contract. + stopped := 0 + for range s.EdgesByKind(graph.EdgeCalls) { + stopped++ + if stopped == 1 { + break + } + } + if stopped != 1 { + t.Fatalf("early stop yielded %d before break, want 1", stopped) + } +} + +func testNodesByKind(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "y.go", graph.KindType)) + + var fns []*graph.Node + for n := range s.NodesByKind(graph.KindFunction) { + fns = append(fns, n) + } + if len(fns) != 2 { + t.Fatalf("NodesByKind(KindFunction) yielded %d, want 2", len(fns)) + } + for _, n := range fns { + if n.Kind != graph.KindFunction { + t.Fatalf("yielded node has wrong kind: %s", n.Kind) + } + } + + var types []*graph.Node + for n := range s.NodesByKind(graph.KindType) { + types = append(types, n) + } + if len(types) != 1 { + t.Fatalf("NodesByKind(KindType) yielded %d, want 1", len(types)) + } + + // Early stop honours the contract. + stopped := 0 + for range s.NodesByKind(graph.KindFunction) { + stopped++ + if stopped == 1 { + break + } + } + if stopped != 1 { + t.Fatalf("early stop yielded %d before break, want 1", stopped) + } +} + +func testEdgesWithUnresolvedTarget(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + + e1 := mkEdge("a", "b", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("a", "unresolved::Foo", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("a", "unresolved::Bar", graph.EdgeCalls) + e3.Line = 3 + e4 := mkEdge("a", "resolved", graph.EdgeCalls) + e4.Line = 4 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + + var unres []*graph.Edge + for e := range s.EdgesWithUnresolvedTarget() { + unres = append(unres, e) + } + if len(unres) != 2 { + t.Fatalf("EdgesWithUnresolvedTarget yielded %d, want 2", len(unres)) + } + for _, e := range unres { + if !strings.HasPrefix(e.To, "unresolved::") { + t.Fatalf("yielded edge has non-unresolved To: %s", e.To) + } + } +} + +func testEmptyStore(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + if s.NodeCount() != 0 { + t.Fatalf("empty NodeCount = %d, want 0", s.NodeCount()) + } + if s.EdgeCount() != 0 { + t.Fatalf("empty EdgeCount = %d, want 0", s.EdgeCount()) + } + if len(s.AllNodes()) != 0 { + t.Fatalf("empty AllNodes nonzero") + } + if len(s.AllEdges()) != 0 { + t.Fatalf("empty AllEdges nonzero") + } + if len(s.RepoPrefixes()) != 0 { + t.Fatalf("empty RepoPrefixes nonzero") + } +} + +func testGetNodesByIDs(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Baz", "Baz", "b.go", graph.KindType)) + + got := s.GetNodesByIDs([]string{"a.go::Foo", "b.go::Baz", "missing", "a.go::Bar", "a.go::Foo"}) + if len(got) != 3 { + t.Fatalf("GetNodesByIDs len = %d, want 3 (3 present, 1 missing, 1 duplicate)", len(got)) + } + if got["a.go::Foo"] == nil || got["a.go::Foo"].Name != "Foo" { + t.Fatalf("missing or wrong Foo: %v", got["a.go::Foo"]) + } + if got["b.go::Baz"] == nil || got["b.go::Baz"].Kind != graph.KindType { + t.Fatalf("missing or wrong Baz: %v", got["b.go::Baz"]) + } + if _, present := got["missing"]; present { + t.Fatalf("missing ID should not be in map, got %v", got["missing"]) + } + + // Empty / nil input is a no-op. + if got := s.GetNodesByIDs(nil); len(got) != 0 { + t.Fatalf("nil input returned %d entries", len(got)) + } + if got := s.GetNodesByIDs([]string{}); len(got) != 0 { + t.Fatalf("empty input returned %d entries", len(got)) + } + if got := s.GetNodesByIDs([]string{""}); len(got) != 0 { + t.Fatalf("empty-string ID returned %d entries", len(got)) + } +} + +func testFindNodesByNames(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("b.go::Foo", "Foo", "b.go", graph.KindFunction)) + s.AddNode(mkNode("c.go::Bar", "Bar", "c.go", graph.KindFunction)) + + got := s.FindNodesByNames([]string{"Foo", "Missing", "Bar", "Foo"}) + if len(got) != 2 { + t.Fatalf("FindNodesByNames len = %d, want 2 (2 present, 1 missing, 1 duplicate)", len(got)) + } + foos := got["Foo"] + if len(foos) != 2 { + t.Fatalf("Foo matches = %d, want 2", len(foos)) + } + for _, n := range foos { + if n.Name != "Foo" { + t.Fatalf("matched node has wrong Name: %s", n.Name) + } + } + bars := got["Bar"] + if len(bars) != 1 || bars[0].Name != "Bar" { + t.Fatalf("Bar matches wrong: %v", bars) + } + if _, present := got["Missing"]; present { + t.Fatalf("missing name should not be in map") + } + + // Empty / nil input. + if got := s.FindNodesByNames(nil); len(got) != 0 { + t.Fatalf("nil input returned %d entries", len(got)) + } + if got := s.FindNodesByNames([]string{}); len(got) != 0 { + t.Fatalf("empty input returned %d entries", len(got)) + } +} + +// testGetEdgesByNodeIDs covers the batched fan-in / fan-out edge +// lookups. Builds a small graph with mixed fan-in/out, calls both +// methods with a mix of present and missing ids (plus an empty +// string), and asserts the per-id slices match what GetInEdges / +// GetOutEdges would return individually. +func testGetEdgesByNodeIDs(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + // Nodes + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "y.go", graph.KindFunction)) + s.AddNode(mkNode("d", "D", "y.go", graph.KindFunction)) + // Edges: a→b, a→c, b→c, d→c (so c has 3 in-edges, a has 2 out-edges). + s.AddEdge(mkEdge("a", "b", graph.EdgeCalls)) + s.AddEdge(mkEdge("a", "c", graph.EdgeCalls)) + s.AddEdge(mkEdge("b", "c", graph.EdgeCalls)) + s.AddEdge(mkEdge("d", "c", graph.EdgeReferences)) + + // --- GetOutEdgesByNodeIDs --- + outMap := s.GetOutEdgesByNodeIDs([]string{"a", "b", "d", "missing", "a"}) + // a has 2 out-edges (a→b, a→c). + if got := sortEdgeKeys(outMap["a"]); len(got) != 2 { + t.Fatalf("GetOutEdgesByNodeIDs[a] = %v, want 2 edges", got) + } + // b has 1 out-edge (b→c). + if got := outMap["b"]; len(got) != 1 || got[0].To != "c" { + t.Fatalf("GetOutEdgesByNodeIDs[b] = %v, want one edge to c", got) + } + // d has 1 out-edge (d→c). + if got := outMap["d"]; len(got) != 1 || got[0].To != "c" { + t.Fatalf("GetOutEdgesByNodeIDs[d] = %v, want one edge to c", got) + } + // missing key — range over nil is a no-op, so callers can index + // without an ok-check. + if got := outMap["missing"]; len(got) != 0 { + t.Fatalf("GetOutEdgesByNodeIDs[missing] = %v, want empty", got) + } + + // --- GetInEdgesByNodeIDs --- + inMap := s.GetInEdgesByNodeIDs([]string{"a", "b", "c", "missing"}) + // a has 0 in-edges. + if got := inMap["a"]; len(got) != 0 { + t.Fatalf("GetInEdgesByNodeIDs[a] = %v, want empty", got) + } + // b has 1 in-edge (a→b). + if got := inMap["b"]; len(got) != 1 || got[0].From != "a" { + t.Fatalf("GetInEdgesByNodeIDs[b] = %v, want one edge from a", got) + } + // c has 3 in-edges (a→c, b→c, d→c). + if got := inMap["c"]; len(got) != 3 { + t.Fatalf("GetInEdgesByNodeIDs[c] = %v, want 3 edges", got) + } + froms := map[string]bool{} + for _, e := range inMap["c"] { + froms[e.From] = true + } + for _, want := range []string{"a", "b", "d"} { + if !froms[want] { + t.Fatalf("GetInEdgesByNodeIDs[c] missing edge from %q", want) + } + } + if got := inMap["missing"]; len(got) != 0 { + t.Fatalf("GetInEdgesByNodeIDs[missing] = %v, want empty", got) + } + + // Empty / nil / empty-string inputs are no-ops. + if got := s.GetOutEdgesByNodeIDs(nil); len(got) != 0 { + t.Fatalf("GetOutEdgesByNodeIDs(nil) returned %d entries", len(got)) + } + if got := s.GetInEdgesByNodeIDs(nil); len(got) != 0 { + t.Fatalf("GetInEdgesByNodeIDs(nil) returned %d entries", len(got)) + } + if got := s.GetOutEdgesByNodeIDs([]string{}); len(got) != 0 { + t.Fatalf("GetOutEdgesByNodeIDs([]) returned %d entries", len(got)) + } + if got := s.GetInEdgesByNodeIDs([]string{""}); len(got) != 0 { + t.Fatalf("GetInEdgesByNodeIDs([\"\"]) returned %d entries", len(got)) + } +} + +// testSymbolBundleSearcher exercises the optional +// graph.SymbolBundleSearcher capability. The interface is opt-in +// (today only the Ladybug backend implements it; the in-memory +// *Graph deliberately leaves it unimplemented so the engine's +// fallback path stays exercised) — backends without the capability +// skip the subtest cleanly. +// +// Coverage: +// - SymbolSearcher.BulkUpsertSymbolFTS + BuildSymbolIndex must be +// called first so the FTS index is populated. +// - SearchSymbolBundles returns a bundle per matched id with the +// correct in/out edges attached. +// - Empty / no-match query returns an empty bundle slice. +func testSymbolBundleSearcher(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + bs, ok := s.(graph.SymbolBundleSearcher) + if !ok { + t.Skip("backend does not implement graph.SymbolBundleSearcher") + } + ss, ok := s.(graph.SymbolSearcher) + if !ok { + t.Skip("backend implements SymbolBundleSearcher but not SymbolSearcher — cannot populate FTS") + } + + // Build a small graph: A → B → C, plus an unrelated isolated D. + // FTS-searchable name tokens that should land on the same hit. + s.AddNode(mkNode("a", "AlphaWidget", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "BetaWidget", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "GammaWidget", "y.go", graph.KindFunction)) + s.AddNode(mkNode("d", "Delta", "y.go", graph.KindFunction)) + s.AddEdge(mkEdge("a", "b", graph.EdgeCalls)) + s.AddEdge(mkEdge("b", "c", graph.EdgeCalls)) + s.AddEdge(mkEdge("a", "c", graph.EdgeCalls)) + + // Populate the FTS sidecar — every searchable node carries its + // tokenised name as the FTS text. + items := []graph.SymbolFTSItem{ + {NodeID: "a", Tokens: "alpha widget"}, + {NodeID: "b", Tokens: "beta widget"}, + {NodeID: "c", Tokens: "gamma widget"}, + {NodeID: "d", Tokens: "delta"}, + } + if err := ss.BulkUpsertSymbolFTS("", items); err != nil { + t.Fatalf("BulkUpsertSymbolFTS: %v", err) + } + if err := ss.BuildSymbolIndex(); err != nil { + t.Fatalf("BuildSymbolIndex: %v", err) + } + + // Querying for "widget" should match a/b/c and not d. Each bundle + // must carry the correct in/out edges off the graph. + bundles, err := bs.SearchSymbolBundles("widget", 10) + if err != nil { + t.Fatalf("SearchSymbolBundles: %v", err) + } + if len(bundles) == 0 { + t.Fatalf("SearchSymbolBundles returned no bundles — expected matches for a/b/c") + } + gotIDs := make(map[string]graph.SymbolBundle, len(bundles)) + for _, b := range bundles { + if b.Node == nil { + t.Fatalf("bundle has nil node: %+v", b) + } + gotIDs[b.Node.ID] = b + } + for _, want := range []string{"a", "b", "c"} { + if _, ok := gotIDs[want]; !ok { + t.Fatalf("missing bundle for id %q; got ids=%v", want, idsOf(bundles)) + } + } + if _, ok := gotIDs["d"]; ok { + t.Fatalf("unexpected bundle for id %q (no 'widget' token in its FTS row)", "d") + } + + // Edge verification: per-bundle in/out edges must match the + // in-memory truth surfaced via the existing GetIn/Out edges. + for id, b := range gotIDs { + wantOut := s.GetOutEdges(id) + if !edgeSlicesMatch(wantOut, b.OutEdges) { + t.Fatalf("bundle[%s].OutEdges mismatch: want=%v got=%v", id, edgeKeys(wantOut), edgeKeys(b.OutEdges)) + } + wantIn := s.GetInEdges(id) + if !edgeSlicesMatch(wantIn, b.InEdges) { + t.Fatalf("bundle[%s].InEdges mismatch: want=%v got=%v", id, edgeKeys(wantIn), edgeKeys(b.InEdges)) + } + } + + // Empty query is a clean no-op. + if empty, err := bs.SearchSymbolBundles("", 10); err != nil || len(empty) != 0 { + t.Fatalf("SearchSymbolBundles(\"\"): err=%v len=%d, want empty", err, len(empty)) + } + // No-match query — backend MAY return nil or empty slice; both + // are valid. + if no, err := bs.SearchSymbolBundles("nomatchforanything", 10); err != nil { + t.Fatalf("SearchSymbolBundles(nomatch): err=%v", err) + } else if len(no) != 0 { + t.Fatalf("SearchSymbolBundles(nomatch) returned %d bundles, want 0", len(no)) + } +} + +// idsOf is a small helper for the bundle assertions above. +func idsOf(bs []graph.SymbolBundle) []string { + out := make([]string, 0, len(bs)) + for _, b := range bs { + if b.Node != nil { + out = append(out, b.Node.ID) + } + } + sort.Strings(out) + return out +} + +// edgeSlicesMatch reports whether two edge slices contain the same +// (from, to, kind) tuples regardless of order. Used by the bundle +// assertions to ignore back-end-imposed ordering differences. +func edgeSlicesMatch(want, got []*graph.Edge) bool { + if len(want) != len(got) { + return false + } + wantKeys := edgeKeys(want) + gotKeys := edgeKeys(got) + sort.Strings(wantKeys) + sort.Strings(gotKeys) + for i := range wantKeys { + if wantKeys[i] != gotKeys[i] { + return false + } + } + return true +} + +// edgeKeys flattens a slice of edges into deterministic (from→to:kind) +// strings for ordered diffing. +func edgeKeys(es []*graph.Edge) []string { + out := make([]string, 0, len(es)) + for _, e := range es { + if e == nil { + continue + } + out = append(out, fmt.Sprintf("%s->%s:%s", e.From, e.To, e.Kind)) + } + return out +} + +// testDeadCodeCandidator exercises the optional +// graph.DeadCodeCandidator capability. Builds a small graph with +// nodes that fall into each filter case the analyzer cares about: +// +// - zero in-edges (dead). +// - in-edges of disallowed kind only (dead). +// - in-edges of allowed kind (alive). +// - mixed kinds across the candidate set (per-row allowlist must apply). +// +// The in-memory *graph.Graph implements this; Ladybug overrides with +// a server-side Cypher query. Both must return the same candidate set. +func testDeadCodeCandidator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + dc, ok := s.(graph.DeadCodeCandidator) + if !ok { + t.Skip("backend does not implement graph.DeadCodeCandidator") + } + + // Functions: AliveFunc (called), DeadFunc (no in-edges), + // ReadOnlyFunc (only EdgeReads — disallowed for KindFunction). + s.AddNode(mkNode("AliveFunc", "AliveFunc", "a.go", graph.KindFunction)) + s.AddNode(mkNode("DeadFunc", "DeadFunc", "a.go", graph.KindFunction)) + s.AddNode(mkNode("ReadOnlyFunc", "ReadOnlyFunc", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Caller", "Caller", "a.go", graph.KindFunction)) + // Types: AliveType (referenced), DeadType (no in-edges). + s.AddNode(mkNode("AliveType", "AliveType", "b.go", graph.KindType)) + s.AddNode(mkNode("DeadType", "DeadType", "b.go", graph.KindType)) + // Methods: AliveMethod (called), DeadMethod (no in-edges). + s.AddNode(mkNode("AliveMethod", "AliveMethod", "c.go", graph.KindMethod)) + s.AddNode(mkNode("DeadMethod", "DeadMethod", "c.go", graph.KindMethod)) + + // Edges that exercise the per-kind allowlist. + e1 := mkEdge("Caller", "AliveFunc", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("Caller", "ReadOnlyFunc", graph.EdgeReads) + e2.Line = 2 + e3 := mkEdge("Caller", "AliveMethod", graph.EdgeCalls) + e3.Line = 3 + e4 := mkEdge("Caller", "AliveType", graph.EdgeReferences) + e4.Line = 4 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + + // Per-kind allowlist mirrors analysis.incomingUsageKinds for the + // three kinds under test. Functions are alive on Calls/References; + // methods on Calls/Implements; types on References/Instantiates. + allowedKinds := []graph.NodeKind{ + graph.KindFunction, + graph.KindMethod, + graph.KindType, + } + allowedInEdges := map[graph.NodeKind][]graph.EdgeKind{ + graph.KindFunction: {graph.EdgeCalls, graph.EdgeReferences}, + graph.KindMethod: {graph.EdgeCalls, graph.EdgeImplements}, + graph.KindType: {graph.EdgeReferences, graph.EdgeInstantiates}, + } + + got := dc.DeadCodeCandidates(allowedKinds, allowedInEdges) + gotIDs := sortNodeIDs(got) + // Caller has zero in-edges of any kind, so it surfaces too — the + // analyzer's per-kind allowlist would also flag it as a candidate + // here. The backend's job is just the candidate set; post-filters + // (exported / test / entry-point) run in Go. + want := []string{"Caller", "DeadFunc", "DeadMethod", "DeadType", "ReadOnlyFunc"} + if fmt.Sprint(gotIDs) != fmt.Sprint(want) { + t.Fatalf("DeadCodeCandidates = %v\nwant %v", gotIDs, want) + } + + // Empty kind list returns nothing — never the whole graph. + if got := dc.DeadCodeCandidates(nil, allowedInEdges); len(got) != 0 { + t.Fatalf("DeadCodeCandidates(nil) = %d, want 0", len(got)) + } + + // Empty per-kind allowlist means "any incoming edge counts as + // usage" — AliveFunc and ReadOnlyFunc (both have *some* in-edge) + // drop out; only DeadFunc + Caller remain among functions. + anyKind := map[graph.NodeKind][]graph.EdgeKind{ + graph.KindFunction: nil, + } + gotAny := dc.DeadCodeCandidates([]graph.NodeKind{graph.KindFunction}, anyKind) + gotAnyIDs := sortNodeIDs(gotAny) + wantAny := []string{"Caller", "DeadFunc"} + if fmt.Sprint(gotAnyIDs) != fmt.Sprint(wantAny) { + t.Fatalf("DeadCodeCandidates(any-kind) = %v\nwant %v", gotAnyIDs, wantAny) + } +} + +// testIfaceImplementsScanner exercises the optional +// graph.IfaceImplementsScanner capability. Seeds two interfaces (one +// with methods Meta, one without) plus a type that implements each; +// the row set must include only the (type, iface) tuple whose target +// has a Meta["methods"] payload — the no-meta interface drops out. +func testIfaceImplementsScanner(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scanner, ok := s.(graph.IfaceImplementsScanner) + if !ok { + t.Skip("backend does not implement graph.IfaceImplementsScanner") + } + + // Interface with required methods. + ifaceA := mkNode("iface_A", "Reader", "a.go", graph.KindInterface) + ifaceA.Meta = map[string]any{"methods": []string{"Read", "Close"}} + s.AddNode(ifaceA) + // Interface with no Meta — must not appear in the row set. + ifaceB := mkNode("iface_B", "Empty", "a.go", graph.KindInterface) + s.AddNode(ifaceB) + // Implementing type for each. + s.AddNode(mkNode("type_A", "ReaderImpl", "a.go", graph.KindType)) + s.AddNode(mkNode("type_B", "EmptyImpl", "a.go", graph.KindType)) + s.AddEdge(mkEdge("type_A", "iface_A", graph.EdgeImplements)) + s.AddEdge(mkEdge("type_B", "iface_B", graph.EdgeImplements)) + + rows := scanner.IfaceImplementsRows() + if len(rows) != 1 { + t.Fatalf("IfaceImplementsRows len = %d, want 1 (iface_B has no Meta)", len(rows)) + } + r := rows[0] + if r.TypeID != "type_A" || r.IfaceID != "iface_A" { + t.Fatalf("row = %+v, want type_A → iface_A", r) + } + if r.IfaceMeta == nil { + t.Fatalf("IfaceMeta is nil") + } + raw, ok := r.IfaceMeta["methods"] + if !ok { + t.Fatalf("IfaceMeta missing methods key: %+v", r.IfaceMeta) + } + // Meta encoding round-trips lists differently between backends + // (in-memory keeps []string; gob-encoded comes back as []any). + // Accept either. + var methods []string + switch v := raw.(type) { + case []string: + methods = v + case []any: + for _, m := range v { + if str, ok := m.(string); ok { + methods = append(methods, str) + } + } + default: + t.Fatalf("unexpected methods type %T: %v", raw, raw) + } + sort.Strings(methods) + if fmt.Sprint(methods) != fmt.Sprint([]string{"Close", "Read"}) { + t.Fatalf("methods = %v, want [Close Read]", methods) + } +} + +// testNodeDegreeAggregator exercises the optional +// graph.NodeDegreeAggregator capability. Builds a small graph with +// nodes that cover every classification branch +// graph.GraphConnectivity / graph.ClassifyZeroEdge care about: +// +// - isolated (zero edges). +// - leaf (exactly one edge in either direction). +// - usage-edge in-bound only (alive — at least one EdgeCalls in). +// - non-usage-edge in-bound only (no EdgeCalls / EdgeReferences / +// etc — counts as "likely unused"). +// - usage-edge mixed with non-usage in-edges (still alive). +// - unknown id (must be elided). +func testNodeDegreeAggregator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + dc, ok := s.(graph.NodeDegreeAggregator) + if !ok { + t.Skip("backend does not implement graph.NodeDegreeAggregator") + } + + s.AddNode(mkNode("Isolated", "Isolated", "a.go", graph.KindFunction)) + s.AddNode(mkNode("LeafSink", "LeafSink", "a.go", graph.KindFunction)) + s.AddNode(mkNode("LeafSource", "LeafSource", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Alive", "Alive", "a.go", graph.KindFunction)) + s.AddNode(mkNode("StructuralOnly", "StructuralOnly", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Mixed", "Mixed", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Caller", "Caller", "a.go", graph.KindFunction)) + s.AddNode(mkNode("FileNode", "FileNode", "a.go", graph.KindFile)) + + // One incoming call into LeafSink → leaf (in_count=1, out_count=0). + e1 := mkEdge("Caller", "LeafSink", graph.EdgeCalls) + e1.Line = 1 + s.AddEdge(e1) + // One outgoing reference from LeafSource → leaf (in=0, out=1). + e2 := mkEdge("LeafSource", "Caller", graph.EdgeReferences) + e2.Line = 2 + s.AddEdge(e2) + // Alive: incoming call → alive (in=1 usage). + e3 := mkEdge("Caller", "Alive", graph.EdgeCalls) + e3.Line = 3 + s.AddEdge(e3) + // StructuralOnly: incoming EdgeDefines (NOT a usage kind) → + // classified as "likely unused" but not isolated. + e4 := mkEdge("FileNode", "StructuralOnly", graph.EdgeDefines) + e4.Line = 4 + s.AddEdge(e4) + // Mixed: incoming EdgeDefines (non-usage) + incoming EdgeCalls + // (usage). UsageInCount must reflect ONLY the usage edge. + e5 := mkEdge("FileNode", "Mixed", graph.EdgeDefines) + e5.Line = 5 + s.AddEdge(e5) + e6 := mkEdge("Caller", "Mixed", graph.EdgeCalls) + e6.Line = 6 + s.AddEdge(e6) + + ids := []string{ + "Isolated", + "LeafSink", + "LeafSource", + "Alive", + "StructuralOnly", + "Mixed", + "unknown::id", + } + usage := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + rows := dc.NodeDegreeCounts(ids, usage) + + byID := make(map[string]graph.NodeDegreeRow, len(rows)) + for _, r := range rows { + byID[r.NodeID] = r + } + // Unknown id MUST be elided. + if _, ok := byID["unknown::id"]; ok { + t.Fatalf("NodeDegreeCounts must elide unknown ids, got row") + } + + type want struct{ in, out, usageIn int } + cases := map[string]want{ + "Isolated": {0, 0, 0}, + "LeafSink": {1, 0, 1}, + "LeafSource": {0, 1, 0}, + "Alive": {1, 0, 1}, + "StructuralOnly": {1, 0, 0}, + "Mixed": {2, 0, 1}, + } + for id, w := range cases { + got, ok := byID[id] + if !ok { + t.Errorf("missing row for %s", id) + continue + } + if got.InCount != w.in || got.OutCount != w.out || got.UsageInCount != w.usageIn { + t.Errorf("row %s = in=%d out=%d usage=%d, want in=%d out=%d usage=%d", + id, got.InCount, got.OutCount, got.UsageInCount, + w.in, w.out, w.usageIn) + } + } + + // Empty ids returns nil — never the whole graph. + if got := dc.NodeDegreeCounts(nil, usage); len(got) != 0 { + t.Fatalf("NodeDegreeCounts(nil) = %d, want 0", len(got)) + } + + // Empty usage kinds means UsageInCount is always 0 (totals + // still populated). + noUsage := dc.NodeDegreeCounts([]string{"Mixed"}, nil) + if len(noUsage) != 1 { + t.Fatalf("NodeDegreeCounts(Mixed, nil) = %d rows, want 1", len(noUsage)) + } + if noUsage[0].InCount != 2 || noUsage[0].UsageInCount != 0 { + t.Fatalf("NodeDegreeCounts(Mixed, nil) = in=%d usage=%d, want in=2 usage=0", + noUsage[0].InCount, noUsage[0].UsageInCount) + } +} + +// testNodeFanAggregator exercises the optional +// graph.NodeFanAggregator capability. Builds a small graph that +// exercises the per-direction kind filter independently: +// +// - Hub: high fan-in (Calls + References) AND high fan-out (Calls). +// - Leaf: zero fan in either direction. +// - ReadHeavy: incoming Reads only — fan-in must be 0 when the +// filter is Calls+References. +// - CallerOnly: outgoing Calls only — fan-out non-zero, fan-in 0. +// - Unknown id elided. +func testNodeFanAggregator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + fa, ok := s.(graph.NodeFanAggregator) + if !ok { + t.Skip("backend does not implement graph.NodeFanAggregator") + } + + s.AddNode(mkNode("Hub", "Hub", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Leaf", "Leaf", "a.go", graph.KindFunction)) + s.AddNode(mkNode("ReadHeavy", "ReadHeavy", "a.go", graph.KindFunction)) + s.AddNode(mkNode("CallerOnly", "CallerOnly", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Target1", "Target1", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Target2", "Target2", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Src1", "Src1", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Src2", "Src2", "a.go", graph.KindFunction)) + + // Hub: 2 incoming Calls + 1 incoming Reference + 2 outgoing + // Calls + 1 outgoing Reference. With fan-in=Calls+Refs and + // fan-out=Calls: fan_in=3, fan_out=2. + add := func(from, to string, kind graph.EdgeKind, line int) { + e := mkEdge(from, to, kind) + e.Line = line + s.AddEdge(e) + } + add("Src1", "Hub", graph.EdgeCalls, 1) + add("Src2", "Hub", graph.EdgeCalls, 2) + add("Src1", "Hub", graph.EdgeReferences, 3) + add("Hub", "Target1", graph.EdgeCalls, 4) + add("Hub", "Target2", graph.EdgeCalls, 5) + add("Hub", "Target1", graph.EdgeReferences, 6) + + // ReadHeavy: incoming Reads only. + add("Src1", "ReadHeavy", graph.EdgeReads, 7) + add("Src2", "ReadHeavy", graph.EdgeReads, 8) + + // CallerOnly: outgoing Calls only. + add("CallerOnly", "Target1", graph.EdgeCalls, 9) + + ids := []string{"Hub", "Leaf", "ReadHeavy", "CallerOnly", "unknown::id"} + rows := fa.NodeFanCounts(ids, + []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, + []graph.EdgeKind{graph.EdgeCalls}, + ) + + byID := make(map[string]graph.NodeFanRow, len(rows)) + for _, r := range rows { + byID[r.NodeID] = r + } + if _, ok := byID["unknown::id"]; ok { + t.Fatalf("NodeFanCounts must elide unknown ids, got row") + } + + type want struct{ in, out int } + cases := map[string]want{ + "Hub": {3, 2}, + "Leaf": {0, 0}, + "ReadHeavy": {0, 0}, + "CallerOnly": {0, 1}, + } + for id, w := range cases { + got, ok := byID[id] + if !ok { + t.Errorf("missing row for %s", id) + continue + } + if got.FanIn != w.in || got.FanOut != w.out { + t.Errorf("row %s = in=%d out=%d, want in=%d out=%d", + id, got.FanIn, got.FanOut, w.in, w.out) + } + } + + // Empty ids returns nil. + if got := fa.NodeFanCounts(nil, []graph.EdgeKind{graph.EdgeCalls}, nil); len(got) != 0 { + t.Fatalf("NodeFanCounts(nil) = %d, want 0", len(got)) + } + + // Empty kind sets → all-zero rows for known ids only. + zeros := fa.NodeFanCounts([]string{"Hub", "unknown::id"}, nil, nil) + if len(zeros) != 1 { + t.Fatalf("NodeFanCounts(empty kinds) = %d rows, want 1 (Hub only)", len(zeros)) + } + if zeros[0].NodeID != "Hub" || zeros[0].FanIn != 0 || zeros[0].FanOut != 0 { + t.Fatalf("NodeFanCounts(empty kinds) = %+v, want Hub/0/0", zeros[0]) + } +} + +// testFileImporters exercises the optional graph.FileImporters +// capability. Seeds two importing files (one production, one test) +// plus an unrelated import edge that targets a different file. The +// returned rows must include exactly the importers of the target +// file — both via the file-node ID and via the FilePath-on-symbol +// shape — and must not surface the unrelated edge. +func testFileImporters(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + fi, ok := s.(graph.FileImporters) + if !ok { + t.Skip("backend does not implement graph.FileImporters") + } + + // target file node + a symbol inside it. + s.AddNode(mkNode("pkg/target.go", "target.go", "pkg/target.go", graph.KindFile)) + s.AddNode(mkNode("TargetFunc", "TargetFunc", "pkg/target.go", graph.KindFunction)) + + // Two importing files: one production, one test. Each has an + // import edge — one targets the file node by id, the other + // targets a symbol inside the file (FilePath match path). + s.AddNode(mkNode("pkg/prod.go", "prod.go", "pkg/prod.go", graph.KindFile)) + s.AddNode(mkNode("pkg/test_test.go", "test_test.go", "pkg/test_test.go", graph.KindFile)) + + // And an unrelated importer that points elsewhere — must NOT + // surface in the results. + s.AddNode(mkNode("pkg/other.go", "other.go", "pkg/other.go", graph.KindFile)) + s.AddNode(mkNode("pkg/elsewhere.go", "elsewhere.go", "pkg/elsewhere.go", graph.KindFile)) + + s.AddEdge(mkEdge("pkg/prod.go", "pkg/target.go", graph.EdgeImports)) + s.AddEdge(mkEdge("pkg/test_test.go", "TargetFunc", graph.EdgeImports)) + s.AddEdge(mkEdge("pkg/other.go", "pkg/elsewhere.go", graph.EdgeImports)) + // A non-imports edge to the target file must also drop out. + s.AddEdge(mkEdge("pkg/prod.go", "TargetFunc", graph.EdgeCalls)) + + rows := fi.FileImporters("pkg/target.go") + got := make([]string, 0, len(rows)) + for _, r := range rows { + got = append(got, r.FromFile) + } + sort.Strings(got) + want := []string{"pkg/prod.go", "pkg/test_test.go"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("FileImporters = %v, want %v", got, want) + } + + if got := fi.FileImporters(""); len(got) != 0 { + t.Fatalf("FileImporters(empty) = %d rows, want 0", len(got)) + } + if got := fi.FileImporters("pkg/no_such.go"); len(got) != 0 { + t.Fatalf("FileImporters(unknown) = %d rows, want 0", len(got)) + } +} + +// testInEdgeCounter exercises the optional graph.InEdgeCounter +// capability. Seeds a small graph and asserts the per-To fan-in +// count matches what an AllEdges-bucketing loop would compute for +// the same edge-kind set. +func testInEdgeCounter(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ic, ok := s.(graph.InEdgeCounter) + if !ok { + t.Skip("backend does not implement graph.InEdgeCounter") + } + + s.AddNode(mkNode("A", "A", "a.go", graph.KindFunction)) + s.AddNode(mkNode("B", "B", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C", "C", "a.go", graph.KindFunction)) + s.AddNode(mkNode("T", "T", "a.go", graph.KindType)) + + // B is called twice (from A and C), referenced once (from A). + e1 := mkEdge("A", "B", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("C", "B", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("A", "B", graph.EdgeReferences) + e3.Line = 3 + // T is referenced once and held by an import edge that should + // not be counted under {calls,references}. + e4 := mkEdge("A", "T", graph.EdgeReferences) + e4.Line = 4 + e5 := mkEdge("A", "T", graph.EdgeImports) + e5.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + + got := ic.InEdgeCountsByKind([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}) + if got["B"] != 3 { + t.Fatalf("count[B] = %d, want 3", got["B"]) + } + if got["T"] != 1 { + t.Fatalf("count[T] = %d, want 1", got["T"]) + } + if _, ok := got["A"]; ok { + t.Fatalf("A should have zero matching incoming edges, got %d", got["A"]) + } + + // Empty kind list must return nil — never the whole graph. + if got := ic.InEdgeCountsByKind(nil); got != nil { + t.Fatalf("InEdgeCountsByKind(nil) = %v, want nil", got) + } + + // Single-kind filter dedups when callers pass duplicates. + got2 := ic.InEdgeCountsByKind([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeCalls}) + if got2["B"] != 2 { + t.Fatalf("count[B] (calls only, deduped) = %d, want 2", got2["B"]) + } +} + +// testNodesInFilesByKindFinder exercises the optional +// graph.NodesInFilesByKindFinder capability. Seeds a graph spanning +// three files and three kinds; the result must include only the +// requested-kind nodes whose FilePath sits in the requested file +// set. +func testNodesInFilesByKindFinder(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + fn, ok := s.(graph.NodesInFilesByKindFinder) + if !ok { + t.Skip("backend does not implement graph.NodesInFilesByKindFinder") + } + + // f1.go: function + method + type. + s.AddNode(mkNode("f1::F1", "F1", "f1.go", graph.KindFunction)) + s.AddNode(mkNode("f1::M1", "M1", "f1.go", graph.KindMethod)) + s.AddNode(mkNode("f1::T1", "T1", "f1.go", graph.KindType)) + // f2.go: function only. + s.AddNode(mkNode("f2::F2", "F2", "f2.go", graph.KindFunction)) + // f3.go: drops out of every result — not in the requested files. + s.AddNode(mkNode("f3::F3", "F3", "f3.go", graph.KindFunction)) + + got := fn.NodesInFilesByKind( + []string{"f1.go", "f2.go"}, + []graph.NodeKind{graph.KindFunction, graph.KindMethod}, + ) + gotIDs := sortNodeIDs(got) + want := []string{"f1::F1", "f1::M1", "f2::F2"} + if fmt.Sprint(gotIDs) != fmt.Sprint(want) { + t.Fatalf("NodesInFilesByKind = %v, want %v", gotIDs, want) + } + + // Empty files / kinds must return nil — never a whole-graph scan. + if got := fn.NodesInFilesByKind(nil, []graph.NodeKind{graph.KindFunction}); got != nil { + t.Fatalf("NodesInFilesByKind(nil files) = %v, want nil", got) + } + if got := fn.NodesInFilesByKind([]string{"f1.go"}, nil); got != nil { + t.Fatalf("NodesInFilesByKind(nil kinds) = %v, want nil", got) + } + + // Dedup: passing the same file / kind twice must not double-yield. + gotDup := fn.NodesInFilesByKind( + []string{"f1.go", "f1.go"}, + []graph.NodeKind{graph.KindType, graph.KindType}, + ) + if len(gotDup) != 1 || gotDup[0].ID != "f1::T1" { + t.Fatalf("NodesInFilesByKind(dup) = %v, want [f1::T1]", sortNodeIDs(gotDup)) + } +} + +// testEdgesByKindsScanner exercises the optional +// graph.EdgesByKindsScanner capability. Builds a small graph with a +// mix of edge kinds, then verifies the streaming filter returns +// exactly the union of the requested kinds in any order. Covers the +// edge cases that the edge-driven analyzers rely on: zero-match (no +// edge matches the requested kinds), empty filter (yields nothing — +// never a whole-table scan), and early stop honouring the iterator +// contract. +func testEdgesByKindsScanner(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + + s.AddNode(mkNode("a", "A", "x.go", graph.KindFunction)) + s.AddNode(mkNode("b", "B", "x.go", graph.KindFunction)) + s.AddNode(mkNode("c", "C", "y.go", graph.KindType)) + s.AddNode(mkNode("d", "D", "y.go", graph.KindField)) + + calls1 := mkEdge("a", "b", graph.EdgeCalls) + calls1.Line = 1 + calls2 := mkEdge("a", "b", graph.EdgeCalls) + calls2.Line = 2 + refs := mkEdge("a", "c", graph.EdgeReferences) + writes := mkEdge("a", "d", graph.EdgeWrites) + throws := mkEdge("a", "c", graph.EdgeThrows) + s.AddEdge(calls1) + s.AddEdge(calls2) + s.AddEdge(refs) + s.AddEdge(writes) + s.AddEdge(throws) + + es, ok := s.(graph.EdgesByKindsScanner) + if !ok { + t.Skip("backend does not implement graph.EdgesByKindsScanner") + } + + // Multi-kind: union of Calls + References must surface all three + // calls/refs edges; counts (not pointers) compared so the in-memory + // and disk backends agree without relying on edge identity. + counts := map[graph.EdgeKind]int{} + for e := range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}) { + counts[e.Kind]++ + } + if counts[graph.EdgeCalls] != 2 || counts[graph.EdgeReferences] != 1 { + t.Fatalf("EdgesByKinds(Calls,References) = %+v, want Calls:2 References:1", counts) + } + if got := len(counts); got != 2 { + t.Fatalf("EdgesByKinds(Calls,References) yielded %d distinct kinds, want 2", got) + } + + // Single-kind via the multi-kind path must match EdgesByKind. + single := 0 + for e := range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeWrites}) { + if e.Kind != graph.EdgeWrites { + t.Fatalf("EdgesByKinds(Writes) yielded kind=%s, want Writes", e.Kind) + } + single++ + } + if single != 1 { + t.Fatalf("EdgesByKinds(Writes) yielded %d, want 1", single) + } + + // Dedupe: repeating a kind must not double-yield. The backend's + // IN-list MUST collapse duplicates. + dup := 0 + for range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeCalls}) { + dup++ + } + if dup != 2 { + t.Fatalf("EdgesByKinds(Calls,Calls) yielded %d, want 2 (no double-yield)", dup) + } + + // Empty kinds yields nothing — never a whole-table scan. + empty := 0 + for range es.EdgesByKinds(nil) { + empty++ + } + if empty != 0 { + t.Fatalf("EdgesByKinds(nil) yielded %d, want 0", empty) + } + emptySlice := 0 + for range es.EdgesByKinds([]graph.EdgeKind{}) { + emptySlice++ + } + if emptySlice != 0 { + t.Fatalf("EdgesByKinds([]) yielded %d, want 0", emptySlice) + } + + // Empty string kinds get elided (matches dedupeEdgeKinds contract). + blank := 0 + for range es.EdgesByKinds([]graph.EdgeKind{"", "", ""}) { + blank++ + } + if blank != 0 { + t.Fatalf("EdgesByKinds(blank) yielded %d, want 0", blank) + } + + // Zero-match: a kind nothing in the graph uses yields nothing. + zero := 0 + for range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeKind("nonexistent")}) { + zero++ + } + if zero != 0 { + t.Fatalf("EdgesByKinds(nonexistent) yielded %d, want 0", zero) + } + + // Early stop honours the iterator contract. + stopped := 0 + for range es.EdgesByKinds([]graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}) { + stopped++ + if stopped == 1 { + break + } + } + if stopped != 1 { + t.Fatalf("early stop yielded %d before break, want 1", stopped) + } +} + +// testNodesByKindsScanner exercises the optional graph.NodesByKindsScanner +// capability. Seeds nodes of several kinds, including ones whose Meta +// holds the keys the metadata analyzers read, and asserts: +// - the IN-list returns exactly the union of the requested kinds +// (with nodes' Meta intact so post-filtering still works); +// - kinds the caller did not request never surface; +// - empty / nil kinds returns nil without scanning; +// - duplicate kinds in the input never duplicate the output. +// +// The Meta-preservation assertion is the load-bearing one: every +// downstream handler still runs its meta gate in Go after the kind +// pushdown, so the capability is worthless if Meta doesn't round-trip +// through the backend. +func testNodesByKindsScanner(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.NodesByKindsScanner) + if !ok { + t.Skip("backend does not implement graph.NodesByKindsScanner") + } + + // Two functions (one with coverage meta), one method, one type, + // one file (with cgo meta), one todo (with assignee meta), one + // table. Mix of meta-bearing and meta-bare nodes so the + // round-trip assertion covers both shapes. Meta values stay + // scalar — testMetaPreserved already covers flat round-trip, and + // the ladybug backend's gob encoder needs gob.Register for nested + // map shapes (out of scope for a kind-pushdown capability test). + fn1 := mkNode("pkg/a.go::Fn1", "Fn1", "pkg/a.go", graph.KindFunction) + fn1.Meta = map[string]any{ + "coverage_pct": 42.5, + "author_email": "alice@example.com", + } + fn2 := mkNode("pkg/a.go::Fn2", "Fn2", "pkg/a.go", graph.KindFunction) + method := mkNode("pkg/a.go::T.M", "M", "pkg/a.go", graph.KindMethod) + typ := mkNode("pkg/a.go::T", "T", "pkg/a.go", graph.KindType) + file := mkNode("pkg/a.go", "a.go", "pkg/a.go", graph.KindFile) + file.Meta = map[string]any{"uses_cgo": true} + todo := mkNode("pkg/a.go::TODO:7", "TODO", "pkg/a.go", graph.KindTodo) + todo.Meta = map[string]any{ + "tag": "TODO", + "assignee": "alice", + "text": "wire this up", + } + tbl := mkNode("table::users", "users", "schema/001.sql", graph.KindTable) + tbl.Meta = map[string]any{"table": "users", "dialect": "postgres"} + + for _, n := range []*graph.Node{fn1, fn2, method, typ, file, todo, tbl} { + s.AddNode(n) + } + + // Function + method — the stale_code/ownership/coverage default. + gotFnM := scan.NodesByKinds([]graph.NodeKind{graph.KindFunction, graph.KindMethod}) + wantFnM := []string{"pkg/a.go::Fn1", "pkg/a.go::Fn2", "pkg/a.go::T.M"} + if got := sortNodeIDs(gotFnM); fmt.Sprint(got) != fmt.Sprint(wantFnM) { + t.Fatalf("NodesByKinds(function,method) = %v, want %v", got, wantFnM) + } + + // Meta round-trip: pick up Fn1 and assert flat scalar meta survived. + var fn1Got *graph.Node + for _, n := range gotFnM { + if n.ID == "pkg/a.go::Fn1" { + fn1Got = n + break + } + } + if fn1Got == nil { + t.Fatalf("Fn1 missing from result") + } + if pct, _ := fn1Got.Meta["coverage_pct"].(float64); pct != 42.5 { + t.Fatalf("Fn1.Meta.coverage_pct = %v, want 42.5", fn1Got.Meta["coverage_pct"]) + } + if email, _ := fn1Got.Meta["author_email"].(string); email != "alice@example.com" { + t.Fatalf("Fn1.Meta.author_email = %q, want alice@example.com", email) + } + + // Single kind on a kind with meta — todo/file. + gotTodo := scan.NodesByKinds([]graph.NodeKind{graph.KindTodo}) + if len(gotTodo) != 1 || gotTodo[0].ID != "pkg/a.go::TODO:7" { + t.Fatalf("NodesByKinds(todo) = %v, want [pkg/a.go::TODO:7]", sortNodeIDs(gotTodo)) + } + if tag, _ := gotTodo[0].Meta["tag"].(string); tag != "TODO" { + t.Fatalf("Todo.Meta.tag = %q, want TODO", tag) + } + + gotFile := scan.NodesByKinds([]graph.NodeKind{graph.KindFile}) + if len(gotFile) != 1 || gotFile[0].ID != "pkg/a.go" { + t.Fatalf("NodesByKinds(file) = %v, want [pkg/a.go]", sortNodeIDs(gotFile)) + } + if cgo, _ := gotFile[0].Meta["uses_cgo"].(bool); !cgo { + t.Fatalf("File.Meta.uses_cgo = false, want true") + } + + // Table kind — for orphan/unreferenced analyzers. + gotTbl := scan.NodesByKinds([]graph.NodeKind{graph.KindTable}) + if len(gotTbl) != 1 || gotTbl[0].ID != "table::users" { + t.Fatalf("NodesByKinds(table) = %v, want [table::users]", sortNodeIDs(gotTbl)) + } + + // Empty / nil kinds — nil result, no scan. + if got := scan.NodesByKinds(nil); got != nil { + t.Fatalf("NodesByKinds(nil) = %v, want nil", got) + } + if got := scan.NodesByKinds([]graph.NodeKind{}); got != nil { + t.Fatalf("NodesByKinds([]) = %v, want nil", got) + } + + // Unknown kind — no rows, but still nil/empty, never the full table. + if got := scan.NodesByKinds([]graph.NodeKind{graph.NodeKind("no_such_kind")}); len(got) != 0 { + t.Fatalf("NodesByKinds(unknown) = %v, want 0 rows", got) + } + + // Dedup: passing the same kind twice must not double-yield. + gotDup := scan.NodesByKinds([]graph.NodeKind{graph.KindFunction, graph.KindFunction}) + wantDup := []string{"pkg/a.go::Fn1", "pkg/a.go::Fn2"} + if got := sortNodeIDs(gotDup); fmt.Sprint(got) != fmt.Sprint(wantDup) { + t.Fatalf("NodesByKinds(dup function) = %v, want %v", got, wantDup) + } +} + +// testEdgeKindCounter exercises the optional graph.EdgeKindCounter +// capability. Seeds a graph with several kinds in different +// frequencies and asserts the per-kind tally matches what an +// AllEdges()+map[kind]++ loop would compute. +func testEdgeKindCounter(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ek, ok := s.(graph.EdgeKindCounter) + if !ok { + t.Skip("backend does not implement graph.EdgeKindCounter") + } + + // Empty graph returns nil or empty — both are valid per the + // contract; callers must treat them the same. + if got := ek.EdgeKindCounts(); len(got) != 0 { + t.Fatalf("EdgeKindCounts(empty) = %v, want empty", got) + } + + s.AddNode(mkNode("A", "A", "a.go", graph.KindFunction)) + s.AddNode(mkNode("B", "B", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C", "C", "a.go", graph.KindFunction)) + s.AddNode(mkNode("f1", "a.go", "a.go", graph.KindFile)) + + // 3 calls, 2 references, 1 imports. + e1 := mkEdge("A", "B", graph.EdgeCalls) + e2 := mkEdge("A", "C", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("B", "C", graph.EdgeCalls) + e3.Line = 3 + e4 := mkEdge("A", "C", graph.EdgeReferences) + e4.Line = 4 + e5 := mkEdge("B", "C", graph.EdgeReferences) + e5.Line = 5 + e6 := mkEdge("A", "f1", graph.EdgeImports) + e6.Line = 6 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + s.AddEdge(e6) + + got := ek.EdgeKindCounts() + if got[graph.EdgeCalls] != 3 { + t.Fatalf("EdgeKindCounts[calls] = %d, want 3", got[graph.EdgeCalls]) + } + if got[graph.EdgeReferences] != 2 { + t.Fatalf("EdgeKindCounts[references] = %d, want 2", got[graph.EdgeReferences]) + } + if got[graph.EdgeImports] != 1 { + t.Fatalf("EdgeKindCounts[imports] = %d, want 1", got[graph.EdgeImports]) + } + // No extends edge was added; absence must produce 0 via the + // zero value (callers index with `m[k]`). + if got[graph.EdgeExtends] != 0 { + t.Fatalf("EdgeKindCounts[extends] = %d, want 0", got[graph.EdgeExtends]) + } +} + +// testCrossRepoEdgeAggregator exercises the optional +// graph.CrossRepoEdgeAggregator capability. Seeds a two-repo graph +// with one cross_repo_calls + one cross_repo_implements and two +// same-repo edges of other kinds. Asserts the per-triple counts and +// that single-repo edges drop out. +func testCrossRepoEdgeAggregator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ag, ok := s.(graph.CrossRepoEdgeAggregator) + if !ok { + t.Skip("backend does not implement graph.CrossRepoEdgeAggregator") + } + + // Empty graph -> nil. + if got := ag.CrossRepoEdgeCounts(); got != nil { + t.Fatalf("CrossRepoEdgeCounts(empty) = %v, want nil", got) + } + + s.AddNode(mkRepoNode("repoA::Caller", "Caller", "a/c.go", "repoA", graph.KindFunction)) + s.AddNode(mkRepoNode("repoA::Callee2", "Callee2", "a/d.go", "repoA", graph.KindFunction)) + s.AddNode(mkRepoNode("repoB::Callee", "Callee", "b/d.go", "repoB", graph.KindFunction)) + s.AddNode(mkRepoNode("repoB::Iface", "Iface", "b/i.go", "repoB", graph.KindType)) + s.AddNode(mkRepoNode("repoA::Impl", "Impl", "a/i.go", "repoA", graph.KindType)) + + // Two cross-repo edges to the same (kind, fromRepo, toRepo) + + // one cross-repo implements + one non-cross edge. + e1 := mkEdge("repoA::Caller", "repoB::Callee", graph.EdgeCrossRepoCalls) + e2 := mkEdge("repoA::Caller", "repoB::Callee", graph.EdgeCrossRepoCalls) + e2.Line = 2 + e3 := mkEdge("repoA::Impl", "repoB::Iface", graph.EdgeCrossRepoImplements) + e3.Line = 3 + e4 := mkEdge("repoA::Caller", "repoA::Callee2", graph.EdgeCalls) + e4.Line = 4 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + + rows := ag.CrossRepoEdgeCounts() + // Sort for stable assertions — capability output order is + // unspecified. + sort.Slice(rows, func(i, j int) bool { + if rows[i].Kind != rows[j].Kind { + return rows[i].Kind < rows[j].Kind + } + if rows[i].FromRepo != rows[j].FromRepo { + return rows[i].FromRepo < rows[j].FromRepo + } + return rows[i].ToRepo < rows[j].ToRepo + }) + if len(rows) != 2 { + t.Fatalf("CrossRepoEdgeCounts: got %d rows, want 2 (rows=%v)", len(rows), rows) + } + if rows[0].Kind != graph.EdgeCrossRepoCalls || rows[0].FromRepo != "repoA" || rows[0].ToRepo != "repoB" || rows[0].Count != 2 { + t.Fatalf("CrossRepoEdgeCounts[0] = %+v, want {cross_repo_calls,repoA,repoB,2}", rows[0]) + } + if rows[1].Kind != graph.EdgeCrossRepoImplements || rows[1].FromRepo != "repoA" || rows[1].ToRepo != "repoB" || rows[1].Count != 1 { + t.Fatalf("CrossRepoEdgeCounts[1] = %+v, want {cross_repo_implements,repoA,repoB,1}", rows[1]) + } +} + +// testFileImportAggregator exercises the optional +// graph.FileImportAggregator capability. Seeds a graph with several +// import edges and asserts the per-target-file counts. Covers both +// the unscoped and the scope-bound paths plus the file-node-by-ID +// vs symbol-FilePath import shapes. +func testFileImportAggregator(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ag, ok := s.(graph.FileImportAggregator) + if !ok { + t.Skip("backend does not implement graph.FileImportAggregator") + } + + if got := ag.FileImportCounts(nil); got != nil { + t.Fatalf("FileImportCounts(empty graph) = %v, want nil", got) + } + + // Two targets, three importing files, mixed shapes. + s.AddNode(mkNode("pkg/popular.go", "popular.go", "pkg/popular.go", graph.KindFile)) + s.AddNode(mkNode("PopularFn", "PopularFn", "pkg/popular.go", graph.KindFunction)) + s.AddNode(mkNode("pkg/lonely.go", "lonely.go", "pkg/lonely.go", graph.KindFile)) + s.AddNode(mkNode("pkg/a.go", "a.go", "pkg/a.go", graph.KindFile)) + s.AddNode(mkNode("pkg/b.go", "b.go", "pkg/b.go", graph.KindFile)) + s.AddNode(mkNode("pkg/c.go", "c.go", "pkg/c.go", graph.KindFile)) + + // pkg/popular.go imported by 3 files (two via file-id, one via symbol-FilePath). + s.AddEdge(mkEdge("pkg/a.go", "pkg/popular.go", graph.EdgeImports)) + s.AddEdge(mkEdge("pkg/b.go", "pkg/popular.go", graph.EdgeImports)) + s.AddEdge(mkEdge("pkg/c.go", "PopularFn", graph.EdgeImports)) + // pkg/lonely.go imported once. + s.AddEdge(mkEdge("pkg/a.go", "pkg/lonely.go", graph.EdgeImports)) + // A calls edge — must drop out of imports counts. + s.AddEdge(mkEdge("pkg/a.go", "PopularFn", graph.EdgeCalls)) + + rows := ag.FileImportCounts(nil) + got := map[string]int{} + for _, r := range rows { + got[r.FilePath] = r.Count + } + if got["pkg/popular.go"] != 3 { + t.Fatalf("FileImportCounts[popular.go] = %d, want 3", got["pkg/popular.go"]) + } + if got["pkg/lonely.go"] != 1 { + t.Fatalf("FileImportCounts[lonely.go] = %d, want 1", got["pkg/lonely.go"]) + } + + // Scope-bound: only count edges whose target is in the allow set. + scoped := ag.FileImportCounts([]string{"pkg/lonely.go"}) + if len(scoped) != 1 || scoped[0].FilePath != "pkg/lonely.go" || scoped[0].Count != 1 { + t.Fatalf("FileImportCounts(scope=lonely) = %v, want [lonely.go:1]", scoped) + } + + // Scope-bound with file-id + symbol shape both targeting popular. + scopedPop := ag.FileImportCounts([]string{"pkg/popular.go", "PopularFn"}) + gotPop := map[string]int{} + for _, r := range scopedPop { + gotPop[r.FilePath] = r.Count + } + if gotPop["pkg/popular.go"] != 3 { + t.Fatalf("FileImportCounts(scope=popular+sym) = %v, want popular.go:3", scopedPop) + } + + // Empty (non-nil) scope MUST return nil — never a whole-graph scan. + if got := ag.FileImportCounts([]string{}); got != nil { + t.Fatalf("FileImportCounts(empty scope) = %v, want nil", got) + } +} + +// testInDegreeForNodes exercises the optional graph.InDegreeForNodes +// capability. Seeds a tiny graph with three targets carrying 0 / 1 / 3 +// incoming edges (of mixed kinds) and asserts the counter returns the +// per-target count restricted to the caller's id set. +func testInDegreeForNodes(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ic, ok := s.(graph.InDegreeForNodes) + if !ok { + t.Skip("backend does not implement graph.InDegreeForNodes") + } + + s.AddNode(mkNode("Hub", "Hub", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Lonely", "Lonely", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Isolated", "Isolated", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C1", "C1", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C2", "C2", "a.go", graph.KindFunction)) + s.AddNode(mkNode("C3", "C3", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Outside", "Outside", "a.go", graph.KindFunction)) + + e1 := mkEdge("C1", "Hub", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("C2", "Hub", graph.EdgeReferences) + e2.Line = 2 + e3 := mkEdge("C3", "Hub", graph.EdgeReads) + e3.Line = 3 + e4 := mkEdge("C1", "Lonely", graph.EdgeCalls) + e4.Line = 4 + // One incoming edge that targets Outside — must NOT surface when + // Outside is absent from the caller's id list. + e5 := mkEdge("C2", "Outside", graph.EdgeCalls) + e5.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + + got := ic.InDegreeForNodes([]string{"Hub", "Lonely", "Isolated"}) + if got["Hub"] != 3 { + t.Fatalf("InDegreeForNodes[Hub] = %d, want 3", got["Hub"]) + } + if got["Lonely"] != 1 { + t.Fatalf("InDegreeForNodes[Lonely] = %d, want 1", got["Lonely"]) + } + // Isolated and Outside are absent — the contract drops zero-count + // targets from the map. + if _, ok := got["Isolated"]; ok { + t.Fatalf("InDegreeForNodes[Isolated] surfaced with value %d, want absent", got["Isolated"]) + } + if _, ok := got["Outside"]; ok { + t.Fatalf("InDegreeForNodes[Outside] surfaced — caller didn't ask for it") + } + + // Empty ids => nil (never a whole-table scan). + if got := ic.InDegreeForNodes(nil); got != nil { + t.Fatalf("InDegreeForNodes(nil) = %v, want nil", got) + } + if got := ic.InDegreeForNodes([]string{}); got != nil { + t.Fatalf("InDegreeForNodes(empty) = %v, want nil", got) + } + // Duplicated ids dedup naturally. + dup := ic.InDegreeForNodes([]string{"Hub", "Hub", "Hub"}) + if dup["Hub"] != 3 { + t.Fatalf("InDegreeForNodes(dup Hub) = %d, want 3", dup["Hub"]) + } +} + +// testReachableForwardByKinds exercises the optional +// graph.ReachableForwardByKinds capability. Seeds a small directed +// graph mixing allowed and disallowed edge kinds, then asserts the +// closure from the seed set is the transitive subset reachable +// through only the allowed kinds. +func testReachableForwardByKinds(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + rf, ok := s.(graph.ReachableForwardByKinds) + if !ok { + t.Skip("backend does not implement graph.ReachableForwardByKinds") + } + + // Layout: + // Test -> A (calls) + // A -> B (calls) + // B -> C (references) + // C -> D (reads) <-- disallowed kind: D unreachable + // X -> Y (calls) <-- disjoint subgraph: neither in closure + for _, id := range []string{"Test", "A", "B", "C", "D", "X", "Y"} { + s.AddNode(mkNode(id, id, "a.go", graph.KindFunction)) + } + e1 := mkEdge("Test", "A", graph.EdgeCalls) + e1.Line = 1 + e2 := mkEdge("A", "B", graph.EdgeCalls) + e2.Line = 2 + e3 := mkEdge("B", "C", graph.EdgeReferences) + e3.Line = 3 + e4 := mkEdge("C", "D", graph.EdgeReads) + e4.Line = 4 + e5 := mkEdge("X", "Y", graph.EdgeCalls) + e5.Line = 5 + s.AddEdge(e1) + s.AddEdge(e2) + s.AddEdge(e3) + s.AddEdge(e4) + s.AddEdge(e5) + + kinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + got := rf.ReachableForwardByKinds([]string{"Test"}, kinds) + want := map[string]bool{"Test": true, "A": true, "B": true, "C": true} + for id := range want { + if !got[id] { + t.Fatalf("ReachableForwardByKinds: missing %q in closure %v", id, got) + } + } + if got["D"] { + t.Fatalf("ReachableForwardByKinds: D should not be reachable (reads is disallowed)") + } + if got["X"] || got["Y"] { + t.Fatalf("ReachableForwardByKinds: disjoint subgraph leaked: %v", got) + } + + // Empty seeds => nil. + if got := rf.ReachableForwardByKinds(nil, kinds); got != nil { + t.Fatalf("ReachableForwardByKinds(nil) = %v, want nil", got) + } + // Empty kinds => seed set only. + zero := rf.ReachableForwardByKinds([]string{"Test"}, nil) + if !zero["Test"] || zero["A"] { + t.Fatalf("ReachableForwardByKinds(no kinds) = %v, want {Test:true}", zero) + } + // Duplicate seeds dedup naturally. + dup := rf.ReachableForwardByKinds([]string{"Test", "Test"}, kinds) + if !dup["Test"] || !dup["A"] || !dup["B"] || !dup["C"] { + t.Fatalf("ReachableForwardByKinds(dup seeds) = %v, want full closure", dup) + } +} + +// testThrowerErrorSurfacer exercises the optional +// graph.ThrowerErrorSurfacer capability. Seeds throwers with mixed +// error targets and EdgeEmits→KindString attachments, asserts the +// per-thrower row dedup + path-prefix filter both fire. +func testThrowerErrorSurfacer(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + ts, ok := s.(graph.ThrowerErrorSurfacer) + if !ok { + t.Skip("backend does not implement graph.ThrowerErrorSurfacer") + } + + // Throwers ThrowA (in pkg/keep/), ThrowB (in pkg/drop/). Targets + // ErrIO + ErrTimeout. ThrowA also emits two literal error_msg + // strings; one EdgeEmits goes to a non-error_msg context that + // must NOT surface in ErrorMsgs. + s.AddNode(mkNode("ThrowA", "ThrowA", "pkg/keep/a.go", graph.KindFunction)) + s.AddNode(mkNode("ThrowB", "ThrowB", "pkg/drop/b.go", graph.KindFunction)) + s.AddNode(mkNode("ErrIO", "ErrIO", "errors/io.go", graph.KindType)) + s.AddNode(mkNode("ErrTimeout", "ErrTimeout", "errors/io.go", graph.KindType)) + + msgOK1 := mkNode("msg1", "open failed", "pkg/keep/a.go", graph.KindString) + msgOK1.Meta = map[string]any{"context": "error_msg"} + s.AddNode(msgOK1) + msgOK2 := mkNode("msg2", "timeout", "pkg/keep/a.go", graph.KindString) + msgOK2.Meta = map[string]any{"context": "error_msg"} + s.AddNode(msgOK2) + // Wrong context — must be filtered out. + msgWrong := mkNode("msg3", "log line", "pkg/keep/a.go", graph.KindString) + msgWrong.Meta = map[string]any{"context": "log_msg"} + s.AddNode(msgWrong) + + // ThrowA throws ErrIO twice (dedup to one target) + ErrTimeout once. + e1 := mkEdge("ThrowA", "ErrIO", graph.EdgeThrows) + e1.FilePath = "pkg/keep/a.go" + e1.Line = 10 + e2 := mkEdge("ThrowA", "ErrIO", graph.EdgeThrows) + e2.FilePath = "pkg/keep/a.go" + e2.Line = 12 + e3 := mkEdge("ThrowA", "ErrTimeout", graph.EdgeThrows) + e3.FilePath = "pkg/keep/a.go" + e3.Line = 14 + // ThrowB throws ErrIO once. + e4 := mkEdge("ThrowB", "ErrIO", graph.EdgeThrows) + e4.FilePath = "pkg/drop/b.go" + e4.Line = 4 + // EdgeEmits attachments for ThrowA. + e5 := mkEdge("ThrowA", "msg1", graph.EdgeEmits) + e5.Line = 11 + e6 := mkEdge("ThrowA", "msg2", graph.EdgeEmits) + e6.Line = 13 + e7 := mkEdge("ThrowA", "msg3", graph.EdgeEmits) + e7.Line = 15 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5, e6, e7} { + s.AddEdge(e) + } + + rows := ts.ThrowerErrorSurface("") + byID := map[string]graph.ThrowerErrorRow{} + for _, r := range rows { + byID[r.ThrowerID] = r + } + + a, ok := byID["ThrowA"] + if !ok { + t.Fatalf("ThrowerErrorSurface: ThrowA missing from rows %v", rows) + } + if a.Throws != 3 { + t.Fatalf("ThrowA.Throws = %d, want 3", a.Throws) + } + gotTargets := append([]string(nil), a.ErrorTargets...) + sort.Strings(gotTargets) + if fmt.Sprint(gotTargets) != fmt.Sprint([]string{"ErrIO", "ErrTimeout"}) { + t.Fatalf("ThrowA.ErrorTargets = %v, want [ErrIO ErrTimeout]", gotTargets) + } + gotMsgs := append([]string(nil), a.ErrorMsgs...) + sort.Strings(gotMsgs) + if fmt.Sprint(gotMsgs) != fmt.Sprint([]string{"open failed", "timeout"}) { + t.Fatalf("ThrowA.ErrorMsgs = %v, want [open failed timeout]", gotMsgs) + } + + b, ok := byID["ThrowB"] + if !ok || b.Throws != 1 || len(b.ErrorTargets) != 1 || b.ErrorTargets[0] != "ErrIO" { + t.Fatalf("ThrowB row = %+v, want Throws=1 ErrorTargets=[ErrIO]", b) + } + if len(b.ErrorMsgs) != 0 { + t.Fatalf("ThrowB.ErrorMsgs = %v, want empty", b.ErrorMsgs) + } + + // Path-prefix filter drops ThrowB (under pkg/drop/) and keeps ThrowA. + keep := ts.ThrowerErrorSurface("pkg/keep/") + if len(keep) != 1 || keep[0].ThrowerID != "ThrowA" { + t.Fatalf("ThrowerErrorSurface(pkg/keep/) = %v, want only ThrowA", keep) + } + drop := ts.ThrowerErrorSurface("pkg/missing/") + if len(drop) != 0 { + t.Fatalf("ThrowerErrorSurface(pkg/missing/) = %v, want empty", drop) + } +} + +// testEdgeAdjacencyForKinds exercises the optional +// graph.EdgeAdjacencyForKinds capability. Seeds a graph mixing +// function/method/type nodes joined by Calls / References / Writes +// edges and asserts the iterator yields only (from, to) pairs whose +// edge kind is in the allowed set AND whose endpoints both fall in +// the allowed node-kind set. +func testEdgeAdjacencyForKinds(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.EdgeAdjacencyForKinds) + if !ok { + t.Skip("backend does not implement graph.EdgeAdjacencyForKinds") + } + + s.AddNode(mkNode("F1", "F1", "x.go", graph.KindFunction)) + s.AddNode(mkNode("F2", "F2", "x.go", graph.KindFunction)) + s.AddNode(mkNode("M1", "M1", "x.go", graph.KindMethod)) + s.AddNode(mkNode("T1", "T1", "y.go", graph.KindType)) + s.AddNode(mkNode("V1", "V1", "y.go", graph.KindVariable)) + + // F1 → F2 Calls (function→function, in-set) + e1 := mkEdge("F1", "F2", graph.EdgeCalls) + e1.Line = 1 + // F2 → M1 References (function→method, in-set) + e2 := mkEdge("F2", "M1", graph.EdgeReferences) + e2.Line = 2 + // F1 → T1 References (function→type, NOT in-set: T1 excluded) + e3 := mkEdge("F1", "T1", graph.EdgeReferences) + e3.Line = 3 + // T1 → F2 References (type→function, NOT in-set: T1 excluded) + e4 := mkEdge("T1", "F2", graph.EdgeReferences) + e4.Line = 4 + // M1 → F1 Writes (method→function, edge kind excluded) + e5 := mkEdge("M1", "F1", graph.EdgeWrites) + e5.Line = 5 + // F1 → V1 References (function→variable, NOT in-set: V1 excluded) + e6 := mkEdge("F1", "V1", graph.EdgeReferences) + e6.Line = 6 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5, e6} { + s.AddEdge(e) + } + + eKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + nKinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod} + + got := make(map[[2]string]int) + for pair := range scan.EdgeAdjacencyForKinds(eKinds, nKinds) { + got[pair]++ + } + want := map[[2]string]int{ + {"F1", "F2"}: 1, + {"F2", "M1"}: 1, + } + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("EdgeAdjacencyForKinds = %v, want %v", got, want) + } + + // Empty edge kinds yields nothing — never a whole-table scan. + empty := 0 + for range scan.EdgeAdjacencyForKinds(nil, nKinds) { + empty++ + } + if empty != 0 { + t.Fatalf("EdgeAdjacencyForKinds(nil edges) yielded %d, want 0", empty) + } + // Empty node kinds yields nothing. + for range scan.EdgeAdjacencyForKinds(eKinds, nil) { + empty++ + } + if empty != 0 { + t.Fatalf("EdgeAdjacencyForKinds(nil nodes) yielded %d, want 0", empty) + } + // Zero-match: edge kind absent from graph yields nothing. + zero := 0 + for range scan.EdgeAdjacencyForKinds([]graph.EdgeKind{graph.EdgeKind("nonexistent")}, nKinds) { + zero++ + } + if zero != 0 { + t.Fatalf("EdgeAdjacencyForKinds(nonexistent edge) yielded %d, want 0", zero) + } + // Node-kind filter actually narrows: asking only for {Type} drops every pair. + narrowed := 0 + for range scan.EdgeAdjacencyForKinds(eKinds, []graph.NodeKind{graph.KindType}) { + narrowed++ + } + if narrowed != 0 { + t.Fatalf("EdgeAdjacencyForKinds(Type only) yielded %d, want 0", narrowed) + } + // Early stop honours the iterator contract. + stopped := 0 + for range scan.EdgeAdjacencyForKinds(eKinds, nKinds) { + stopped++ + if stopped == 1 { + break + } + } + if stopped != 1 { + t.Fatalf("early stop yielded %d before break, want 1", stopped) + } +} + +// testCommunityCrossingsByKind exercises the optional +// graph.CommunityCrossingsByKind capability. Seeds a small graph +// with a known community partition and asserts per-source crossing +// counts match for: no edges, all-same-community, all-cross, mixed. +func testCommunityCrossingsByKind(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.CommunityCrossingsByKind) + if !ok { + t.Skip("backend does not implement graph.CommunityCrossingsByKind") + } + + s.AddNode(mkNode("A1", "A1", "x.go", graph.KindFunction)) + s.AddNode(mkNode("A2", "A2", "x.go", graph.KindFunction)) + s.AddNode(mkNode("B1", "B1", "y.go", graph.KindFunction)) + s.AddNode(mkNode("B2", "B2", "y.go", graph.KindFunction)) + s.AddNode(mkNode("C1", "C1", "z.go", graph.KindFunction)) + + // A1 → A2 Calls (same community A — NOT a crossing) + e1 := mkEdge("A1", "A2", graph.EdgeCalls) + e1.Line = 1 + // A1 → B1 Calls (A→B — crossing) + e2 := mkEdge("A1", "B1", graph.EdgeCalls) + e2.Line = 2 + // A1 → C1 References (A→C — crossing, second from A1) + e3 := mkEdge("A1", "C1", graph.EdgeReferences) + e3.Line = 3 + // B1 → B2 References (same community B — NOT a crossing) + e4 := mkEdge("B1", "B2", graph.EdgeReferences) + e4.Line = 4 + // B2 → C1 Calls (B→C — crossing) + e5 := mkEdge("B2", "C1", graph.EdgeCalls) + e5.Line = 5 + // A2 → B2 Writes (different community but edge kind excluded) + e6 := mkEdge("A2", "B2", graph.EdgeWrites) + e6.Line = 6 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5, e6} { + s.AddEdge(e) + } + + communities := map[string]string{ + "A1": "A", "A2": "A", + "B1": "B", "B2": "B", + "C1": "C", + } + kinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + + got := scan.CommunityCrossingsByKind(kinds, communities) + want := map[string]int{ + "A1": 2, // → B1 + → C1 + "B2": 1, // → C1 + } + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("CommunityCrossingsByKind(mixed) = %v, want %v", got, want) + } + + // All-same-community partition: no crossings at all. + same := map[string]string{ + "A1": "A", "A2": "A", "B1": "A", "B2": "A", "C1": "A", + } + if r := scan.CommunityCrossingsByKind(kinds, same); len(r) != 0 { + t.Fatalf("CommunityCrossingsByKind(all-same) = %v, want empty", r) + } + + // All-cross-community partition: every edge in scope is a crossing. + allCross := map[string]string{ + "A1": "1", "A2": "2", "B1": "3", "B2": "4", "C1": "5", + } + allGot := scan.CommunityCrossingsByKind(kinds, allCross) + allWant := map[string]int{ + "A1": 3, // A1 has 3 in-scope out-edges + "B1": 1, // B1 → B2 (now also a crossing) + "B2": 1, // B2 → C1 + } + if fmt.Sprint(allGot) != fmt.Sprint(allWant) { + t.Fatalf("CommunityCrossingsByKind(all-cross) = %v, want %v", allGot, allWant) + } + + // Empty kinds returns nil — never a whole-table scan. + if r := scan.CommunityCrossingsByKind(nil, communities); r != nil { + t.Fatalf("CommunityCrossingsByKind(nil kinds) = %v, want nil", r) + } + // Empty community map returns nil. + if r := scan.CommunityCrossingsByKind(kinds, nil); r != nil { + t.Fatalf("CommunityCrossingsByKind(nil comm) = %v, want nil", r) + } + // Kind absent from graph yields nil. + if r := scan.CommunityCrossingsByKind([]graph.EdgeKind{graph.EdgeKind("nonexistent")}, communities); r != nil { + t.Fatalf("CommunityCrossingsByKind(nonexistent) = %v, want nil", r) + } +} + +// testNodeIDsByKinds exercises the optional graph.NodeIDsByKinds +// capability. Seeds nodes of several kinds and asserts the +// projection returns just the IDs of the requested kinds, with +// duplicates collapsed and empty input returning nil. +func testNodeIDsByKinds(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.NodeIDsByKinds) + if !ok { + t.Skip("backend does not implement graph.NodeIDsByKinds") + } + + s.AddNode(mkNode("F1", "F1", "x.go", graph.KindFunction)) + s.AddNode(mkNode("F2", "F2", "x.go", graph.KindFunction)) + s.AddNode(mkNode("M1", "M1", "x.go", graph.KindMethod)) + s.AddNode(mkNode("T1", "T1", "y.go", graph.KindType)) + s.AddNode(mkNode("V1", "V1", "y.go", graph.KindVariable)) + + got := scan.NodeIDsByKinds([]graph.NodeKind{graph.KindFunction, graph.KindMethod}) + sort.Strings(got) + want := []string{"F1", "F2", "M1"} + if fmt.Sprint(got) != fmt.Sprint(want) { + t.Fatalf("NodeIDsByKinds(Function,Method) = %v, want %v", got, want) + } + + // Empty kinds returns nil. + if r := scan.NodeIDsByKinds(nil); r != nil { + t.Fatalf("NodeIDsByKinds(nil) = %v, want nil", r) + } + if r := scan.NodeIDsByKinds([]graph.NodeKind{}); r != nil { + t.Fatalf("NodeIDsByKinds(empty) = %v, want nil", r) + } + + // Blank kinds are elided. + if r := scan.NodeIDsByKinds([]graph.NodeKind{"", ""}); r != nil { + t.Fatalf("NodeIDsByKinds(blank) = %v, want nil", r) + } + + // Duplicates collapse — the IN-list must dedupe. + dup := scan.NodeIDsByKinds([]graph.NodeKind{graph.KindFunction, graph.KindFunction}) + sort.Strings(dup) + wantDup := []string{"F1", "F2"} + if fmt.Sprint(dup) != fmt.Sprint(wantDup) { + t.Fatalf("NodeIDsByKinds(Function,Function) = %v, want %v", dup, wantDup) + } + + // Kinds absent from the graph yield an empty slice (or nil). + miss := scan.NodeIDsByKinds([]graph.NodeKind{graph.KindInterface}) + if len(miss) != 0 { + t.Fatalf("NodeIDsByKinds(Interface) = %v, want empty", miss) + } +} + +// testMemberMethodsByType exercises the optional +// graph.MemberMethodsByType capability. Seeds a graph with multiple +// types, their methods, and a non-method EdgeMemberOf edge to verify +// the kind gate. +func testMemberMethodsByType(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.MemberMethodsByType) + if !ok { + t.Skip("backend does not implement graph.MemberMethodsByType") + } + + // Two types with method members + a noise field. + s.AddNode(mkNode("T1", "T1", "a.go", graph.KindType)) + s.AddNode(mkNode("T2", "T2", "b.go", graph.KindType)) + s.AddNode(mkNode("M1", "Foo", "a.go", graph.KindMethod)) + s.AddNode(mkNode("M2", "Bar", "a.go", graph.KindMethod)) + s.AddNode(mkNode("M3", "Foo", "b.go", graph.KindMethod)) + s.AddNode(mkNode("F1", "Field1", "a.go", graph.KindField)) + + s.AddEdge(mkEdge("M1", "T1", graph.EdgeMemberOf)) + s.AddEdge(mkEdge("M2", "T1", graph.EdgeMemberOf)) + s.AddEdge(mkEdge("M3", "T2", graph.EdgeMemberOf)) + // Non-method source — must NOT appear. + s.AddEdge(mkEdge("F1", "T1", graph.EdgeMemberOf)) + + got := scan.MemberMethodsByType() + t1Names := map[string]bool{} + for _, m := range got["T1"] { + t1Names[m.Name] = true + } + if !t1Names["Foo"] || !t1Names["Bar"] { + t.Fatalf("MemberMethodsByType T1 = %v, want {Foo, Bar}", got["T1"]) + } + if len(got["T1"]) != 2 { + t.Fatalf("MemberMethodsByType T1 size = %d, want 2", len(got["T1"])) + } + t2Names := map[string]bool{} + for _, m := range got["T2"] { + t2Names[m.Name] = true + } + if !t2Names["Foo"] || len(got["T2"]) != 1 { + t.Fatalf("MemberMethodsByType T2 = %v, want {Foo}", got["T2"]) + } + // Verify FilePath / StartLine columns are projected. + for _, m := range got["T1"] { + if m.MethodID == "" || m.FilePath == "" { + t.Fatalf("MemberMethodsByType T1 row missing columns: %+v", m) + } + } + + // Empty store returns nil. + empty := factory(t) + if r := empty.(graph.MemberMethodsByType).MemberMethodsByType(); r != nil { + t.Fatalf("MemberMethodsByType(empty) = %v, want nil", r) + } +} + +// testStructuralParentEdges exercises the optional +// graph.StructuralParentEdges capability. Seeds a mix of extends / +// implements / composes edges with varying endpoint kinds. +func testStructuralParentEdges(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.StructuralParentEdges) + if !ok { + t.Skip("backend does not implement graph.StructuralParentEdges") + } + + // Types / interfaces (in-set endpoints). + s.AddNode(mkNode("C1", "Child", "a.go", graph.KindType)) + s.AddNode(mkNode("P1", "Parent", "a.go", graph.KindType)) + s.AddNode(mkNode("I1", "Iface", "a.go", graph.KindInterface)) + // A method (NOT in-set). + s.AddNode(mkNode("M1", "Foo", "a.go", graph.KindMethod)) + + // In-set: type → type extends. + e1 := mkEdge("C1", "P1", graph.EdgeExtends) + e1.Line = 1 + e1.Origin = graph.OriginASTResolved + // In-set: type → interface implements. + e2 := mkEdge("C1", "I1", graph.EdgeImplements) + e2.Line = 2 + e2.Origin = graph.OriginASTInferred + // In-set: type → type composes. + e3 := mkEdge("C1", "P1", graph.EdgeComposes) + e3.Line = 3 + // OUT: extends with a method on one side. + e4 := mkEdge("M1", "P1", graph.EdgeExtends) + e4.Line = 4 + // OUT: irrelevant kind. + e5 := mkEdge("C1", "P1", graph.EdgeCalls) + e5.Line = 5 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5} { + s.AddEdge(e) + } + + rows := scan.StructuralParentEdges() + if len(rows) != 3 { + t.Fatalf("StructuralParentEdges len = %d, want 3 (rows=%v)", len(rows), rows) + } + // Verify origin propagation on the ast_resolved row. + var sawResolved, sawInferred bool + for _, r := range rows { + if r.FromID != "C1" { + t.Fatalf("unexpected FromID %q in row %v", r.FromID, r) + } + if r.FromKind != graph.KindType { + t.Fatalf("unexpected FromKind %q in row %v", r.FromKind, r) + } + if r.Origin == graph.OriginASTResolved { + sawResolved = true + } + if r.Origin == graph.OriginASTInferred { + sawInferred = true + } + } + if !sawResolved || !sawInferred { + t.Fatalf("origin not propagated: resolved=%v inferred=%v", sawResolved, sawInferred) + } + + // Empty graph returns nil/empty. + empty := factory(t) + if r := empty.(graph.StructuralParentEdges).StructuralParentEdges(); len(r) != 0 { + t.Fatalf("StructuralParentEdges(empty) = %v, want empty", r) + } +} + +// testCrossRepoCandidates exercises the optional +// graph.CrossRepoCandidates capability. Seeds same-repo and +// cross-repo edges and asserts only the distinct, non-empty +// repo-prefix pairs survive. +func testCrossRepoCandidates(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.CrossRepoCandidates) + if !ok { + t.Skip("backend does not implement graph.CrossRepoCandidates") + } + + // Repo A. + s.AddNode(mkRepoNode("A1", "fnA1", "a.go", "repoA", graph.KindFunction)) + s.AddNode(mkRepoNode("A2", "fnA2", "a.go", "repoA", graph.KindFunction)) + // Repo B. + s.AddNode(mkRepoNode("B1", "fnB1", "b.go", "repoB", graph.KindFunction)) + // No repo. + s.AddNode(mkNode("X1", "fnX1", "x.go", graph.KindFunction)) + + // Same-repo calls — must NOT appear. + e1 := mkEdge("A1", "A2", graph.EdgeCalls) + e1.Line = 1 + // Cross-repo call — in. + e2 := mkEdge("A1", "B1", graph.EdgeCalls) + e2.Line = 2 + // Cross-repo implements — in. + e3 := mkEdge("A1", "B1", graph.EdgeImplements) + e3.Line = 3 + // Cross-repo edge but kind not in baseKinds — out. + e4 := mkEdge("A1", "B1", graph.EdgeReferences) + e4.Line = 4 + // Either endpoint missing repo — out. + e5 := mkEdge("A1", "X1", graph.EdgeCalls) + e5.Line = 5 + for _, e := range []*graph.Edge{e1, e2, e3, e4, e5} { + s.AddEdge(e) + } + + kinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeImplements, graph.EdgeExtends} + rows := scan.CrossRepoCandidates(kinds) + if len(rows) != 2 { + t.Fatalf("CrossRepoCandidates len = %d, want 2 (rows=%v)", len(rows), rows) + } + for _, r := range rows { + if r.FromRepo != "repoA" || r.ToRepo != "repoB" { + t.Fatalf("unexpected repos in row %v", r) + } + if r.Edge == nil || r.Edge.From != "A1" || r.Edge.To != "B1" { + t.Fatalf("unexpected edge in row %v", r) + } + } + + // Empty kinds returns nil — never a whole-table scan. + if r := scan.CrossRepoCandidates(nil); r != nil { + t.Fatalf("CrossRepoCandidates(nil) = %v, want nil", r) + } +} + +// testExtractCandidates exercises the optional +// graph.ExtractCandidatesScanner capability. Builds a graph with +// three functions: +// - Long+Hot: long body, 3 distinct callers, 6 distinct callees +// (passes every threshold). +// - Long+Cold: long body, 1 caller, 6 callees (fails minCallers). +// - Short+Hot: short body, 3 callers, 6 callees (fails minLines). +func testExtractCandidates(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.ExtractCandidatesScanner) + if !ok { + t.Skip("backend does not implement graph.ExtractCandidatesScanner") + } + + mk := func(id string, kind graph.NodeKind, start, end int) *graph.Node { + n := mkNode(id, id, "p/a.go", kind) + n.StartLine = start + n.EndLine = end + return n + } + s.AddNode(mk("LongHot", graph.KindFunction, 1, 60)) + s.AddNode(mk("LongCold", graph.KindFunction, 100, 160)) + s.AddNode(mk("ShortHot", graph.KindFunction, 200, 205)) + // Callers + callees as plain function nodes. + for i := 0; i < 6; i++ { + c := mkNode(fmt.Sprintf("C%d", i), fmt.Sprintf("C%d", i), "p/c.go", graph.KindFunction) + s.AddNode(c) + t := mkNode(fmt.Sprintf("T%d", i), fmt.Sprintf("T%d", i), "p/t.go", graph.KindFunction) + s.AddNode(t) + } + // LongHot: 3 distinct callers, 6 distinct callees. + for i := 0; i < 3; i++ { + e := mkEdge(fmt.Sprintf("C%d", i), "LongHot", graph.EdgeCalls) + e.Line = i + 1 + s.AddEdge(e) + } + for i := 0; i < 6; i++ { + e := mkEdge("LongHot", fmt.Sprintf("T%d", i), graph.EdgeCalls) + e.Line = 100 + i + s.AddEdge(e) + } + // LongCold: 1 caller, 6 callees. + e := mkEdge("C0", "LongCold", graph.EdgeCalls) + e.Line = 200 + s.AddEdge(e) + for i := 0; i < 6; i++ { + e := mkEdge("LongCold", fmt.Sprintf("T%d", i), graph.EdgeCalls) + e.Line = 300 + i + s.AddEdge(e) + } + // ShortHot: 3 callers, 6 callees but too short. + for i := 0; i < 3; i++ { + e := mkEdge(fmt.Sprintf("C%d", i), "ShortHot", graph.EdgeCalls) + e.Line = 400 + i + s.AddEdge(e) + } + for i := 0; i < 6; i++ { + e := mkEdge("ShortHot", fmt.Sprintf("T%d", i), graph.EdgeCalls) + e.Line = 500 + i + s.AddEdge(e) + } + + rows := scan.ExtractCandidates( + []graph.EdgeKind{graph.EdgeCalls}, + 20, // minLines + 2, // minCallers + 5, // minFanOut + "", // no prefix + ) + byID := make(map[string]graph.ExtractCandidateRow) + for _, r := range rows { + byID[r.NodeID] = r + } + r, ok := byID["LongHot"] + if !ok { + t.Fatalf("expected LongHot in result, got %v", rows) + } + if r.CallerCount != 3 || r.FanOut != 6 || r.LineCount != 60 { + t.Fatalf("LongHot row mismatch: %+v", r) + } + if _, present := byID["LongCold"]; present { + t.Fatalf("LongCold should have been filtered (caller count < 2)") + } + if _, present := byID["ShortHot"]; present { + t.Fatalf("ShortHot should have been filtered (lines < 20)") + } + + // Path prefix narrows to only LongHot (it's the one in p/a.go; + // LongCold and ShortHot also are in p/a.go so use a prefix that + // doesn't match). + none := scan.ExtractCandidates( + []graph.EdgeKind{graph.EdgeCalls}, 20, 2, 5, "no/such/", + ) + if len(none) != 0 { + t.Fatalf("ExtractCandidates with non-matching prefix = %d, want 0", len(none)) + } + // Empty kinds returns nil. + if r := scan.ExtractCandidates(nil, 0, 0, 0, ""); r != nil { + t.Fatalf("ExtractCandidates(nil kinds) = %v, want nil", r) + } +} + +// testFileSymbolNamesByPaths exercises the optional +// graph.FileSymbolNamesByPaths capability. +func testFileSymbolNamesByPaths(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.FileSymbolNamesByPaths) + if !ok { + t.Skip("backend does not implement graph.FileSymbolNamesByPaths") + } + + s.AddNode(mkNode("Alpha", "Alpha", "a.go", graph.KindFunction)) + s.AddNode(mkNode("Beta", "Beta", "a.go", graph.KindType)) + s.AddNode(mkNode("Gamma", "Gamma", "a.go", graph.KindMethod)) + s.AddNode(mkNode("LowCardField", "LowCardField", "a.go", graph.KindField)) + s.AddNode(mkNode("Delta", "Delta", "b.go", graph.KindFunction)) + + rows := scan.FileSymbolNamesByPaths( + []string{"a.go", "b.go"}, + []graph.NodeKind{graph.KindFunction, graph.KindMethod, graph.KindType, graph.KindInterface}, + ) + byFile := make(map[string]map[string]struct{}) + for _, r := range rows { + seen := byFile[r.FilePath] + if seen == nil { + seen = make(map[string]struct{}) + byFile[r.FilePath] = seen + } + seen[r.Name] = struct{}{} + } + want := map[string]map[string]struct{}{ + "a.go": {"Alpha": {}, "Beta": {}, "Gamma": {}}, + "b.go": {"Delta": {}}, + } + for file, names := range want { + got := byFile[file] + if len(got) != len(names) { + t.Fatalf("file %q: got %v, want %v", file, got, names) + } + for n := range names { + if _, ok := got[n]; !ok { + t.Errorf("file %q: missing name %q (got %v)", file, n, got) + } + } + } + // LowCardField (KindField) must not appear because it's not in + // the requested kinds. + if _, ok := byFile["a.go"]["LowCardField"]; ok { + t.Fatalf("kind filter leaked KindField row") + } + + // Empty paths returns nil. + if r := scan.FileSymbolNamesByPaths(nil, nil); r != nil { + t.Fatalf("FileSymbolNamesByPaths(nil) = %v, want nil", r) + } +} + +// testClassHierarchyTraverser exercises the optional +// graph.ClassHierarchyTraverser capability. +func testClassHierarchyTraverser(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.ClassHierarchyTraverser) + if !ok { + t.Skip("backend does not implement graph.ClassHierarchyTraverser") + } + + s.AddNode(mkNode("Animal", "Animal", "z.go", graph.KindInterface)) + s.AddNode(mkNode("Dog", "Dog", "z.go", graph.KindType)) + s.AddNode(mkNode("Puppy", "Puppy", "z.go", graph.KindType)) + // Dog implements Animal; Puppy extends Dog. + e1 := mkEdge("Dog", "Animal", graph.EdgeImplements) + e1.Line = 1 + s.AddEdge(e1) + e2 := mkEdge("Puppy", "Dog", graph.EdgeExtends) + e2.Line = 2 + s.AddEdge(e2) + + upRows := scan.ClassHierarchyTraverse( + "Puppy", "up", + []graph.EdgeKind{graph.EdgeExtends, graph.EdgeImplements, graph.EdgeComposes}, + 5, + ) + if len(upRows) != 2 { + t.Fatalf("Puppy up: %d rows, want 2 (Dog, Animal). rows=%v", len(upRows), upRows) + } + visited := map[string]bool{} + for _, r := range upRows { + for _, id := range r.Path { + visited[id] = true + } + } + if !visited["Dog"] || !visited["Animal"] { + t.Fatalf("Puppy up: missing Dog or Animal in visited set: %v", visited) + } + downRows := scan.ClassHierarchyTraverse( + "Animal", "down", + []graph.EdgeKind{graph.EdgeExtends, graph.EdgeImplements, graph.EdgeComposes}, + 5, + ) + visited = map[string]bool{} + for _, r := range downRows { + for _, id := range r.Path { + visited[id] = true + } + } + if !visited["Dog"] || !visited["Puppy"] { + t.Fatalf("Animal down: missing Dog or Puppy in visited set: %v", visited) + } + + // Empty kinds / depth=0 / unknown seed must return nil. + if r := scan.ClassHierarchyTraverse("Puppy", "up", nil, 5); r != nil { + t.Fatalf("nil kinds: got %v", r) + } + if r := scan.ClassHierarchyTraverse("Puppy", "up", + []graph.EdgeKind{graph.EdgeExtends}, 0); r != nil { + t.Fatalf("depth=0: got %v", r) + } + if r := scan.ClassHierarchyTraverse("nope", "up", + []graph.EdgeKind{graph.EdgeExtends}, 5); r != nil { + t.Fatalf("unknown seed: got %v", r) + } +} + +// testFileEditingContext exercises the optional +// graph.FileEditingContext capability. +func testFileEditingContext(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.FileEditingContext) + if !ok { + t.Skip("backend does not implement graph.FileEditingContext") + } + // File node + two functions inside it; an importing file with one + // function that calls into the file; a downstream file with a + // function the file's function calls. + s.AddNode(mkNode("a.go", "a.go", "a.go", graph.KindFile)) + s.AddNode(mkNode("a.go::Foo", "Foo", "a.go", graph.KindFunction)) + s.AddNode(mkNode("a.go::Bar", "Bar", "a.go", graph.KindMethod)) + s.AddNode(mkNode("b.go", "b.go", "b.go", graph.KindFile)) + s.AddNode(mkNode("b.go::Caller", "Caller", "b.go", graph.KindFunction)) + s.AddNode(mkNode("c.go::Callee", "Callee", "c.go", graph.KindFunction)) + + // Import edge: a.go imports b.go. + e := mkEdge("a.go", "b.go", graph.EdgeImports) + e.Line = 1 + s.AddEdge(e) + // Caller in b.go calls Foo in a.go. + e = mkEdge("b.go::Caller", "a.go::Foo", graph.EdgeCalls) + e.Line = 2 + s.AddEdge(e) + // Foo in a.go calls Callee in c.go. + e = mkEdge("a.go::Foo", "c.go::Callee", graph.EdgeCalls) + e.Line = 3 + s.AddEdge(e) + + res := scan.FileEditingContext("a.go", []graph.NodeKind{graph.KindFunction, graph.KindMethod}) + if res == nil { + t.Fatalf("FileEditingContext returned nil for a.go") + } + if res.FileNode == nil || res.FileNode.ID != "a.go" { + t.Fatalf("FileNode missing or wrong: %+v", res.FileNode) + } + defineIDs := map[string]bool{} + for _, n := range res.Defines { + defineIDs[n.ID] = true + } + if !defineIDs["a.go::Foo"] || !defineIDs["a.go::Bar"] { + t.Fatalf("defines missing entries: got %v", defineIDs) + } + if len(res.Imports) != 1 || res.Imports[0].To != "b.go" { + t.Fatalf("imports = %v, want one edge a.go→b.go", res.Imports) + } + calledBy := map[string]bool{} + for _, n := range res.CalledBy { + calledBy[n.ID] = true + } + if !calledBy["b.go::Caller"] { + t.Fatalf("called_by missing Caller: %v", calledBy) + } + calls := map[string]bool{} + for _, n := range res.Calls { + calls[n.ID] = true + } + if !calls["c.go::Callee"] { + t.Fatalf("calls missing Callee: %v", calls) + } + + // Empty path returns nil. + if r := scan.FileEditingContext("", nil); r != nil { + t.Fatalf("empty path: got %v, want nil", r) + } +} + +// testNodeDegreeByKinds exercises the optional +// graph.NodeDegreeByKinds capability. +func testNodeDegreeByKinds(t *testing.T, factory Factory) { + t.Helper() + s := factory(t) + scan, ok := s.(graph.NodeDegreeByKinds) + if !ok { + t.Skip("backend does not implement graph.NodeDegreeByKinds") + } + s.AddNode(mkNode("Iso", "Iso", "pkg/iso.go", graph.KindFunction)) + s.AddNode(mkNode("Hub", "Hub", "pkg/hub.go", graph.KindFunction)) + s.AddNode(mkNode("Leaf", "Leaf", "pkg/leaf.go", graph.KindMethod)) + s.AddNode(mkNode("Other", "Other", "pkg/other.go", graph.KindType)) + s.AddNode(mkNode("Caller", "Caller", "pkg/caller.go", graph.KindFunction)) + // 2 incoming + 1 outgoing on Hub. + for i, from := range []string{"Caller", "Leaf"} { + e := mkEdge(from, "Hub", graph.EdgeCalls) + e.Line = i + 1 + s.AddEdge(e) + } + e := mkEdge("Hub", "Leaf", graph.EdgeCalls) + e.Line = 3 + s.AddEdge(e) + + rows := scan.NodeDegreeByKinds( + []graph.NodeKind{graph.KindFunction, graph.KindMethod}, + "", + ) + byID := make(map[string]graph.NodeDegreeRow) + for _, r := range rows { + byID[r.NodeID] = r + } + if got := byID["Hub"]; got.InCount != 2 || got.OutCount != 1 { + t.Fatalf("Hub: %+v, want in=2 out=1", got) + } + if got, ok := byID["Iso"]; !ok || got.InCount != 0 || got.OutCount != 0 { + t.Fatalf("Iso: ok=%v got=%+v, want in=0 out=0", ok, got) + } + if _, ok := byID["Other"]; ok { + t.Fatalf("Other (KindType) leaked into kind-filtered result") + } + // Empty kinds returns nil. + if r := scan.NodeDegreeByKinds(nil, ""); r != nil { + t.Fatalf("NodeDegreeByKinds(nil) = %v, want nil", r) + } + // Path prefix narrows. + rows = scan.NodeDegreeByKinds( + []graph.NodeKind{graph.KindFunction, graph.KindMethod}, + "pkg/leaf", + ) + if len(rows) != 1 || rows[0].NodeID != "Leaf" { + t.Fatalf("pathPrefix scope mismatch: got %v", rows) + } +} diff --git a/internal/graph/stub.go b/internal/graph/stub.go new file mode 100644 index 00000000..c4d8a464 --- /dev/null +++ b/internal/graph/stub.go @@ -0,0 +1,203 @@ +package graph + +import "strings" + +// Stub-node identifier conventions. +// +// A "stub" is a placeholder Node the resolver materialises for a +// symbol the indexer can see referenced but not defined in the +// current repo's source: a stdlib call, a language builtin, an +// external module import, etc. Stubs let the graph hold edges +// to "external" targets uniformly with edges to first-party +// nodes. +// +// Format (all stubs): +// +// :::: +// +// where: +// +// repoPrefix — the owning repo's RepoPrefix (Indexer.RepoPrefix). +// Empty only when the stub is created outside a +// per-repo context (legacy single-repo daemons). +// kind — one of: stdlib, builtin, external_call, module. +// rest — kind-specific (e.g. "fmt::Errorf" for stdlib). +// +// Why per-repo? Two repos pinned to different language SDK +// versions have semantically distinct stdlib symbols. Go 1.21's +// `min` is a builtin; in 1.20 it isn't. A global `builtin::go::min` +// node would conflate them and produce wrong cross-repo edges. +// Per-repo prefix keeps them as distinct nodes; a future +// "same-as" edge can union them when the workspace knows the +// versions actually match. +const ( + StubKindStdlib = "stdlib" + StubKindBuiltin = "builtin" + StubKindExternalCall = "external_call" + StubKindModule = "module" +) + +// StubID composes a stub identifier with the per-repo prefix. +// Pass repoPrefix = "" when the caller is outside a per-repo +// context (single-repo daemons that haven't set a prefix). +func StubID(repoPrefix, kind string, parts ...string) string { + var b strings.Builder + if repoPrefix != "" { + b.WriteString(repoPrefix) + b.WriteString("::") + } + b.WriteString(kind) + for _, p := range parts { + b.WriteString("::") + b.WriteString(p) + } + return b.String() +} + +// IsStub reports whether id is any stub kind. Cheaper than +// StubKind when callers only need a yes/no. +func IsStub(id string) bool { + return StubKind(id) != "" +} + +// StubKind extracts the stub category (stdlib / builtin / +// external_call / module) from id. Returns "" if id is not a +// stub. +// +// Format dispatch: +// - "::" — legacy, no repo prefix +// - "::::" — per-repo prefix +// +// We match by looking for one of the known kind segments +// anywhere in the first two "::"-separated positions. +func StubKind(id string) string { + for _, k := range stubKinds { + // Without repo prefix: "::..." + if strings.HasPrefix(id, k+"::") { + return k + } + } + // With repo prefix: "::::..." + // Find the second "::" segment. + first := strings.Index(id, "::") + if first < 0 { + return "" + } + rest := id[first+2:] + for _, k := range stubKinds { + if strings.HasPrefix(rest, k+"::") { + return k + } + } + return "" +} + +// stubKinds is the closed set of stub categories. Ordered by +// expected frequency so the lookup loop bails early in the +// common case. +var stubKinds = []string{ + StubKindStdlib, + StubKindExternalCall, + StubKindBuiltin, + StubKindModule, +} + +// IsStdlibStub etc are convenience predicates that don't make +// the caller compare StubKind's return against a literal. +func IsStdlibStub(id string) bool { return StubKind(id) == StubKindStdlib } +func IsBuiltinStub(id string) bool { return StubKind(id) == StubKindBuiltin } +func IsExternalCallStub(id string) bool { return StubKind(id) == StubKindExternalCall } +func IsModuleStub(id string) bool { return StubKind(id) == StubKindModule } + +// StubRest returns the kind-specific tail of a stub id (the +// portion after "::::" or "::"). Returns "" if +// id is not a stub. Useful for the "fmt::Errorf" portion of a +// stdlib stub when callers need to inspect the symbol identity. +func StubRest(id string) string { + kind := StubKind(id) + if kind == "" { + return "" + } + prefix := kind + "::" + if idx := strings.Index(id, prefix); idx >= 0 { + return id[idx+len(prefix):] + } + return "" +} + +// UnresolvedMarker is the prefix the extractor emits for a call/ +// reference target the resolver still needs to bind to a concrete +// Node. +// +// Forms: +// +// unresolved::Name — legacy / single-repo +// ::unresolved::Name — multi-repo COPY rewrite (in +// copyBulkLocked, to dodge +// cross-repo PK collisions) +// +// IsUnresolvedTarget / UnresolvedName / UnresolvedRepoPrefix +// normalise over both shapes so callers (resolver, MCP filters, +// data-flow tracker) don't have to know the encoding. +const UnresolvedMarker = "unresolved::" + +// IsUnresolvedTarget reports whether id names an unresolved +// extractor stub in either the bare or the multi-repo form. +func IsUnresolvedTarget(id string) bool { + if id == "" { + return false + } + if strings.HasPrefix(id, UnresolvedMarker) { + return true + } + return strings.Contains(id, "::"+UnresolvedMarker) +} + +// UnresolvedName returns the bare symbol name encoded in an +// unresolved target id, stripping the `unresolved::` prefix (and +// any leading `::`). Returns "" when id is not an +// unresolved stub. +func UnresolvedName(id string) string { + if id == "" { + return "" + } + if strings.HasPrefix(id, UnresolvedMarker) { + return id[len(UnresolvedMarker):] + } + idx := strings.Index(id, "::"+UnresolvedMarker) + if idx < 0 { + return "" + } + return id[idx+len("::"+UnresolvedMarker):] +} + +// UnresolvedRepoPrefix returns the per-repo prefix encoded in an +// unresolved target id, or "" if the id is bare or not an +// unresolved stub. +func UnresolvedRepoPrefix(id string) string { + if id == "" || strings.HasPrefix(id, UnresolvedMarker) { + return "" + } + idx := strings.Index(id, "::"+UnresolvedMarker) + if idx <= 0 { + return "" + } + return id[:idx] +} + +// StubRepoPrefix returns the per-repo prefix of a stub id, or +// "" if the id has no prefix or isn't a stub. +func StubRepoPrefix(id string) string { + kind := StubKind(id) + if kind == "" { + return "" + } + // If id starts with the kind directly, there's no repo prefix. + if strings.HasPrefix(id, kind+"::") { + return "" + } + if idx := strings.Index(id, "::"); idx > 0 { + return id[:idx] + } + return "" +} diff --git a/internal/graph/unresolved_helpers_test.go b/internal/graph/unresolved_helpers_test.go new file mode 100644 index 00000000..bf494a54 --- /dev/null +++ b/internal/graph/unresolved_helpers_test.go @@ -0,0 +1,45 @@ +package graph + +import "testing" + +// TestUnresolvedHelpers locks in the multi-repo unresolved target +// normalisation: a literal `unresolved::Foo` (legacy single-repo) and +// a per-repo `gortex::unresolved::Foo` (multi-repo COPY rewrite) must +// both be recognised by IsUnresolvedTarget and decoded to "Foo" by +// UnresolvedName. Pre-fix, every caller used strings.HasPrefix on the +// literal form, which silently missed the prefixed form and left +// every multi-repo call edge dangling. +func TestUnresolvedHelpers(t *testing.T) { + t.Parallel() + + cases := []struct { + id string + isU bool + name string + prefix string + }{ + // Legacy / single-repo form + {"unresolved::AddNode", true, "AddNode", ""}, + {"unresolved::*.Foo", true, "*.Foo", ""}, + {"unresolved::import::fmt", true, "import::fmt", ""}, + // Multi-repo COPY-rewrite form + {"gortex::unresolved::AddNode", true, "AddNode", "gortex"}, + {"tree-sitter-dart::unresolved::ACCEPT_TOKEN", true, "ACCEPT_TOKEN", "tree-sitter-dart"}, + // Non-stubs + {"gortex/internal/graph/graph.go::Graph.AddNode", false, "", ""}, + {"", false, "", ""}, + {"stdlib::fmt::Errorf", false, "", ""}, + {"gortex::stdlib::fmt::Errorf", false, "", ""}, + } + for _, c := range cases { + if got := IsUnresolvedTarget(c.id); got != c.isU { + t.Errorf("IsUnresolvedTarget(%q) = %v, want %v", c.id, got, c.isU) + } + if got := UnresolvedName(c.id); got != c.name { + t.Errorf("UnresolvedName(%q) = %q, want %q", c.id, got, c.name) + } + if got := UnresolvedRepoPrefix(c.id); got != c.prefix { + t.Errorf("UnresolvedRepoPrefix(%q) = %q, want %q", c.id, got, c.prefix) + } + } +} diff --git a/internal/hooks/probe_e2e_test.go b/internal/hooks/probe_e2e_test.go index 9f54422a..139c6bc3 100644 --- a/internal/hooks/probe_e2e_test.go +++ b/internal/hooks/probe_e2e_test.go @@ -38,6 +38,12 @@ func (f *fakeController) Shutdown(_ context.Context) error { return nil } func (f *fakeController) SearchSymbols(_ context.Context, _ daemon.SearchSymbolsParams) (daemon.SearchSymbolsResult, error) { return daemon.SearchSymbolsResult{Hits: f.hits}, nil } +func (f *fakeController) EnrichChurn(_ context.Context, _ daemon.EnrichChurnParams) (daemon.EnrichChurnResult, error) { + return daemon.EnrichChurnResult{}, nil +} +func (f *fakeController) EnrichReleases(_ context.Context, _ daemon.EnrichReleasesParams) (daemon.EnrichReleasesResult, error) { + return daemon.EnrichReleasesResult{}, nil +} // startTestDaemon spins up a real daemon on a short-path unix socket and // points GORTEX_DAEMON_SOCKET at it so daemon.Dial finds it. diff --git a/internal/indexer/clones.go b/internal/indexer/clones.go index dd2de4a5..0524e1e2 100644 --- a/internal/indexer/clones.go +++ b/internal/indexer/clones.go @@ -234,7 +234,7 @@ func bodyText(lines []string, startLine, endLine int) string { // (deletes clone_shingles, sets clone_sig) across nodes that other // graph-wide passes (markTestSymbolsAndEmitEdges, ResolveTemporalCalls, // reach.BuildIndex) also touch under the same mutex. -func finaliseCloneSignatures(g *graph.Graph) { +func finaliseCloneSignatures(g graph.Store) { // First pass: collect every body that has stashed shingles. We // capture the *graph.Node pointers up front so the CMS-build pass // and the signature-compute pass don't both re-walk g.AllNodes(). @@ -342,7 +342,7 @@ type CloneDetectionStats struct { // edges cannot survive — when either endpoint's file is reindexed, // EvictFile removes that node's edges in both directions before this // pass re-runs. -func detectClonesAndEmitEdges(g *graph.Graph, threshold float64) CloneDetectionStats { +func detectClonesAndEmitEdges(g graph.Store, threshold float64) CloneDetectionStats { return detectClonesAndEmitEdgesCtx(context.Background(), g, threshold) } @@ -353,7 +353,7 @@ func detectClonesAndEmitEdges(g *graph.Graph, threshold float64) CloneDetectionS // without intra-stage reporters an operator sees just one // "clone detection pass" marker followed by minutes of silence — no // way to tell finalise-signatures from LSH from edge-emission. -func detectClonesAndEmitEdgesCtx(ctx context.Context, g *graph.Graph, threshold float64) CloneDetectionStats { +func detectClonesAndEmitEdgesCtx(ctx context.Context, g graph.Store, threshold float64) CloneDetectionStats { var stats CloneDetectionStats if g == nil { return stats @@ -527,7 +527,7 @@ type diffusionEdge struct { // directPairs carries the canonicalised clone pairs already emitted as // EdgeSimilarTo; any pair in that set is skipped so semantically_related // and similar_to partition cleanly. -func diffuseSimilarityEdges(g *graph.Graph, pairs []clones.Pair, directPairs map[[2]string]struct{}) (diffusedPairs, diffusedEdges int) { +func diffuseSimilarityEdges(g graph.Store, pairs []clones.Pair, directPairs map[[2]string]struct{}) (diffusedPairs, diffusedEdges int) { if g == nil || len(pairs) < 2 { return 0, 0 } @@ -633,7 +633,7 @@ func diffuseSimilarityEdges(g *graph.Graph, pairs []clones.Pair, directPairs map // node's file/line for locality. Origin is ast_inferred — the // relationship is a statistical estimate over normalised tokens, not a // structural fact. -func emitSimilarEdge(g *graph.Graph, from, to *graph.Node, similarity float64) { +func emitSimilarEdge(g graph.Store, from, to *graph.Node, similarity float64) { g.AddEdge(&graph.Edge{ From: from.ID, To: to.ID, @@ -651,7 +651,7 @@ func emitSimilarEdge(g *graph.Graph, from, to *graph.Node, similarity float64) { // edge is anchored at the source node's file/line and origin is // ast_inferred — the score is a statistical estimate over normalised // tokens, here additionally smoothed across the similarity graph. -func emitSemanticallyRelatedEdge(g *graph.Graph, from, to *graph.Node, similarity float64) { +func emitSemanticallyRelatedEdge(g graph.Store, from, to *graph.Node, similarity float64) { g.AddEdge(&graph.Edge{ From: from.ID, To: to.ID, diff --git a/internal/indexer/clones_indexer_test.go b/internal/indexer/clones_indexer_test.go index 632c61bb..b3f10ead 100644 --- a/internal/indexer/clones_indexer_test.go +++ b/internal/indexer/clones_indexer_test.go @@ -63,7 +63,7 @@ func openAndScan(conn *Conn, statement string) error { } ` -func similarToEdges(g *graph.Graph) []*graph.Edge { +func similarToEdges(g graph.Store) []*graph.Edge { var out []*graph.Edge for _, e := range g.AllEdges() { if e.Kind == graph.EdgeSimilarTo { diff --git a/internal/indexer/contract_import_resolve.go b/internal/indexer/contract_import_resolve.go index ebf5bd5c..78026329 100644 --- a/internal/indexer/contract_import_resolve.go +++ b/internal/indexer/contract_import_resolve.go @@ -31,7 +31,7 @@ import ( // Languages other than TS / JS are skipped — Go disambiguates // bare-name collisions via package qualification (`pkg.Type`) and the // in-file resolveTypeInFile pass already handles those. -func (mi *MultiIndexer) disambiguateBareTypesViaImports(cr *contracts.Registry, g *graph.Graph) { +func (mi *MultiIndexer) disambiguateBareTypesViaImports(cr *contracts.Registry, g graph.Store) { srcCache := map[string][]byte{} importCache := map[string]map[string]string{} @@ -74,7 +74,7 @@ func (mi *MultiIndexer) disambiguateBareTypesViaImports(cr *contracts.Registry, // (so the caller leaves the bare name in place). func (mi *MultiIndexer) resolveBareTypeViaImports( srcFile, name string, - g *graph.Graph, + g graph.Store, srcCache map[string][]byte, importCache map[string]map[string]string, ) string { diff --git a/internal/indexer/contracts_bulk_commit_test.go b/internal/indexer/contracts_bulk_commit_test.go new file mode 100644 index 00000000..375e1abd --- /dev/null +++ b/internal/indexer/contracts_bulk_commit_test.go @@ -0,0 +1,209 @@ +package indexer + +import ( + "os" + "path/filepath" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/contracts" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/parser" +) + +// recordingBulkGraph embeds *graph.Graph (auto-satisfying graph.Store) +// and adds the BulkLoader methods so it also satisfies +// graph.BulkLoader. It records the order of BeginBulkLoad / AddBatch +// / FlushBulk calls so a test can assert that the contracts commit +// path routes through the bulk fast lane instead of per-row +// AddNode / AddEdge writes. +type recordingBulkGraph struct { + *graph.Graph + + calls []string + addNode atomic.Int64 + addEdge atomic.Int64 +} + +func newRecordingBulkGraph() *recordingBulkGraph { + return &recordingBulkGraph{Graph: graph.New()} +} + +func (r *recordingBulkGraph) BeginBulkLoad() { + r.calls = append(r.calls, "BeginBulkLoad") +} + +func (r *recordingBulkGraph) FlushBulk() error { + r.calls = append(r.calls, "FlushBulk") + return nil +} + +func (r *recordingBulkGraph) AddNode(n *graph.Node) { + r.addNode.Add(1) + r.Graph.AddNode(n) +} + +func (r *recordingBulkGraph) AddEdge(e *graph.Edge) { + r.addEdge.Add(1) + r.Graph.AddEdge(e) +} + +func (r *recordingBulkGraph) AddBatch(nodes []*graph.Node, edges []*graph.Edge) { + r.calls = append(r.calls, "AddBatch") + r.Graph.AddBatch(nodes, edges) +} + +// TestCommitContracts_BatchesViaAddBatch asserts that the final +// write phase of commitContracts emits all contract nodes and +// edges through a single AddBatch call and does NOT engage the +// BulkLoader COPY bracket. Contract IDs frequently coincide with +// existing source-symbol IDs (a handler appears as both a Go +// function and an HTTP-contract anchor), and Ladybug's COPY FROM +// is INSERT-only on the node table — wrapping the contracts pass +// in BeginBulkLoad/FlushBulk would crash on the first collision. +// AddBatch's per-call MERGE path absorbs duplicates safely. +func TestCommitContracts_BatchesViaAddBatch(t *testing.T) { + g := newRecordingBulkGraph() + require.Implements(t, (*graph.BulkLoader)(nil), graph.Store(g)) + + // Anchor symbol the contract's provides-edge will point from. + g.Graph.AddNode(&graph.Node{ + ID: "pkg/foo.go::Handler.List", + Kind: graph.KindMethod, + Name: "List", + FilePath: "pkg/foo.go", + Language: "go", + }) + + idx := New(g, parser.NewRegistry(), config.Default().Index, zap.NewNop()) + + reg := contracts.NewRegistry() + reg.Add(contracts.Contract{ + ID: "http::GET::/v1/items", + Type: contracts.ContractHTTP, + Role: contracts.RoleProvider, + SymbolID: "pkg/foo.go::Handler.List", + FilePath: "pkg/foo.go", + Line: 42, + }) + reg.Add(contracts.Contract{ + ID: "http::POST::/v1/items", + Type: contracts.ContractHTTP, + Role: contracts.RoleConsumer, + SymbolID: "pkg/foo.go::Handler.List", + FilePath: "pkg/foo.go", + Line: 58, + }) + + idx.commitContracts(reg) + + require.Equal(t, + []string{"AddBatch"}, + g.calls, + "contracts commit must batch through a single AddBatch call", + ) + require.Zero(t, g.addNode.Load(), "no per-row AddNode calls expected") + require.Zero(t, g.addEdge.Load(), "no per-row AddEdge calls expected") + + require.NotNil(t, g.GetNode("http::GET::/v1/items")) + require.NotNil(t, g.GetNode("http::POST::/v1/items")) + + // Provider contract emits both EdgeProvides and EdgeHandlesRoute; + // consumer contract emits only EdgeConsumes. + provides := g.GetOutEdges("pkg/foo.go::Handler.List") + var nProvides, nConsumes, nHandles int + for _, e := range provides { + switch e.Kind { + case graph.EdgeProvides: + nProvides++ + case graph.EdgeConsumes: + nConsumes++ + case graph.EdgeHandlesRoute: + nHandles++ + } + } + require.Equal(t, 1, nProvides, "expected 1 EdgeProvides for the provider contract") + require.Equal(t, 1, nConsumes, "expected 1 EdgeConsumes for the consumer contract") + require.Equal(t, 1, nHandles, "expected 1 EdgeHandlesRoute for the HTTP provider") +} + +// TestCommitContracts_NoBulkLoader_FallsBackToAddBatch asserts that +// when the backend does not implement graph.BulkLoader (the +// in-memory *graph.Graph case) commitContracts still issues a +// single AddBatch — not the per-row AddNode / AddEdge writes — and +// does not attempt to call BeginBulkLoad / FlushBulk. +func TestCommitContracts_NoBulkLoader_FallsBackToAddBatch(t *testing.T) { + g := graph.New() + require.NotImplements(t, (*graph.BulkLoader)(nil), graph.Store(g)) + + g.AddNode(&graph.Node{ + ID: "pkg/foo.go::Handler.List", + Kind: graph.KindMethod, + Name: "List", + FilePath: "pkg/foo.go", + Language: "go", + }) + + idx := New(g, parser.NewRegistry(), config.Default().Index, zap.NewNop()) + + reg := contracts.NewRegistry() + reg.Add(contracts.Contract{ + ID: "http::GET::/v1/items", + Type: contracts.ContractHTTP, + Role: contracts.RoleProvider, + SymbolID: "pkg/foo.go::Handler.List", + FilePath: "pkg/foo.go", + Line: 42, + }) + + idx.commitContracts(reg) + + require.NotNil(t, g.GetNode("http::GET::/v1/items")) + out := g.GetOutEdges("pkg/foo.go::Handler.List") + var nProvides, nHandles int + for _, e := range out { + switch e.Kind { + case graph.EdgeProvides: + nProvides++ + case graph.EdgeHandlesRoute: + nHandles++ + } + } + require.Equal(t, 1, nProvides) + require.Equal(t, 1, nHandles) +} + +// TestExtractGoModContracts_UsesAddBatch asserts that go.mod +// dependency-contract emission goes through a single AddBatch +// call (with the bulk path engaged when the backend supports it) +// instead of the per-row AddNode loop that previously did one +// cgo round-trip per dependency on the Ladybug backend. +func TestExtractGoModContracts_UsesAddBatch(t *testing.T) { + dir := t.TempDir() + goMod := []byte(`module example.com/test + +go 1.22 + +require ( + github.com/dep/one v1.0.0 + github.com/dep/two v0.5.0 +) +`) + require.NoError(t, os.WriteFile(filepath.Join(dir, "go.mod"), goMod, 0o644)) + + g := newRecordingBulkGraph() + idx := New(g, parser.NewRegistry(), config.Default().Index, zap.NewNop()) + idx.rootPath = dir + + reg := contracts.NewRegistry() + idx.extractGoModContracts(reg) + + require.Contains(t, g.calls, "AddBatch", + "extractGoModContracts must emit dep nodes via a single AddBatch") + require.Zero(t, g.addNode.Load(), "no per-row AddNode calls expected") +} + diff --git a/internal/indexer/dataflow.go b/internal/indexer/dataflow.go index c8c7679d..83622dd0 100644 --- a/internal/indexer/dataflow.go +++ b/internal/indexer/dataflow.go @@ -55,7 +55,7 @@ func (idx *Indexer) materializeDataflowParams() { // and lifts the edge target from the function node to the param // node at the recorded position. Edges that already point at a // param node are left alone. -func rewriteArgOf(g *graph.Graph, e *graph.Edge) { +func rewriteArgOf(g graph.Store, e *graph.Edge) { if e == nil || e.Meta == nil { return } @@ -83,7 +83,7 @@ func rewriteArgOf(g *graph.Graph, e *graph.Edge) { // rewriteReturnsTo lifts the placeholder From by joining on the // resolved EdgeCalls edge from the same caller and line. -func rewriteReturnsTo(g *graph.Graph, e *graph.Edge) { +func rewriteReturnsTo(g graph.Store, e *graph.Edge) { if e == nil || e.Meta == nil { return } @@ -112,7 +112,7 @@ func rewriteReturnsTo(g *graph.Graph, e *graph.Edge) { // unresolved target string so we don't lift to the wrong call when // two calls live on the same line. Falls back to the first match // otherwise. -func findCallTarget(g *graph.Graph, callerID string, line int, calleeText string) string { +func findCallTarget(g graph.Store, callerID string, line int, calleeText string) string { out := g.GetOutEdges(callerID) var fallback string for _, e := range out { @@ -163,7 +163,7 @@ func callTargetMatches(call *graph.Edge, calleeText string) bool { // paramNodeAtPosition returns the param node ID with the recorded // position attached to ownerID via EdgeParamOf. -func paramNodeAtPosition(g *graph.Graph, ownerID string, pos int) string { +func paramNodeAtPosition(g graph.Store, ownerID string, pos int) string { in := g.GetInEdges(ownerID) for _, e := range in { if e.Kind != graph.EdgeParamOf { diff --git a/internal/indexer/dataflow_test.go b/internal/indexer/dataflow_test.go index deb223aa..25293959 100644 --- a/internal/indexer/dataflow_test.go +++ b/internal/indexer/dataflow_test.go @@ -14,7 +14,7 @@ import ( // indexAll indexes a single-file Go fixture and runs the global // resolve + dataflow materialisation pass. Returns the graph for // assertions. -func indexAll(t *testing.T, src string) *graph.Graph { +func indexAll(t *testing.T, src string) graph.Store { t.Helper() dir := t.TempDir() require.NoError(t, os.WriteFile(filepath.Join(dir, "main.go"), []byte(src), 0o644)) @@ -28,7 +28,7 @@ func indexAll(t *testing.T, src string) *graph.Graph { } // findEdges returns all edges matching the predicate. -func findEdges(g *graph.Graph, kind graph.EdgeKind, match func(*graph.Edge) bool) []*graph.Edge { +func findEdges(g graph.Store, kind graph.EdgeKind, match func(*graph.Edge) bool) []*graph.Edge { var out []*graph.Edge for _, e := range g.AllEdges() { if e.Kind != kind { @@ -172,7 +172,7 @@ func Driver(z int) int { } } -func findFuncID(t *testing.T, g *graph.Graph, name string) string { +func findFuncID(t *testing.T, g graph.Store, name string) string { t.Helper() candidates := g.FindNodesByName(name) for _, n := range candidates { @@ -184,7 +184,7 @@ func findFuncID(t *testing.T, g *graph.Graph, name string) string { return "" } -func dumpAllEdges(g *graph.Graph) string { +func dumpAllEdges(g graph.Store) string { var b strings.Builder for _, e := range g.AllEdges() { b.WriteString(string(e.Kind)) diff --git a/internal/indexer/di_contracts.go b/internal/indexer/di_contracts.go index 6447eb53..550b61ab 100644 --- a/internal/indexer/di_contracts.go +++ b/internal/indexer/di_contracts.go @@ -36,15 +36,17 @@ func (idx *Indexer) extractDIContracts(reg *contracts.Registry) { var discovered []contracts.Contract if idx.repoPrefix != "" { - // Multi-repo: walk only this repo's outgoing edges. - for _, n := range idx.graph.GetRepoNodes(idx.repoPrefix) { - for _, e := range idx.graph.GetOutEdges(n.ID) { - c, ok := diContractFromEdge(e) - if !ok { - continue - } - discovered = append(discovered, c) + // Multi-repo: walk only this repo's outgoing edges via a + // single backend query. The previous GetRepoNodes × + // GetOutEdges nested walk was O(repo_nodes) per-node round- + // trips on disk backends — at ~68k repo nodes that meant + // 68k Cypher queries per pass on Ladybug. + for _, e := range idx.graph.GetRepoEdges(idx.repoPrefix) { + c, ok := diContractFromEdge(e) + if !ok { + continue } + discovered = append(discovered, c) } } else { // Single-repo: every edge belongs to this repo. @@ -96,10 +98,11 @@ func (idx *Indexer) linkSpringBeans() { } if idx.repoPrefix != "" { - for _, n := range idx.graph.GetRepoNodes(idx.repoPrefix) { - for _, e := range idx.graph.GetOutEdges(n.ID) { - collectBean(e) - } + // Single backend query instead of one GetOutEdges per + // repo node — see extractDIContracts above for the round- + // trip math. + for _, e := range idx.graph.GetRepoEdges(idx.repoPrefix) { + collectBean(e) } } else { for _, e := range idx.graph.AllEdges() { diff --git a/internal/indexer/diffusion_test.go b/internal/indexer/diffusion_test.go index b72702da..3dc1a684 100644 --- a/internal/indexer/diffusion_test.go +++ b/internal/indexer/diffusion_test.go @@ -12,7 +12,7 @@ import ( // semanticallyRelatedEdges collects every EdgeSemanticallyRelated edge // in the graph — the diffusion-pass output surface. -func semanticallyRelatedEdges(g *graph.Graph) []*graph.Edge { +func semanticallyRelatedEdges(g graph.Store) []*graph.Edge { var out []*graph.Edge for _, e := range g.AllEdges() { if e.Kind == graph.EdgeSemanticallyRelated { @@ -24,7 +24,7 @@ func semanticallyRelatedEdges(g *graph.Graph) []*graph.Edge { // addFnNode registers a bare function node so diffuseSimilarityEdges // has real endpoints to attach edges to. -func addFnNode(g *graph.Graph, id string) { +func addFnNode(g graph.Store, id string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindFunction, Name: id, FilePath: id, StartLine: 1, Language: "go", @@ -169,7 +169,7 @@ func TestDiffuseSimilarityEdges_Chain(t *testing.T) { // diffusedScoreFor returns the similarity carried by the directed // semantically_related edge from→to, and whether such an edge exists. -func diffusedScoreFor(g *graph.Graph, from, to string) (float64, bool) { +func diffusedScoreFor(g graph.Store, from, to string) (float64, bool) { for _, e := range semanticallyRelatedEdges(g) { if e.From == from && e.To == to { return e.Meta["similarity"].(float64), true diff --git a/internal/indexer/grpc_resolve_test.go b/internal/indexer/grpc_resolve_test.go index 44568451..9b942e16 100644 --- a/internal/indexer/grpc_resolve_test.go +++ b/internal/indexer/grpc_resolve_test.go @@ -12,7 +12,7 @@ import ( ) // outEdgeTo returns the first out-edge of fromID whose target is toID. -func outEdgeTo(g *graph.Graph, fromID, toID string) *graph.Edge { +func outEdgeTo(g graph.Store, fromID, toID string) *graph.Edge { for _, e := range g.GetOutEdges(fromID) { if e.To == toID { return e diff --git a/internal/indexer/incremental_reindex_test.go b/internal/indexer/incremental_reindex_test.go index 1f3daae0..c9ca51db 100644 --- a/internal/indexer/incremental_reindex_test.go +++ b/internal/indexer/incremental_reindex_test.go @@ -87,7 +87,7 @@ func Gone() {} // of its structural identity (node identities + edge triples). Two // graphs with an equal projection are byte-identical for every query // the engine can answer. -func canonicalGraph(g *graph.Graph) string { +func canonicalGraph(g graph.Store) string { var lines []string for _, n := range g.AllNodes() { if n == nil { diff --git a/internal/indexer/indexer.go b/internal/indexer/indexer.go index 1a9e6e52..90623460 100644 --- a/internal/indexer/indexer.go +++ b/internal/indexer/indexer.go @@ -76,8 +76,17 @@ type IndexResult struct { // file node carrying skipped_due_to_size / skipped_due_to_timeout // telemetry. Zero unless one of those caps is set. SkippedFiles int `json:"skipped_files,omitempty"` - DurationMs int64 `json:"duration_ms"` - Errors []IndexError `json:"errors,omitempty"` + // DeletedFileCount is the number of previously-indexed files that + // were evicted this pass because they no longer exist on disk (only + // populated by IncrementalReindex). Together with StaleFileCount it + // lets a batch caller — the daemon warmup loop in particular — decide + // whether a repo actually changed since the last shutdown: when both + // are zero across every repo, the persisted graph already carries + // every resolved / derived edge and the global resolution passes can + // be skipped entirely (the warm-restart fast path). + DeletedFileCount int `json:"deleted_file_count,omitempty"` + DurationMs int64 `json:"duration_ms"` + Errors []IndexError `json:"errors,omitempty"` } // EdgeSanityViolated reports the post-reindex sanity-check failure: an @@ -101,7 +110,14 @@ type IndexError struct { // Indexer walks a repository and populates the graph. type Indexer struct { - graph *graph.Graph + graph graph.Store + // indexCount tracks how many IndexCtx calls this Indexer has + // completed. Gates the cold-start shadow-swap: each per-repo + // Indexer in MultiIndexer is fresh (indexCount==0), so all of + // them take the shadow path regardless of what sibling repos + // have already drained into the shared disk store. Per-repo- + // prefixed stub IDs make the concurrent drains conflict-free. + indexCount atomic.Int32 registry *parser.Registry resolver *resolver.Resolver search search.Backend @@ -281,8 +297,12 @@ type contractCacheEntry struct { contracts []contracts.Contract } -// New creates an Indexer. -func New(g *graph.Graph, reg *parser.Registry, cfg config.IndexConfig, logger *zap.Logger) *Indexer { +// New creates an Indexer that writes through the supplied graph.Store. +// Any backend (in-memory, ladybug-on-disk, remote) is acceptable — the +// indexer's mutation paths go through the Store interface methods only, +// so swapping backends is a zero-code-change configuration choice for +// callers. +func New(g graph.Store, reg *parser.Registry, cfg config.IndexConfig, logger *zap.Logger) *Indexer { idx := &Indexer{ graph: g, registry: reg, @@ -291,7 +311,15 @@ func New(g *graph.Graph, reg *parser.Registry, cfg config.IndexConfig, logger *z // corpus sizes can happen in a background goroutine without // racing with concurrent searches. Subsequent reassignments to // idx.search (Hybrid wrap, etc.) should use swap helpers below. - search: search.NewSwappable(search.NewAuto()), + // + // When the backing store implements graph.SymbolSearcher + // (today only store_ladybug), the initial backend is a thin + // adapter that forwards Search to the store's native FTS. + // The in-process Bleve / BM25 build path is then bypassed + // entirely — saving ~100MB heap on a Vscode-scale repo and + // putting search in the same address space as the rest of + // the graph queries. + search: search.NewSwappable(initialSearchBackend(g)), config: cfg, transforms: newTransformPipeline(cfg.Transforms, logger), logger: logger, @@ -345,6 +373,87 @@ func searchIndexFields(n *graph.Node) []string { return []string{n.Name, n.FilePath, sig} } +// vectorSearcherDelegate is the search.VectorDelegate-shaped +// adapter the indexer hands to VectorBackend.SetDelegate when the +// underlying store implements graph.VectorSearcher. SimilarTo just +// forwards — search.VectorDelegate is defined to return +// graph.VectorHit slices directly, so there's no translation work +// here, just a small struct so the in-process search package +// doesn't depend on graph.VectorSearcher's full surface. +type vectorSearcherDelegate struct { + s graph.VectorSearcher +} + +func (d *vectorSearcherDelegate) SimilarTo(vec []float32, limit int) ([]graph.VectorHit, error) { + if d == nil || d.s == nil { + return nil, nil + } + return d.s.SimilarTo(vec, limit) +} + +// initialSearchBackend picks the search.Backend the indexer wraps +// in its Swappable on construction. When the underlying store +// implements graph.SymbolSearcher (today only store_ladybug), a +// thin adapter routes Search calls through the store's native FTS +// — the in-process BM25 / Bleve build path is bypassed entirely. +// Otherwise falls through to search.NewAuto which picks BM25 for +// small corpora and auto-upgrades to Bleve once the size warrants +// it. +func initialSearchBackend(g graph.Store) search.Backend { + if s, ok := g.(graph.SymbolSearcher); ok { + return search.NewSymbolSearcherBackend(s) + } + return search.NewAuto() +} + +// isSymbolSearcherBackend reports whether the swappable's currently +// active backend is the SymbolSearcher adapter. Used to suppress +// the Bleve auto-upgrade goroutine — if the active backend is +// already a native FTS, upgrading to Bleve would re-index the same +// corpus into a parallel in-process Bleve and silently swap it in, +// defeating the FTS path and pinning the ~100MB heap the FTS +// integration was meant to release. +func isSymbolSearcherBackend(b search.Backend) bool { + if b == nil { + return false + } + if sw, ok := b.(*search.Swappable); ok { + b = sw.Inner() + } + _, ok := b.(*search.SymbolSearcherBackend) + return ok +} + +// ftsTokensFor produces the pre-tokenised text the backend FTS path +// indexes. Mirrors searchIndexFields' field selection but joins +// every field through search.Tokenize (camelCase / snake_case / +// path-segment splitter) so the resulting token list matches the +// in-process BM25 corpus contract — the same query produces the +// same recall against either backend. Joined with spaces so the +// downstream COPY FROM sees a single STRING column value. +func ftsTokensFor(n *graph.Node) string { + fields := searchIndexFields(n) + if n.QualName != "" { + // QualName carries the dotted form (`pkg.Sub.Type.Method`) + // that adds qualifier-hop recall ("auth" matching + // "auth.ValidateToken"). searchIndexFields omits it for + // the legacy BM25 path (which folds qual into the + // name-token bag separately), so we add it explicitly here. + fields = append(fields, n.QualName) + } + tokens := make([]string, 0, 16) + for _, f := range fields { + if f == "" { + continue + } + tokens = append(tokens, search.Tokenize(f)...) + } + if len(tokens) == 0 { + return "" + } + return strings.Join(tokens, " ") +} + // shouldIndexForSearch reports whether a node should be added to the // text search index (BM25/Bleve). File and Import nodes are never // searchable symbols. Beyond that, config.SkipSearch filters out @@ -357,6 +466,22 @@ func (idx *Indexer) shouldIndexForSearch(n *graph.Node) bool { if n.Kind == graph.KindFile || n.Kind == graph.KindImport { return false } + // KindLocal nodes are intra-function bindings emitted to satisfy + // rel-table FK constraints on the dataflow edges that target + // locals. They have a real Name (the variable identifier) but + // surfacing them in BM25 would flood every search for common + // names like `err`, `data`, `n`, `i`. Excluded unconditionally. + if n.Kind == graph.KindLocal { + return false + } + // KindBuiltin nodes are language intrinsics (append / len / + // string / int / ...). Surfacing them in name search would + // drown every other hit on common identifiers — agents already + // know `string` / `append`. They remain queryable by kind and + // by ID for the analytics passes that care. + if n.Kind == graph.KindBuiltin { + return false + } // Prose-section nodes are searchable only when prose indexing is // enabled (search.index_prose); the rest of the graph is // unaffected by the toggle. @@ -485,7 +610,7 @@ func (idx *Indexer) upgradeSearchToBleve(snapshot []bleveUpgradeEntry) { } // Graph returns the underlying graph. -func (idx *Indexer) Graph() *graph.Graph { return idx.graph } +func (idx *Indexer) Graph() graph.Store { return idx.graph } // Search returns the search backend. func (idx *Indexer) Search() search.Backend { return idx.search } @@ -1506,7 +1631,7 @@ func (idx *Indexer) Index(root string) (*IndexResult, error) { // is pulled from ctx via progress.FromContext — attach one with // progress.WithReporter to receive stage updates. If no reporter is attached, // stage calls are silently dropped. -func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, error) { +func (idx *Indexer) IndexCtx(ctx context.Context, root string) (result *IndexResult, retErr error) { start := time.Now() reporter := progress.FromContext(ctx) @@ -1584,6 +1709,214 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er } reporter.Report("walking files", len(files), len(files)) + // In-memory shadow for cold-start indexing on disk-backed stores. + // Disk backends pay ms-level per-call cost on every read; running + // the resolver against the disk store turns its ~100k+ point + // lookups into many minutes of wall time. Instead, swap idx.graph + // to an in-memory *Graph for the whole IndexCtx pipeline — parse, + // resolve, all subpasses, every per-edge MERGE/MATCH stays in + // memory at nanosecond latency. At the end, dump the final state + // to the disk backend via one BulkLoad cycle, so the disk has the + // post-resolve graph and the bench's query workload runs against + // the persisted state. + // + // Guards: + // - Backend must implement graph.BulkLoader (ladybug opts in). + // - Store must be empty (NodeCount == 0 && EdgeCount == 0). The + // final dump is BulkLoad's INSERT-only fast path — running it + // against a non-empty store would corrupt or duplicate. + // Incremental / re-index flows fall through to the per-call + // AddBatch path against the disk store directly. + // - File count is below the shadow-max threshold (see + // shadowMaxFileCount). Above the threshold the shadow's RAM + // footprint would exceed available memory — Linux / Firefox + // at full scale (~10M+ edges) would push the shadow past + // 20GB. Override with GORTEX_SHADOW_MAX_FILES. + // - The swap happens before the parse worker pool starts and is + // committed before IndexCtx returns. retErr from the named + // return suppresses the commit when the pipeline errored — + // the disk store stays empty rather than capturing partial + // state. + var diskTarget graph.Store + var inMemShadow *graph.Graph + bl, blOK := idx.graph.(graph.BulkLoader) + // Per-Indexer sentinel: each *Indexer is constructed fresh + // (per-repo in MultiIndexer, once in single-repo daemons), so + // "this Indexer has indexed before" is the right question to + // gate the shadow-swap on. The legacy gate looked at the + // disk store's NodeCount, but in MultiIndexer the disk store + // holds data from sibling repos that already drained — the + // gate would mis-fire and force the big repo onto the per-row + // path. With per-repo-prefixed stub IDs (internal/graph/stub.go) + // concurrent shadow drains no longer conflict on PRIMARY KEY, + // so disk-non-empty is safe. + firstIndex := idx.indexCount.Load() == 0 + belowShadowMax := len(files) <= shadowMaxFileCount() + preNodes := idx.graph.NodeCount() + preEdges := idx.graph.EdgeCount() + idx.logger.Info("indexer: shadow-swap decision", + zap.String("repo", idx.RepoPrefix()), + zap.Bool("bulk_loader", blOK), + zap.Bool("first_index", firstIndex), + zap.Int("pre_nodes", preNodes), + zap.Int("pre_edges", preEdges), + zap.Int("files", len(files)), + zap.Int("shadow_max_files", shadowMaxFileCount()), + zap.Bool("below_shadow_max", belowShadowMax), + zap.Bool("shadow_taken", blOK && firstIndex && belowShadowMax), + ) + if blOK && firstIndex && belowShadowMax { + // Warm-restart safety. `firstIndex` is a PER-INDEXER sentinel, and + // a fresh per-repo Indexer is constructed on every daemon restart, + // so firstIndex is true on every restart — even when the + // persistent disk store already holds this repo's nodes from a + // prior run. The shadow drain below ends in BulkLoad's INSERT-only + // COPY, which (per this function's own contract) "running against a + // non-empty store would corrupt or duplicate". On the ladybug + // backend a duplicate-primary-key COPY does not error cleanly — it + // SIGSEGVs inside lbug_connection_query and takes the whole daemon + // down, then re-fires on the next restart (the repo's mtimes never + // got persisted because warmup died first): a crash loop. Evicting + // the repo's existing rows first makes the COPY land on a clean + // slate. EvictRepo self-guards with a count query, so this is a + // cheap no-op for the genuine first-index cases (true cold start, + // a newly-tracked repo) where the disk store has no rows for this + // prefix. preNodes>0 short-circuits the call entirely on the + // first repo of a cold start (empty store). + if preNodes > 0 { + if n, e := idx.graph.EvictRepo(idx.RepoPrefix()); n > 0 || e > 0 { + idx.logger.Info("indexer: evicted stale repo rows before bulk reload (warm restart)", + zap.String("repo", idx.RepoPrefix()), + zap.Int("nodes", n), zap.Int("edges", e)) + } + } + idx.indexCount.Add(1) + diskTarget = idx.graph + inMemShadow = graph.New() + idx.graph = inMemShadow + // The resolver was constructed at indexer.New with the disk + // Store. Redirect it at the shadow too, otherwise ResolveAll + // reads from the empty disk Store, finds no pending edges, + // and short-circuits — silently disabling every resolver pass + // (module attribution, relative imports, edge in-place + // resolution, …) for any backend that takes the shadow path. + if idx.resolver != nil { + idx.resolver.SetGraph(inMemShadow) + } + defer func() { + if retErr != nil { + idx.graph = diskTarget + if idx.resolver != nil { + idx.resolver.SetGraph(diskTarget) + } + return + } + reporter.Report("persisting bulk graph", 0, 0) + drainStart := time.Now() + shadowNodeCount := inMemShadow.NodeCount() + shadowEdgeCount := inMemShadow.EdgeCount() + idx.logger.Info("indexer: drain start (shadow → disk)", + zap.String("repo", idx.RepoPrefix()), + zap.Int("shadow_nodes", shadowNodeCount), + zap.Int("shadow_edges", shadowEdgeCount), + ) + bl.BeginBulkLoad() + // Drain the shadow shard-by-shard so the indexer's hold on + // the 11-GB Linux-scale graph is released progressively + // instead of pinned until persist returns. The drain + // iterators free each shard's node/edge maps as they + // advance, so peak RAM during the persist window is + // roughly the chunk buffer + the backend's working set, + // not full shadow + the disk backend's bulk-COPY buffer. + // + // Collect (id, tokens) for every search-eligible node as + // the drain yields them — feeds the backend's native FTS + // at FlushBulk time when the store implements + // graph.SymbolSearcher. Nodes that fail + // shouldIndexForSearch (KindFile / KindImport / + // KindLocal / KindBuiltin / skip-search lang+kind pairs) + // are excluded so the FTS corpus matches the in-process + // BM25 corpus exactly. + searcher, hasFTS := diskTarget.(graph.SymbolSearcher) + var ftsItems []graph.SymbolFTSItem + if hasFTS { + // Pre-size to the shadow's node count to avoid grow + // churn on a 600k-node Vscode-shape repo. + ftsItems = make([]graph.SymbolFTSItem, 0, inMemShadow.NodeCount()) + } + const persistChunk = 100000 + nodeBuf := make([]*graph.Node, 0, persistChunk) + for n := range inMemShadow.DrainNodes() { + if hasFTS && idx.shouldIndexForSearch(n) { + ftsItems = append(ftsItems, graph.SymbolFTSItem{ + NodeID: n.ID, + Tokens: ftsTokensFor(n), + }) + } + nodeBuf = append(nodeBuf, n) + if len(nodeBuf) >= persistChunk { + diskTarget.AddBatch(nodeBuf, nil) + nodeBuf = nodeBuf[:0] + } + } + if len(nodeBuf) > 0 { + diskTarget.AddBatch(nodeBuf, nil) + nodeBuf = nil + } + edgeBuf := make([]*graph.Edge, 0, persistChunk) + for e := range inMemShadow.DrainEdges() { + edgeBuf = append(edgeBuf, e) + if len(edgeBuf) >= persistChunk { + diskTarget.AddBatch(nil, edgeBuf) + edgeBuf = edgeBuf[:0] + } + } + if len(edgeBuf) > 0 { + diskTarget.AddBatch(nil, edgeBuf) + edgeBuf = nil + } + flushStart := time.Now() + idx.logger.Info("indexer: FlushBulk start", + zap.String("repo", idx.RepoPrefix()), + zap.Duration("drain_elapsed", flushStart.Sub(drainStart)), + ) + if ferr := bl.FlushBulk(); ferr != nil { + retErr = fmt.Errorf("indexer: persist bulk graph: %w", ferr) + } + idx.logger.Info("indexer: FlushBulk complete", + zap.String("repo", idx.RepoPrefix()), + zap.Duration("flush_elapsed", time.Since(flushStart)), + zap.Duration("total_drain", time.Since(drainStart)), + zap.Int("nodes", shadowNodeCount), + zap.Int("edges", shadowEdgeCount), + ) + // Build the backend FTS after the bulk load completes so + // CREATE_FTS_INDEX has the full corpus to scan in one + // pass. BulkUpsertSymbolFTS does its own + // extension-install dance, so this is the only place the + // indexer needs to know about SymbolSearcher. + if hasFTS && len(ftsItems) > 0 { + reporter.Report("building symbol fts", 0, 0) + if ferr := searcher.BulkUpsertSymbolFTS(idx.RepoPrefix(), ftsItems); ferr != nil { + idx.logger.Warn("indexer: bulk symbol FTS upsert failed", + zap.Error(ferr)) + } else if ferr := searcher.BuildSymbolIndex(); ferr != nil { + idx.logger.Warn("indexer: backend FTS build failed", + zap.Error(ferr)) + } + reporter.Report("building symbol fts", 1, 1) + } + reporter.Report("persisting bulk graph", 1, 1) + idx.graph = diskTarget + }() + } else if diskTarget == nil && idx.graph.NodeCount() == 0 && idx.graph.EdgeCount() == 0 { + if _, isBulk := idx.graph.(graph.BulkLoader); isBulk && len(files) > shadowMaxFileCount() { + idx.logger.Info("indexer: skipping in-memory shadow above threshold", + zap.Int("files", len(files)), + zap.Int("threshold", shadowMaxFileCount())) + } + } + // Worker pool. workers := idx.config.Workers if workers <= 0 { @@ -1623,7 +1956,6 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er contractReg := contracts.NewRegistry() var contractMu sync.Mutex - fileCh := make(chan walkedFile, workers*4) var errMu sync.Mutex var errors []IndexError var processed int64 @@ -1631,156 +1963,201 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er var skippedByTimeout int64 var skippedByMinified int64 - var wg sync.WaitGroup - for range workers { - wg.Add(1) - go func() { - defer wg.Done() - var localContracts []contracts.Contract - for wf := range fileCh { - path := wf.path - p := atomic.AddInt64(&processed, 1) - if p == 1 || p%parseReportEvery == 0 { - reporter.Report("parsing", int(p), totalFiles) - } + // parseChunk runs the per-file worker pool over the supplied + // slice. Closure over outer state (errors, counters, contract + // registry, parsePool, quarantine) so it can be called multiple + // times — once for the non-streaming path, repeatedly for the + // streaming-flush large-repo path where each call processes a + // bounded slice into a per-chunk in-memory shadow. + parseChunk := func(chunkFiles []walkedFile) { + fileCh := make(chan walkedFile, workers*4) + var wg sync.WaitGroup + for range workers { + wg.Add(1) + go func() { + defer wg.Done() + var localContracts []contracts.Contract + for wf := range fileCh { + path := wf.path + p := atomic.AddInt64(&processed, 1) + if p == 1 || p%parseReportEvery == 0 { + reporter.Report("parsing", int(p), totalFiles) + } - src, err := os.ReadFile(path) - if err != nil { - errMu.Lock() - errors = append(errors, IndexError{FilePath: path, Error: err.Error()}) - errMu.Unlock() - continue - } + src, err := os.ReadFile(path) + if err != nil { + errMu.Lock() + errors = append(errors, IndexError{FilePath: path, Error: err.Error()}) + errMu.Unlock() + continue + } - relPath, _ := filepath.Rel(absRoot, path) - // Reuse the walk-time language. The walk's - // effectiveLanguage call already consulted shebang - // bytes via readSniffPrefix (512-byte probe), so a - // re-detect against the full src would change the - // answer only on the vanishingly rare case where a - // language marker lives past byte 512 — and any such - // case is content-sniffing-by-luck rather than spec'd - // behaviour. The fallback below covers the truly - // pathological case where the walk-time language has - // no extractor registered (effectively dead code). - lang := wf.lang - ext, _ := idx.registry.GetByLanguage(lang) - if ext == nil { - if relang, ok := idx.effectiveLanguage(path, src); ok { - lang = relang - ext, _ = idx.registry.GetByLanguage(lang) + relPath, _ := filepath.Rel(absRoot, path) + // Reuse the walk-time language. The walk's + // effectiveLanguage call already consulted shebang + // bytes via readSniffPrefix (512-byte probe), so a + // re-detect against the full src would change the + // answer only on the vanishingly rare case where a + // language marker lives past byte 512 — and any such + // case is content-sniffing-by-luck rather than spec'd + // behaviour. The fallback below covers the truly + // pathological case where the walk-time language has + // no extractor registered (effectively dead code). + lang := wf.lang + ext, _ := idx.registry.GetByLanguage(lang) + if ext == nil { + if relang, ok := idx.effectiveLanguage(path, src); ok { + lang = relang + ext, _ = idx.registry.GetByLanguage(lang) + } + } + if ext == nil { + continue } - } - if ext == nil { - continue - } - // Pre-ingestion transforms: rewrite the bytes before - // extraction (BOM strip, minified-bundle expansion, a - // PDF→markdown command, …). - src = idx.transforms.run(relPath, src) + // Pre-ingestion transforms: rewrite the bytes before + // extraction (BOM strip, minified-bundle expansion, a + // PDF→markdown command, …). + src = idx.transforms.run(relPath, src) - result, skipped, err := idx.extractFile(parsePool, quarantine, path, relPath, lang, ext, src) - if err != nil { - errMu.Lock() - errors = append(errors, IndexError{FilePath: path, Error: err.Error()}) - errMu.Unlock() - } - if result == nil { - continue - } - if skipped && len(result.Nodes) > 0 { - if _, ok := result.Nodes[0].Meta["skipped_due_to_timeout"]; ok { - atomic.AddInt64(&skippedByTimeout, 1) + result, skipped, err := idx.extractFile(parsePool, quarantine, path, relPath, lang, ext, src) + if err != nil { + errMu.Lock() + errors = append(errors, IndexError{FilePath: path, Error: err.Error()}) + errMu.Unlock() } - if _, ok := result.Nodes[0].Meta["skipped_due_to_minified"]; ok { - atomic.AddInt64(&skippedByMinified, 1) + if result == nil { + continue + } + if skipped && len(result.Nodes) > 0 { + if _, ok := result.Nodes[0].Meta["skipped_due_to_timeout"]; ok { + atomic.AddInt64(&skippedByTimeout, 1) + } + if _, ok := result.Nodes[0].Meta["skipped_due_to_minified"]; ok { + atomic.AddInt64(&skippedByMinified, 1) + } } - } - // Append coverage artifacts (todos / licenses / - // ownership) before applyRepoPrefix so they get the - // same multi-repo namespacing treatment as - // language-extractor output. Skipped for quarantined / - // timed-out files — the coverage scanners would re-read - // a source the parser could not survive. - if !skipped { - idx.applyCoverageDomains(relPath, lang, src, result) - } + // Append coverage artifacts (todos / licenses / + // ownership) before applyRepoPrefix so they get the + // same multi-repo namespacing treatment as + // language-extractor output. Skipped for quarantined / + // timed-out files — the coverage scanners would re-read + // a source the parser could not survive. + if !skipped { + idx.applyCoverageDomains(relPath, lang, src, result) + } - idx.applyRepoPrefix(result.Nodes, result.Edges) - - // Find the file node (if the extractor produced one) - // and collect its outgoing edges — contract extractors - // take the file-scope edge set (imports, etc.), not - // every intra-file edge. - var fileNodeID, fileGraphPath string - for _, n := range result.Nodes { - if n.Kind == graph.KindFile { - fileNodeID = n.ID - fileGraphPath = n.FilePath - break + idx.applyRepoPrefix(result.Nodes, result.Edges) + + // Find the file node (if the extractor produced one) + // and collect its outgoing edges — contract extractors + // take the file-scope edge set (imports, etc.), not + // every intra-file edge. + var fileNodeID, fileGraphPath string + for _, n := range result.Nodes { + if n.Kind == graph.KindFile { + fileNodeID = n.ID + fileGraphPath = n.FilePath + break + } } - } - var fileScopeEdges []*graph.Edge - if fileNodeID != "" { - for _, e := range result.Edges { - if e.From == fileNodeID { - fileScopeEdges = append(fileScopeEdges, e) + var fileScopeEdges []*graph.Edge + if fileNodeID != "" { + for _, e := range result.Edges { + if e.From == fileNodeID { + fileScopeEdges = append(fileScopeEdges, e) + } } } - } - // Batch the per-file insert into one shard-grouped pass - // so each shard's lock is acquired at most once per - // file instead of N + 2·E times. Profiling showed 69 - // of 102 workers blocked on lockTwoWrite under the - // per-edge path during cold-start warmup. - idx.graph.AddBatch(result.Nodes, result.Edges) - - if !skipped && fileGraphPath != "" { - exts := contractExtractorsByLang[lang] - if len(exts) > 0 { - c := idx.runContractExtractorsForFile( - fileGraphPath, src, result.Nodes, fileScopeEdges, exts, result.Tree) - localContracts = append(localContracts, c...) - - // Populate the per-file contract cache so a - // later IncrementalReindex can skip this file - // on a cache hit. Mtime comes from the walk- - // time d.Info() — no extra stat here. - if wf.mtimeNano > 0 { - idx.contractCacheMu.Lock() - idx.contractCache[fileGraphPath] = &contractCacheEntry{ - mtimeNano: wf.mtimeNano, - contracts: c, + // Batch the per-file insert into one shard-grouped pass + // so each shard's lock is acquired at most once per + // file instead of N + 2·E times. Profiling showed 69 + // of 102 workers blocked on lockTwoWrite under the + // per-edge path during cold-start warmup. + idx.graph.AddBatch(result.Nodes, result.Edges) + + if !skipped && fileGraphPath != "" { + exts := contractExtractorsByLang[lang] + if len(exts) > 0 { + c := idx.runContractExtractorsForFile( + fileGraphPath, src, result.Nodes, fileScopeEdges, exts, result.Tree) + localContracts = append(localContracts, c...) + + // Populate the per-file contract cache so a + // later IncrementalReindex can skip this file + // on a cache hit. Mtime comes from the walk- + // time d.Info() — no extra stat here. + if wf.mtimeNano > 0 { + idx.contractCacheMu.Lock() + idx.contractCache[fileGraphPath] = &contractCacheEntry{ + mtimeNano: wf.mtimeNano, + contracts: c, + } + idx.contractCacheMu.Unlock() } - idx.contractCacheMu.Unlock() } } + // Release the parse tree now that the per-file + // contract pass is done. Post-passes that need a + // tree for this file (cross-file handler resolution) + // re-parse on demand. Nil-safe. + result.Tree.Release() + atomic.AddInt64(&fileCount, 1) } - // Release the parse tree now that the per-file - // contract pass is done. Post-passes that need a - // tree for this file (cross-file handler resolution) - // re-parse on demand. Nil-safe. - result.Tree.Release() - atomic.AddInt64(&fileCount, 1) - } - if len(localContracts) > 0 { - contractMu.Lock() - for _, c := range localContracts { - contractReg.Add(c) + if len(localContracts) > 0 { + contractMu.Lock() + for _, c := range localContracts { + contractReg.Add(c) + } + contractMu.Unlock() } - contractMu.Unlock() + }() + } + + for _, f := range chunkFiles { + fileCh <- f + } + close(fileCh) + wg.Wait() + } + + // Streaming-flush path: above shadowMaxFileCount with a + // BulkLoader-capable backend, we can't fit the whole shadow in + // RAM but we can still amortise the per-file disk-write cost by + // chunking. Each chunk runs against its own throwaway shadow, + // then flushes via BulkLoad to disk. Resolve runs against the + // disk store afterwards (per-call, slower than the shadow path + // but bounded RAM). Activated by GORTEX_STREAMING_FLUSH=1; off + // by default since it requires the disk-only resolver path + // (~tens of minutes on huge repos) that we haven't yet + // optimised end-to-end. + if diskTarget == nil && streamingFlushActive(idx.graph, len(files)) { + bl, _ := idx.graph.(graph.BulkLoader) + streamingDisk := idx.graph + chunkSize := streamingChunkSize() + idx.logger.Info("indexer: streaming-flush parse", + zap.Int("files", len(files)), + zap.Int("chunk_size", chunkSize)) + for chunkStart := 0; chunkStart < len(files); chunkStart += chunkSize { + chunkEnd := min(chunkStart+chunkSize, len(files)) + chunkShadow := graph.New() + idx.graph = chunkShadow + parseChunk(files[chunkStart:chunkEnd]) + // Flush chunk to disk. + bl.BeginBulkLoad() + streamingDisk.AddBatch(chunkShadow.AllNodes(), chunkShadow.AllEdges()) + if err := bl.FlushBulk(); err != nil { + return nil, fmt.Errorf("indexer: streaming-flush chunk %d..%d: %w", chunkStart, chunkEnd, err) } - }() - } - - for _, f := range files { - fileCh <- f + } + // After all chunks, idx.graph points at the disk store so + // the resolver and subpasses read/mutate the merged state. + idx.graph = streamingDisk + } else { + parseChunk(files) } - close(fileCh) - wg.Wait() if processed > 0 { reporter.Report("parsing", int(processed), totalFiles) @@ -1803,8 +2180,43 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er idx.fileMtimes[idx.relKey(f.path)] = f.mtimeNano } } + mtimeSnapshot := make(map[string]int64, len(idx.fileMtimes)) + for k, v := range idx.fileMtimes { + mtimeSnapshot[k] = v + } idx.mtimeMu.Unlock() + // Persist the per-file mtimes through the store's optional + // FileMtime sidecar table. On the ladybug backend this lets warm + // restarts seed ReconcileRepoCtx without having to read them back + // out of the gob+gzip metadata snapshot; on the in-memory + // backend the capability isn't implemented and the assertion + // short-circuits. + // + // Multi-repo bug: when the shadow-swap path is active, idx.graph + // is the in-memory shadow graph at this point — graph.Graph does + // NOT implement FileMtimeWriter, so the type assertion fails and + // persistence is silently skipped. The actual ladybug store is + // the local diskTarget variable; checking it first ensures warm- + // restart-skip-reindex actually works. The defer that swaps + // idx.graph back to diskTarget runs LATER, when IndexCtx returns, + // so we can't rely on it here. Falls through to idx.graph for the + // non-shadow path. + mtimeTarget := graph.Store(idx.graph) + if diskTarget != nil { + mtimeTarget = diskTarget + } + if w, ok := mtimeTarget.(graph.FileMtimeWriter); ok && len(mtimeSnapshot) > 0 { + if err := w.BulkSetFileMtimes(idx.repoPrefix, mtimeSnapshot); err != nil { + idx.logger.Warn("persist file mtimes failed", + zap.String("repo", idx.repoPrefix), zap.Error(err)) + } else { + idx.logger.Info("persisted file mtimes", + zap.String("repo", idx.repoPrefix), + zap.Int("count", len(mtimeSnapshot))) + } + } + // Retain parse errors and record index metadata. idx.parseErrors = errors idx.totalDetected = len(files) @@ -1949,7 +2361,16 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er // upgradeOnce gates the spawn so multi-repo warmup, which calls // IndexCtx once per tracked repo, doesn't launch one upgrade // goroutine per post-threshold repo. One per indexer lifetime. - if idx.search.Count() >= search.AutoThreshold { + // + // Skip the upgrade when the active search backend is the + // SymbolSearcher adapter: the disk store's native FTS is + // already serving search at engine-native latency, and + // spawning a parallel Bleve build would (a) waste ~100MB heap + // re-indexing the same corpus and (b) silently swap the + // adapter out for Bleve on completion — defeating the whole + // FTS path. The Swappable's current backend tells us which + // branch we're on. + if !isSymbolSearcherBackend(idx.search) && idx.search.Count() >= search.AutoThreshold { idx.upgradeOnce.Do(func() { reporter.Report("scheduling search backend upgrade", 0, 0) idx.upgradeSpawnedMu.Lock() @@ -1988,7 +2409,7 @@ func (idx *Indexer) IndexCtx(ctx context.Context, root string) (*IndexResult, er idx.indexGen.Add(1) // invalidate the trigram search cache nodes, edges := idx.repoNodeEdgeCount() - result := &IndexResult{ + result = &IndexResult{ NodeCount: nodes, EdgeCount: edges, FileCount: int(fileCount), @@ -2128,11 +2549,23 @@ func (idx *Indexer) indexFile(filePath string, resolve bool) error { // Add new symbols to search index. shouldIndexForSearch enforces // the same SkipSearch filter used by the bulk and upgrade paths. + // When the backing store implements graph.SymbolSearcher we + // also mirror each upsert into its native FTS, so an + // incremental reindex doesn't fall out of sync with the + // bulk-built corpus. + searcher, _ := idx.graph.(graph.SymbolSearcher) for _, n := range result.Nodes { if !idx.shouldIndexForSearch(n) { continue } idx.search.Add(n.ID, searchIndexFields(n)...) + if searcher != nil { + if err := searcher.UpsertSymbolFTS(n.ID, ftsTokensFor(n)); err != nil { + idx.logger.Debug("indexer: backend FTS upsert failed", + zap.String("id", n.ID), + zap.Error(err)) + } + } } if resolve { @@ -2158,9 +2591,18 @@ func (idx *Indexer) indexFile(filePath string, resolve bool) error { // key (relKey applied slash + NFC), so the mtime entry lines up // with the graph file-node key and with the bulk-walk mtimes. if info, err := os.Stat(absPath); err == nil { + mtime := info.ModTime().UnixNano() idx.mtimeMu.Lock() - idx.fileMtimes[relPath] = info.ModTime().UnixNano() + idx.fileMtimes[relPath] = mtime idx.mtimeMu.Unlock() + // Also persist through the store's FileMtime sidecar so the + // next warm restart sees this incremental update without + // having to wait for the periodic gob snapshot to roll it. + // Per-file MERGE is ~1ms on ladybug; trivial under steady- + // state file-watcher load. + if w, ok := idx.graph.(graph.FileMtimeWriter); ok { + _ = w.BulkSetFileMtimes(idx.repoPrefix, map[string]int64{relPath: mtime}) + } } return nil @@ -2786,9 +3228,38 @@ func (idx *Indexer) buildSearchIndex() { } vecBackend := search.NewVector(dims) + // VectorSearcher capability bridging: if the underlying store + // has a native HNSW, install it as the in-process backend's + // delegate — Add becomes a no-op, Search forwards to the + // engine, and we don't allocate `dim × 4 × N` bytes of heap + // for a parallel in-process HNSW. The indexer still drives + // the writes (BulkUpsertEmbeddings below) so the engine + // index lands with the same corpus the in-process one would + // have built. + vecSearcher, _ := idx.graph.(graph.VectorSearcher) + var backendItems []graph.VectorItem + if vecSearcher != nil { + vecBackend.SetDelegate(&vectorSearcherDelegate{s: vecSearcher}) + backendItems = make([]graph.VectorItem, 0, len(vectors)) + } for i, vec := range vectors { if vec != nil { vecBackend.Add(ids[i], vec) + if vecSearcher != nil { + backendItems = append(backendItems, graph.VectorItem{ + NodeID: ids[i], + Vec: vec, + }) + } + } + } + if vecSearcher != nil && len(backendItems) > 0 { + if err := vecSearcher.BulkUpsertEmbeddings(backendItems); err != nil { + idx.logger.Warn("indexer: backend vector bulk upsert failed", + zap.Error(err)) + } else if err := vecSearcher.BuildVectorIndex(dims); err != nil { + idx.logger.Warn("indexer: backend vector index build failed", + zap.Error(err)) } } // Install the chunk → parent-symbol mapping so HybridBackend can @@ -3133,7 +3604,22 @@ func (idx *Indexer) IncrementalReindexPaths(root string, paths []string) (*Index resolver.SynthesizeExternalCalls(idx.graph, idx.externalCallSynthesisEnabled()) } - idx.buildSearchIndex() + // Skip the search-index rebuild on a zero-change reconcile when the + // backend already persists its search structures (ladybug: native + // FTS + native HNSW vectors). buildSearchIndex re-reads every node + // (GetRepoNodes) and re-embeds them, then BulkUpsertEmbeddings does + // a `DELETE all SymbolVec` + COPY into a table that still carries the + // prior run's HNSW index. On a warm restart that work is pure + // recompute of already-persisted data, AND running it concurrently + // across the parallel-warmup workers is a CGo crash site (COPY into + // an indexed table; cross-repo DELETE-all stomp). When nothing + // changed there is nothing to re-embed, so skip it entirely — the + // persisted index is authoritative. The in-memory backends (BM25 / + // Bleve) must still rebuild from the replayed snapshot, so they keep + // the unconditional path. + if len(staleFiles) > 0 || len(deletedFiles) > 0 || !isSymbolSearcherBackend(idx.search) { + idx.buildSearchIndex() + } if len(staleFiles) > 0 || len(deletedFiles) > 0 { idx.extractContracts() @@ -3144,10 +3630,11 @@ func (idx *Indexer) IncrementalReindexPaths(root string, paths []string) (*Index result := &IndexResult{ NodeCount: nodes, EdgeCount: edges, - FileCount: len(diskFiles), - StaleFileCount: len(staleFiles), - FailedFiles: failedFiles, - DurationMs: time.Since(start).Milliseconds(), + FileCount: len(diskFiles), + StaleFileCount: len(staleFiles), + DeletedFileCount: len(deletedFiles), + FailedFiles: failedFiles, + DurationMs: time.Since(start).Milliseconds(), } idx.warnIfEdgeSanityViolated(result) return result, nil @@ -3335,8 +3822,16 @@ func (idx *Indexer) IncrementalReindex(root string) (*IndexResult, error) { // the global clone pass once at the end. } - // Rebuild search index to ensure consistency. - idx.buildSearchIndex() + // Rebuild search index to ensure consistency — but skip it on a + // zero-change reconcile against a backend that persists its search + // structures natively (ladybug). See the matching guard in the + // other incremental path: re-embedding + the DELETE-all-then-COPY + // into the still-indexed SymbolVec table is both wasted work and a + // parallel-warmup CGo crash site, and there is nothing to rebuild + // when no file changed. + if len(staleFiles) > 0 || len(deletedFiles) > 0 || !isSymbolSearcherBackend(idx.search) { + idx.buildSearchIndex() + } // Update totalDetected so index_health reports correctly after cache restore. if idx.totalDetected == 0 { @@ -3353,10 +3848,11 @@ func (idx *Indexer) IncrementalReindex(root string) (*IndexResult, error) { result := &IndexResult{ NodeCount: nodes, EdgeCount: edges, - FileCount: len(diskFiles), - StaleFileCount: len(staleFiles), - FailedFiles: failedFiles, - DurationMs: time.Since(start).Milliseconds(), + FileCount: len(diskFiles), + StaleFileCount: len(staleFiles), + DeletedFileCount: len(deletedFiles), + FailedFiles: failedFiles, + DurationMs: time.Since(start).Milliseconds(), } idx.warnIfEdgeSanityViolated(result) return result, nil @@ -3514,52 +4010,77 @@ func (idx *Indexer) commitContracts(reg *contracts.Registry) { // the wire format. idx.inlineEnvelopeShapes(reg) - for _, c := range reg.All() { - contractNode := &graph.Node{ - ID: c.ID, - Kind: graph.KindContract, - Name: c.ID, - FilePath: c.FilePath, - Language: "contract", - Meta: map[string]any{"type": string(c.Type), "role": string(c.Role)}, + all := reg.All() + nodes := make([]*graph.Node, 0, len(all)) + edges := make([]*graph.Edge, 0, len(all)) + for _, c := range all { + // dep:: nodes were materialised by extractGoModContracts + // before ResolveAll (so the import bridge could find them); + // re-emitting them here would PK-collide on backends whose bulk + // COPY is INSERT-only (Ladybug). The pre-pass is the single + // writer for that contract type. + if c.Type == contracts.ContractDependency { + continue } - idx.graph.AddNode(contractNode) + nodes = append(nodes, &graph.Node{ + ID: c.ID, + Kind: graph.KindContract, + Name: c.ID, + FilePath: c.FilePath, + Language: "contract", + RepoPrefix: c.RepoPrefix, + WorkspaceID: c.EffectiveWorkspace(), + ProjectID: c.EffectiveProject(), + Meta: map[string]any{ + "type": string(c.Type), + "role": string(c.Role), + "symbol_id": c.SymbolID, + "line": c.Line, + "confidence": c.Confidence, + "contract_meta": c.Meta, + }, + }) + if c.SymbolID == "" { + continue + } edgeKind := graph.EdgeProvides if c.Role == contracts.RoleConsumer { edgeKind = graph.EdgeConsumes } - if c.SymbolID != "" { - idx.graph.AddEdge(&graph.Edge{ + edges = append(edges, &graph.Edge{ + From: c.SymbolID, + To: c.ID, + Kind: edgeKind, + FilePath: c.FilePath, + Line: c.Line, + }) + // Framework-layer EdgeHandlesRoute. Emitted alongside + // EdgeProvides for HTTP / gRPC / WS / GraphQL / topic + // providers so `analyze kind=routes` and other + // framework-aware tools walk one targeted edge instead + // of filtering EdgeProvides by contract type. Consumers + // (callers of routes) and non-route contract types (env, + // OpenAPI specs, DI tokens) intentionally skip this + // edge — they aren't route handlers. + if c.Role == contracts.RoleProvider && isRouteContractType(c.Type) { + edges = append(edges, &graph.Edge{ From: c.SymbolID, To: c.ID, - Kind: edgeKind, + Kind: graph.EdgeHandlesRoute, FilePath: c.FilePath, Line: c.Line, + Meta: map[string]any{ + "contract_type": string(c.Type), + }, }) - // Framework-layer EdgeHandlesRoute. Emitted alongside - // EdgeProvides for HTTP / gRPC / WS / GraphQL / topic - // providers so `analyze kind=routes` and other - // framework-aware tools walk one targeted edge instead - // of filtering EdgeProvides by contract type. Consumers - // (callers of routes) and non-route contract types (env, - // OpenAPI specs, DI tokens) intentionally skip this - // edge — they aren't route handlers. - if c.Role == contracts.RoleProvider && isRouteContractType(c.Type) { - idx.graph.AddEdge(&graph.Edge{ - From: c.SymbolID, - To: c.ID, - Kind: graph.EdgeHandlesRoute, - FilePath: c.FilePath, - Line: c.Line, - Meta: map[string]any{ - "contract_type": string(c.Type), - }, - }) - } } } + bulkStart := time.Now() + idx.bulkCommit(nodes, edges) + bulkElapsed := time.Since(bulkStart) + idx.contractRegistry = reg repo := idx.rootPath if idx.repoPrefix != "" { @@ -3567,7 +4088,24 @@ func (idx *Indexer) commitContracts(reg *contracts.Registry) { } idx.logger.Info("contracts extracted", zap.String("repo", repo), - zap.Int("count", len(reg.All()))) + zap.Int("count", len(all)), + zap.Duration("commit_bulk_elapsed", bulkElapsed)) +} + +// bulkCommit writes nodes + edges in one AddBatch call. The bulk +// COPY path is intentionally NOT used here: contract IDs often +// coincide with existing source-symbol IDs (a route handler shows +// up as both a Go function and an HTTP-contract anchor), and +// Ladybug's COPY FROM is INSERT-only on the node table so any +// collision fails the whole batch. AddBatch's non-bulk path runs +// MERGE for every row so duplicates are absorbed in place; the +// per-call cost is amortised by the chunked UNWIND-MERGE path the +// backend uses internally. +func (idx *Indexer) bulkCommit(nodes []*graph.Node, edges []*graph.Edge) { + if len(nodes) == 0 && len(edges) == 0 { + return + } + idx.graph.AddBatch(nodes, edges) } // isRouteContractType reports whether a ContractType corresponds to a @@ -4933,6 +5471,7 @@ func (idx *Indexer) extractGoModContracts(reg *contracts.Registry) { found := goModExtractor.Extract(goModFilePath, goModSrc, nil, nil) reg.AddAllScoped(found, idx.repoPrefix, idx.workspaceID, idx.projectID) + var nodes []*graph.Node for i := range found { c := found[i] if c.Type != contracts.ContractDependency { @@ -4941,7 +5480,7 @@ func (idx *Indexer) extractGoModContracts(reg *contracts.Registry) { if idx.graph.GetNode(c.ID) != nil { continue } - idx.graph.AddNode(&graph.Node{ + nodes = append(nodes, &graph.Node{ ID: c.ID, Kind: graph.KindContract, Name: c.ID, @@ -4951,6 +5490,9 @@ func (idx *Indexer) extractGoModContracts(reg *contracts.Registry) { Meta: map[string]any{"type": string(c.Type), "role": string(c.Role)}, }) } + if len(nodes) > 0 { + idx.graph.AddBatch(nodes, nil) + } } // extractContracts scans all file nodes in the graph and runs contract @@ -4979,6 +5521,33 @@ func (idx *Indexer) extractContracts() { nodes = idx.graph.AllNodes() } + // Pre-bucket the already-fetched node slice by FilePath so the + // per-file body can look up its co-located nodes in O(1) instead + // of firing a fresh GetFileNodes query per file. Likewise pre- + // fetch every out-edge whose source is in this repo as ONE backend + // call and bucket by From so the per-file body can replace + // GetOutEdges(fileNode.ID) — on disk backends the per-file query + // path was the second-largest source of round-trips in + // deferred_passes (after the DI walk). + nodesByFile := make(map[string][]*graph.Node, len(nodes)) + for _, n := range nodes { + if n == nil { + continue + } + nodesByFile[n.FilePath] = append(nodesByFile[n.FilePath], n) + } + var edgesByFrom map[string][]*graph.Edge + if idx.repoPrefix != "" { + repoEdges := idx.graph.GetRepoEdges(idx.repoPrefix) + edgesByFrom = make(map[string][]*graph.Edge, len(nodes)) + for _, e := range repoEdges { + if e == nil { + continue + } + edgesByFrom[e.From] = append(edgesByFrom[e.From], e) + } + } + for _, fileNode := range nodes { if fileNode.Kind != graph.KindFile { continue @@ -5022,8 +5591,15 @@ func (idx *Indexer) extractContracts() { continue } - fileNodes := idx.graph.GetFileNodes(fileNode.FilePath) - fileEdges := idx.graph.GetOutEdges(fileNode.ID) + var fileNodes []*graph.Node + var fileEdges []*graph.Edge + if idx.repoPrefix != "" { + fileNodes = nodesByFile[fileNode.FilePath] + fileEdges = edgesByFrom[fileNode.ID] + } else { + fileNodes = idx.graph.GetFileNodes(fileNode.FilePath) + fileEdges = idx.graph.GetOutEdges(fileNode.ID) + } // Language-filtered dispatch: skip extractors that don't list // this file's language in SupportedLanguages(). On big repos @@ -5074,6 +5650,85 @@ func (idx *Indexer) extractContracts() { // Unicode form than fileMtimes was keyed with still resolves — without // the fold the lookup would miss and the file be reported permanently // stale, re-indexing it under a second key on every pass. +// HasChangesSinceMtimes reports whether any indexable file under root +// changed (mtime differs or is new) or was deleted, relative to the +// indexer's currently-loaded fileMtimes. It runs the SAME walk + +// staleness + deletion logic as IncrementalReindex but writes nothing. +// +// The daemon warmup uses it to choose a reconcile strategy for a +// reopened repo: a repo with zero changes takes the fast no-op +// IncrementalReindex path, while a repo that changed while the daemon +// was down is routed through the shadow/bulk-COPY re-track path instead. +// That routing matters because IncrementalReindex re-resolves changed +// files through per-edge graph.ReindexEdges, and the per-edge ladybug +// write path HANGS inside lbug_connection_prepare on the first write to +// a freshly reopened store — the warm restart wedges at 0% CPU forever. +// The shadow path resolves entirely in an in-memory graph and commits +// the result in one bulk COPY, so it never issues a per-edge write to +// the reopened store. It re-indexes the whole repo (more work than a +// true incremental pass), but it is reliable, and only repos that +// actually changed during downtime pay the cost. +// +// Conservative on error: anything it can't determine (bad root, walk +// error) returns true so the caller re-indexes rather than silently +// serving a stale graph. +func (idx *Indexer) HasChangesSinceMtimes(root string) bool { + absRoot, err := filepath.Abs(root) + if err != nil { + return true + } + idx.rootPath = absRoot + + diskFiles := make(map[string]bool) + errStop := errors.New("stop-walk") + walkErr := filepath.WalkDir(absRoot, func(path string, d os.DirEntry, werr error) error { + if werr != nil { + return nil + } + if d.IsDir() { + if idx.shouldExclude(path, absRoot, true) { + return filepath.SkipDir + } + return nil + } + if _, ok := idx.effectiveLanguage(path, nil); !ok { + return nil + } + if idx.shouldExclude(path, absRoot, false) { + return nil + } + rel := idx.relKey(path) + diskFiles[rel] = true + if idx.IsStale(rel) { + return errStop // a single changed/new file is enough + } + return nil + }) + if errors.Is(walkErr, errStop) { + return true + } + if walkErr != nil { + return true + } + + // Deletion check: a previously-indexed file absent from the walk and + // confirmed gone from disk counts as a change (its edges must drop). + idx.mtimeMu.RLock() + var candidates []string + for rel := range idx.fileMtimes { + if !diskFiles[rel] { + candidates = append(candidates, rel) + } + } + idx.mtimeMu.RUnlock() + for _, rel := range candidates { + if _, err := os.Stat(filepath.Join(absRoot, filepath.FromSlash(rel))); errors.Is(err, os.ErrNotExist) { + return true + } + } + return false +} + func (idx *Indexer) IsStale(relPath string) bool { relPath = pathkey.Normalize(filepath.ToSlash(relPath)) diff --git a/internal/indexer/indexer_test.go b/internal/indexer/indexer_test.go index 2fcba073..1b12e725 100644 --- a/internal/indexer/indexer_test.go +++ b/internal/indexer/indexer_test.go @@ -64,7 +64,7 @@ func writeFile(t *testing.T, path, content string) { require.NoError(t, os.WriteFile(path, []byte(content), 0o644)) } -func newTestIndexer(g *graph.Graph) *Indexer { +func newTestIndexer(g graph.Store) *Indexer { reg := parser.NewRegistry() reg.Register(languages.NewGoExtractor()) cfg := config.Default().Index diff --git a/internal/indexer/multi.go b/internal/indexer/multi.go index d70f7e8f..b40326b9 100644 --- a/internal/indexer/multi.go +++ b/internal/indexer/multi.go @@ -45,7 +45,7 @@ type RepoMetadata struct { // MultiIndexer orchestrates indexing across multiple repositories. type MultiIndexer struct { - graph *graph.Graph + graph graph.Store registry *parser.Registry search search.Backend embedder embedding.Provider @@ -379,6 +379,26 @@ func (mi *MultiIndexer) EndBatch() { mi.RunGlobalGraphPasses(context.Background()) } +// ResetBatch clears deferred-batch mode WITHOUT running the graph-wide +// derivation passes. It is the warm-restart fast-path counterpart to +// EndBatch: when the warmup reconcile loop observed zero changed files +// across every repo, the persistent backend already holds every resolved +// and derived edge from the prior run, so RunGlobalGraphPasses (plus the +// RunDeferredPassesAll / RunGlobalResolve the caller also skips) would +// only recompute what's already on disk — the work that turns a warm +// restart into a 30s–500s stall. The per-Indexer SetDeferGlobalPasses +// flag is still restored so a later watch-triggered TrackRepoCtx / +// IncrementalReindex runs its passes inline as normal. +func (mi *MultiIndexer) ResetBatch() { + mi.mu.Lock() + defer mi.mu.Unlock() + mi.deferGlobalPasses = false + mi.deferResolve = false + for _, idx := range mi.indexers { + idx.SetDeferGlobalPasses(false) + } +} + // RunGlobalGraphPasses runs the graph-wide derivation passes once // against the shared graph: InferImplements (structural interface // satisfaction), InferOverrides (method-level overrides on @@ -491,7 +511,7 @@ func (mi *MultiIndexer) externalCallSynthesisEnabled() bool { // NewMultiIndexer creates a MultiIndexer. func NewMultiIndexer( - g *graph.Graph, + g graph.Store, reg *parser.Registry, s search.Backend, cm *config.ConfigManager, @@ -1085,7 +1105,41 @@ func (mi *MultiIndexer) ReconcileRepoCtx(ctx context.Context, entry config.RepoE idx.SetRootPath(absPath) idx.SetFileMtimes(priorMtimes) - result, err := idx.IncrementalReindex(absPath) + // Choose the reconcile strategy. A repo that changed while the + // daemon was down must NOT take IncrementalReindex's per-file path: + // re-resolving a changed file there goes through per-edge + // graph.ReindexEdges, and the per-edge ladybug write hangs inside + // lbug_connection_prepare on the first write to a freshly reopened + // store (the warm restart wedges forever at 0% CPU). The shadow/bulk + // re-track path (IndexCtx) resolves in an in-memory shadow and + // commits one bulk COPY, so it never issues a per-edge write to the + // reopened store. It re-indexes the whole repo, but only repos that + // actually changed pay it, and it is reliable where the per-edge path + // is not. A repo with zero changes keeps the fast IncrementalReindex + // no-op (walk + 0 stale → return), which is what makes an unchanged + // warm restart near-instant. + // The shadow/bulk re-track workaround for the per-edge ReindexEdges + // hang applies ONLY to disk-backed stores (ladybug), which is where + // the first per-edge write to a reopened store wedges in + // lbug_connection_prepare. The in-memory backend (*graph.Graph) has + // no reopen and no CGo write path, and IncrementalReindex is the + // authoritative path there — it evicts offline-deleted files in place + // (a re-track of a shared in-memory graph would not). Gate on the + // store type so the memory backend keeps its exact prior behaviour. + _, memoryBacked := mi.graph.(*graph.Graph) + var result *IndexResult + if !memoryBacked && idx.HasChangesSinceMtimes(absPath) { + result, err = idx.IndexCtx(ctx, absPath) + if err == nil && result != nil && result.StaleFileCount == 0 { + // Signal "this repo did re-indexing work" to the warmup + // change-detector (which keys on StaleFileCount): a full + // re-track touches every file, so the daemon's global + // resolution passes must run. + result.StaleFileCount = result.FileCount + } + } else { + result, err = idx.IncrementalReindex(absPath) + } if err != nil { return nil, fmt.Errorf("reconciling %s: %w", absPath, err) } @@ -1587,7 +1641,7 @@ func (mi *MultiIndexer) MergedContractRegistry() *contracts.Registry { // re-extract shapes (the type nodes already have them from // snapshotContractShapes if they were referenced anywhere), it just // attaches them to the new contract entries. -func (mi *MultiIndexer) attachInlinedShapes(cr *contracts.Registry, g *graph.Graph) { +func (mi *MultiIndexer) attachInlinedShapes(cr *contracts.Registry, g graph.Store) { idsToTouch := map[string]bool{} for _, c := range cr.All() { if c.Meta == nil { @@ -2036,7 +2090,7 @@ func (mi *MultiIndexer) ReconcileContractEdges() int { // have the contract ID can also look up the topic node directly. // Meta on the node carries the broker family and the raw topic name // for filterless queries. -func emitTopicEdges(g *graph.Graph, m contracts.CrossLink, topicNodes map[string]struct{}) { +func emitTopicEdges(g graph.Store, m contracts.CrossLink, topicNodes map[string]struct{}) { // Trust the matcher to bucket only same-broker contracts together // because Contract.ID already includes the broker token; if the // broker isn't on the provider Meta, fall through to the contract @@ -2136,7 +2190,7 @@ func parseTopicContractID(id string) (broker, name string, ok bool) { } // Graph returns the underlying shared graph. -func (mi *MultiIndexer) Graph() *graph.Graph { +func (mi *MultiIndexer) Graph() graph.Store { return mi.graph } diff --git a/internal/indexer/multi_contract_edges_test.go b/internal/indexer/multi_contract_edges_test.go index d938a068..d6e1ab66 100644 --- a/internal/indexer/multi_contract_edges_test.go +++ b/internal/indexer/multi_contract_edges_test.go @@ -880,7 +880,7 @@ func TestReconcileContractEdges_OpenAPIBridge(t *testing.T) { // matchEdgeSummaries dumps all EdgeMatches as "from → to" strings for // failure-message context when the expected bridges aren't present. -func matchEdgeSummaries(g *graph.Graph) []string { +func matchEdgeSummaries(g graph.Store) []string { var out []string for _, e := range g.AllEdges() { if e.Kind == graph.EdgeMatches { @@ -927,7 +927,7 @@ func TestReconcileContractEdges_PurgesStaleOnUntrack(t *testing.T) { len(remaining), remaining) } -func collectMatchEdges(g *graph.Graph) []string { +func collectMatchEdges(g graph.Store) []string { var out []string for _, e := range g.AllEdges() { if e.Kind == graph.EdgeMatches { diff --git a/internal/indexer/multi_global_passes_test.go b/internal/indexer/multi_global_passes_test.go index b426cc5c..d0b65707 100644 --- a/internal/indexer/multi_global_passes_test.go +++ b/internal/indexer/multi_global_passes_test.go @@ -50,7 +50,7 @@ func TestRunGreet(t *testing.T) { return dir } -func countEdges(g *graph.Graph, kind graph.EdgeKind) int { +func countEdges(g graph.Store, kind graph.EdgeKind) int { n := 0 for _, e := range g.AllEdges() { if e.Kind == kind { diff --git a/internal/indexer/multi_node_id_test.go b/internal/indexer/multi_node_id_test.go index d58a414a..47483359 100644 --- a/internal/indexer/multi_node_id_test.go +++ b/internal/indexer/multi_node_id_test.go @@ -130,7 +130,7 @@ func TestMultiRepo_ResolvesCallEdges(t *testing.T) { } } -func outEdgeSummaries(g *graph.Graph, id string) []string { +func outEdgeSummaries(g graph.Store, id string) []string { var out []string for _, e := range g.GetOutEdges(id) { out = append(out, string(e.Kind)+":"+e.To) @@ -176,9 +176,22 @@ func TestTrackRepoCtx_FirstOfManyStillGetsPrefix(t *testing.T) { // Every node must carry a non-empty RepoPrefix and its FilePath must // live under that prefix. Any violation means a code path bypassed - // applyRepoPrefix. + // applyRepoPrefix. KindModule and KindBuiltin are deliberately + // cross-repo singletons (one `module::pypi:requests` / + // `builtin::go::type::string` shared across every repo that uses + // them) so they're exempt from the per-repo prefix rule. var missingPrefix, badFilePaths []string for _, n := range g.AllNodes() { + if n.Kind == graph.KindModule || n.Kind == graph.KindBuiltin { + continue + } + if ext, _ := n.Meta["external"].(bool); ext { + // External call targets the resolver materialises as + // KindFunction with meta.external=true are cross-repo + // singletons (one `stdlib::fmt::Sprintf` shared across + // every repo that calls it) — same as KindModule. + continue + } if n.RepoPrefix == "" { missingPrefix = append(missingPrefix, n.ID) continue diff --git a/internal/indexer/multi_test.go b/internal/indexer/multi_test.go index 3cc88ad7..2f4c5aae 100644 --- a/internal/indexer/multi_test.go +++ b/internal/indexer/multi_test.go @@ -747,7 +747,7 @@ func TestPropertyReindexIsolation(t *testing.T) { } // countRepoEdges counts edges where at least one endpoint belongs to the given repo prefix. -func countRepoEdges(g *graph.Graph, repoPrefix string) int { +func countRepoEdges(g graph.Store, repoPrefix string) int { prefix := repoPrefix + "/" count := 0 for _, e := range g.AllEdges() { diff --git a/internal/indexer/multi_topic_edges_test.go b/internal/indexer/multi_topic_edges_test.go index 52db7f62..66b06505 100644 --- a/internal/indexer/multi_topic_edges_test.go +++ b/internal/indexer/multi_topic_edges_test.go @@ -25,7 +25,7 @@ import ( // findTopicNode walks the graph for a KindTopic node by ID and // returns it (or nil if absent). Used by topic-edge tests to assert // node materialisation alongside edge presence. -func findTopicNode(g *graph.Graph, id string) *graph.Node { +func findTopicNode(g graph.Store, id string) *graph.Node { for _, n := range g.AllNodes() { if n.Kind == graph.KindTopic && n.ID == id { return n @@ -36,7 +36,7 @@ func findTopicNode(g *graph.Graph, id string) *graph.Node { // collectTopicEdges returns every produces_topic / consumes_topic // edge in the graph as "from→to" strings, for diagnostic output. -func collectTopicEdges(g *graph.Graph, kind graph.EdgeKind) []string { +func collectTopicEdges(g graph.Store, kind graph.EdgeKind) []string { var out []string for _, e := range g.AllEdges() { if e.Kind == kind { @@ -264,7 +264,7 @@ func TestReconcileContractEdges_TopicEdges_CrossWorkspaceIsolation(t *testing.T) } // topicNodeIDs returns the ID of every KindTopic node in the graph. -func topicNodeIDs(g *graph.Graph) []string { +func topicNodeIDs(g graph.Store) []string { var out []string for _, n := range g.AllNodes() { if n.Kind == graph.KindTopic { diff --git a/internal/indexer/npm_alias_resolve_test.go b/internal/indexer/npm_alias_resolve_test.go index 467777d2..c78b7f42 100644 --- a/internal/indexer/npm_alias_resolve_test.go +++ b/internal/indexer/npm_alias_resolve_test.go @@ -116,7 +116,7 @@ func TestNpmAliasIndex_NilRootsYieldsNil(t *testing.T) { // addPackageNode registers a KindPackage node with the given qualified // name — this is what CrossRepoResolver.resolveImport matches an // import path against (mirrors the existing cross-repo import tests). -func addPackageNode(g *graph.Graph, repo, file, qualName string) { +func addPackageNode(g graph.Store, repo, file, qualName string) { g.AddNode(&graph.Node{ ID: file, Kind: graph.KindPackage, Name: qualName, QualName: qualName, FilePath: file, Language: "typescript", RepoPrefix: repo, diff --git a/internal/indexer/shadow_resolver_test.go b/internal/indexer/shadow_resolver_test.go new file mode 100644 index 00000000..aaf87363 --- /dev/null +++ b/internal/indexer/shadow_resolver_test.go @@ -0,0 +1,124 @@ +//go:build ladybug + +package indexer + +import ( + "context" + "path/filepath" + "sort" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/graph/store_ladybug" + "github.com/zzet/gortex/internal/parser" + "github.com/zzet/gortex/internal/parser/languages" +) + +// TestShadowSwap_ResolverFollowsGraphPointer guards against the regression +// where the indexer's in-memory shadow swap reassigned idx.graph but left +// idx.resolver pointing at the empty disk Store. The symptom was that +// every resolver pass (module attribution, relative imports, edge in-place +// resolution, ...) silently no-op'd for any backend that opted into the +// shadow swap — because the resolver's r.graph.EdgesWithUnresolvedTarget() +// returned 0 against the empty disk store and ResolveAll short-circuited +// on len(pending) == 0. +// +// The test indexes the same Python project twice — once into an in-memory +// *Graph (no shadow swap), once into a ladybug *Store (shadow swap engaged) +// — and asserts both produce the same node ID set and the same module +// attribution output (KindModule nodes for pypi imports). +func TestShadowSwap_ResolverFollowsGraphPointer(t *testing.T) { + dir := t.TempDir() + + // A pyproject.toml so the dep scanner discovers pypi:requests as + // an external dependency, which the resolver then materialises as + // a KindModule node via attributeNonGoModuleImports. + writeFile(t, filepath.Join(dir, "pyproject.toml"), ` +[project] +name = "regression" +dependencies = ["requests>=2.0"] +`) + + // Source file imports the pypi package and a stdlib module. Both + // flow through the same module-attribution pass. + writeFile(t, filepath.Join(dir, "app.py"), ` +import os +import requests + +def fetch(url): + return requests.get(url).text +`) + + newIdx := func(t *testing.T, g graph.Store) *Indexer { + t.Helper() + reg := parser.NewRegistry() + reg.Register(languages.NewPythonExtractor()) + cfg := config.Default().Index + cfg.Workers = 2 + return New(g, reg, cfg, zap.NewNop()) + } + + indexAndCollect := func(t *testing.T, g graph.Store) map[string]string { + t.Helper() + _, err := newIdx(t, g).IndexCtx(context.Background(), dir) + require.NoError(t, err) + ids := map[string]string{} + for _, n := range g.AllNodes() { + ids[n.ID] = string(n.Kind) + } + return ids + } + + memG := graph.New() + memIDs := indexAndCollect(t, memG) + + lbugDir := t.TempDir() + lbugStore, err := store_ladybug.Open(filepath.Join(lbugDir, "store.lbug")) + require.NoError(t, err) + t.Cleanup(func() { _ = lbugStore.Close() }) + + // Sanity: ladybug implements BulkLoader so the shadow swap engages. + _, isBulk := graph.Store(lbugStore).(graph.BulkLoader) + require.True(t, isBulk, "ladybug must implement BulkLoader for this regression to exercise the shadow swap") + + dskIDs := indexAndCollect(t, lbugStore) + + // The KindModule node the resolver materialises for `import requests` + // is the canary — without the fix it never gets written, because + // ResolveAll short-circuits before attributeNonGoModuleImports runs. + require.Contains(t, memIDs, "module::pypi:requests", + "baseline: in-memory backend must materialise the pypi module node") + assert.Contains(t, dskIDs, "module::pypi:requests", + "shadow-swap path must materialise the pypi module node — regression: resolver pointed at empty disk store") + + // Stdlib import gets the same treatment. + require.Contains(t, memIDs, "module::python:stdlib::os", + "baseline: in-memory backend must materialise the python stdlib module node") + assert.Contains(t, dskIDs, "module::python:stdlib::os", + "shadow-swap path must materialise the python stdlib module node") + + // Beyond the canary, both backends must produce the same set of + // node IDs. Any divergence means some resolver pass is still missing + // from one of the two paths. + onlyMem := setDiff(memIDs, dskIDs) + onlyDsk := setDiff(dskIDs, memIDs) + sort.Strings(onlyMem) + sort.Strings(onlyDsk) + assert.Empty(t, onlyMem, "nodes only in memory: %v", onlyMem) + assert.Empty(t, onlyDsk, "nodes only in ladybug: %v", onlyDsk) +} + +func setDiff(a, b map[string]string) []string { + out := []string{} + for id := range a { + if _, ok := b[id]; !ok { + out = append(out, id) + } + } + return out +} diff --git a/internal/indexer/shadow_threshold.go b/internal/indexer/shadow_threshold.go new file mode 100644 index 00000000..ea81a1a8 --- /dev/null +++ b/internal/indexer/shadow_threshold.go @@ -0,0 +1,74 @@ +package indexer + +import ( + "os" + "strconv" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// defaultShadowMaxFileCount caps the file count above which IndexCtx +// refuses to swap idx.graph for an in-memory shadow during cold start. +// Picked empirically from the in-memory store's prior profiling: at +// ~35k C files (drivers/) the in-memory store peaked at 8.6GB RSS; at +// 60k+ the peak is well past 16GB. The shadow path doubles that +// footprint (in-memory + persisted disk copy at the FlushBulk step), +// so the safe ceiling for a 32GB dev machine sits around 50k source +// files. Above that we fall through to the per-call disk path — +// slower per IndexCtx but bounded RAM. +const defaultShadowMaxFileCount = 50000 + +// defaultStreamingChunkSize is the per-chunk file count used by the +// streaming-flush path. At ~30 nodes / ~100 edges per file, 5000 +// files per chunk yields a ~600MB shadow that fits comfortably in +// RAM even on 8GB build agents. +const defaultStreamingChunkSize = 5000 + +// shadowMaxFileCount returns the active file-count ceiling for the +// IndexCtx in-memory shadow swap. GORTEX_SHADOW_MAX_FILES overrides +// the default; setting it to 0 disables the shadow entirely (always +// run against the disk store directly), setting it to a high value +// (e.g. 10_000_000) effectively disables the guard. Non-numeric or +// negative values fall back to the default. +func shadowMaxFileCount() int { + if v := os.Getenv("GORTEX_SHADOW_MAX_FILES"); v != "" { + n, err := strconv.Atoi(v) + if err == nil && n >= 0 { + return n + } + } + return defaultShadowMaxFileCount +} + +// streamingFlushActive reports whether the streaming-flush parse path +// should engage for this IndexCtx. Requirements: +// +// - the backing store implements graph.BulkLoader (ladybug does) +// - the file count is above the shadow-max threshold (small repos +// stay on the all-in-memory shadow path) +// - GORTEX_STREAMING_FLUSH is enabled (off by default — the +// streaming path leaves resolve to the disk-only per-call path, +// so it's only useful when shadow swap can't fit in RAM) +func streamingFlushActive(store graph.Store, fileCount int) bool { + if _, ok := store.(graph.BulkLoader); !ok { + return false + } + if fileCount <= shadowMaxFileCount() { + return false + } + v := os.Getenv("GORTEX_STREAMING_FLUSH") + return v == "1" || strings.EqualFold(v, "true") +} + +// streamingChunkSize returns the per-chunk file count for the +// streaming-flush path. Override via GORTEX_STREAMING_CHUNK_SIZE. +func streamingChunkSize() int { + if v := os.Getenv("GORTEX_STREAMING_CHUNK_SIZE"); v != "" { + n, err := strconv.Atoi(v) + if err == nil && n > 0 { + return n + } + } + return defaultStreamingChunkSize +} diff --git a/internal/indexer/should_index_for_search_test.go b/internal/indexer/should_index_for_search_test.go new file mode 100644 index 00000000..d3702666 --- /dev/null +++ b/internal/indexer/should_index_for_search_test.go @@ -0,0 +1,43 @@ +package indexer + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/config" + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/parser" +) + +// TestShouldIndexForSearch_ExcludesKindLocal is the regression that +// guards the search-index default-filter for KindLocal. The Go +// dataflow walker materialises every intra-function binding as a +// KindLocal node; without the search-side exclusion, common names +// (`err` / `data` / `n` / `i`) would flood every search result with +// thousands of per-function copies. +func TestShouldIndexForSearch_ExcludesKindLocal(t *testing.T) { + idx := New(graph.New(), parser.NewRegistry(), config.Default().Index, zap.NewNop()) + + cases := []struct { + name string + node *graph.Node + want bool + }{ + {"function passes", &graph.Node{ID: "f", Kind: graph.KindFunction, Name: "Foo"}, true}, + {"method passes", &graph.Node{ID: "m", Kind: graph.KindMethod, Name: "Bar"}, true}, + {"type passes", &graph.Node{ID: "t", Kind: graph.KindType, Name: "Baz"}, true}, + {"param passes", &graph.Node{ID: "p", Kind: graph.KindParam, Name: "x"}, true}, + {"closure passes", &graph.Node{ID: "c", Kind: graph.KindClosure, Name: "closure@4"}, true}, + {"file excluded", &graph.Node{ID: "fl", Kind: graph.KindFile, Name: "foo.go"}, false}, + {"import excluded", &graph.Node{ID: "im", Kind: graph.KindImport, Name: "fmt"}, false}, + {"local excluded — the regression", &graph.Node{ID: "l", Kind: graph.KindLocal, Name: "err"}, false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := idx.shouldIndexForSearch(c.node) + assert.Equal(t, c.want, got) + }) + } +} diff --git a/internal/indexer/test_edges.go b/internal/indexer/test_edges.go index e52b813c..77a16beb 100644 --- a/internal/indexer/test_edges.go +++ b/internal/indexer/test_edges.go @@ -28,7 +28,7 @@ import ( // // Returns counts for telemetry: number of nodes marked as test, // number of EdgeTests emitted. -func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted int) { +func markTestSymbolsAndEmitEdges(g graph.Store) (markedTests int, edgesEmitted int) { if g == nil { return 0, 0 } @@ -40,11 +40,15 @@ func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted g.ResolveMutex().Lock() defer g.ResolveMutex().Unlock() - // Pass 1: classify file nodes, then function/method nodes. - testFiles := map[string]bool{} // file node ID → is test file - fileRunners := map[string]string{} // file FilePath → test runner - for _, n := range g.AllNodes() { - if n == nil || n.Kind != graph.KindFile { + // Pass 1: classify file nodes, then function/method nodes. Build + // a local testNodes set keyed by node id so Pass 2 can probe it + // without re-walking the Meta. (Node.Meta mutations on returned + // nodes don't persist back to disk backends, so a later GetNode + // in Pass 2 wouldn't see the is_test flag we set here.) + testFiles := map[string]bool{} // file node ID → is test file + fileRunners := map[string]string{} // file FilePath → test runner + for n := range g.NodesByKind(graph.KindFile) { + if n == nil { continue } if IsTestFile(n.FilePath) { @@ -60,22 +64,10 @@ func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted } } - for _, n := range g.AllNodes() { - if n == nil { - continue - } - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } - // Test-file membership is the authoritative signal. No standard - // runner (go test, pytest, ...) picks up a test by name outside - // a test file, so a production function that merely starts with - // "Test"/"Benchmark" (e.g. TestRole) must not be flagged. The - // name convention only refines the *role* — benchmark / fuzz / - // example — for symbols already inside a test file; anything - // else there is test support code: role "test". + testNodes := map[string]bool{} + stampTestSymbol := func(n *graph.Node) { if !testFiles[n.FilePath] { - continue + return } role := TestRole(n.Name, n.Language) if role == "" { @@ -89,31 +81,49 @@ func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted if runner := fileRunners[n.FilePath]; runner != "" { n.Meta["test_runner"] = runner } + testNodes[n.ID] = true markedTests++ } + for n := range g.NodesByKind(graph.KindFunction) { + if n != nil { + // Test-file membership is the authoritative signal. No + // standard runner (go test, pytest, ...) picks up a test + // by name outside a test file, so a production function + // that merely starts with "Test"/"Benchmark" (e.g. + // TestRole) must not be flagged. The name convention only + // refines the *role* — benchmark / fuzz / example — for + // symbols already inside a test file; anything else there + // is test support code: role "test". + stampTestSymbol(n) + } + } + for n := range g.NodesByKind(graph.KindMethod) { + if n != nil { + stampTestSymbol(n) + } + } // Pass 2: walk EdgeCalls; for each (test, non-test) pair, emit a // parallel EdgeTests. We dedupe per (From, To) because a single - // test can call the same subject multiple times. + // test can call the same subject multiple times. The testNodes set + // built in Pass 1 is the authoritative source — no inline GetNode + // is needed because the From / To kind filter is already enforced + // by "From must be a test symbol" (only function/method ids land + // in testNodes). seen := map[string]bool{} type pair struct{ from, to string } var pending []struct { pair pair edge *graph.Edge } - for _, e := range g.AllEdges() { - if e == nil || e.Kind != graph.EdgeCalls { - continue - } - fromNode := g.GetNode(e.From) - toNode := g.GetNode(e.To) - if fromNode == nil || toNode == nil { + for e := range g.EdgesByKind(graph.EdgeCalls) { + if e == nil { continue } - if !isTestNode(fromNode) { + if !testNodes[e.From] { continue } - if isTestNode(toNode) { + if testNodes[e.To] { continue // test → test calls are infrastructure, not subject coverage } key := e.From + "\x00" + e.To @@ -141,14 +151,6 @@ func markTestSymbolsAndEmitEdges(g *graph.Graph) (markedTests int, edgesEmitted return markedTests, edgesEmitted } -func isTestNode(n *graph.Node) bool { - if n == nil || n.Meta == nil { - return false - } - v, _ := n.Meta["is_test"].(bool) - return v -} - // detectTestRunnerForFile resolves the runner identifier for a test file // node by consulting three signals, in priority order: // @@ -173,7 +175,7 @@ func isTestNode(n *graph.Node) bool { // // Returns "" when no signal applies; the caller leaves test_runner // unset rather than guessing. -func detectTestRunnerForFile(g *graph.Graph, fileNode *graph.Node) string { +func detectTestRunnerForFile(g graph.Store, fileNode *graph.Node) string { if fileNode == nil { return "" } @@ -215,7 +217,7 @@ func detectTestRunnerForFile(g *graph.Graph, fileNode *graph.Node) string { // (mirrors DetectJSTSTestRunner so files compiled by a non-JS / TS // extractor still classify correctly), Python (pytest / unittest), // and Ruby (rspec / minitest). -func detectRunnerFromImportEdges(g *graph.Graph, fileNode *graph.Node) string { +func detectRunnerFromImportEdges(g graph.Store, fileNode *graph.Node) string { const prefix = "unresolved::import::" for _, e := range g.GetOutEdges(fileNode.ID) { if e == nil || e.Kind != graph.EdgeImports { diff --git a/internal/indexer/unicode_path_test.go b/internal/indexer/unicode_path_test.go index 1973b868..81ffefc7 100644 --- a/internal/indexer/unicode_path_test.go +++ b/internal/indexer/unicode_path_test.go @@ -47,7 +47,7 @@ func goSrc(funcName string) string { // fileKindNodes returns only the file-kind nodes the graph holds for // the given key — used to detect a duplicate file-node leaking after a // re-index. -func fileKindNodes(g *graph.Graph, key string) []*graph.Node { +func fileKindNodes(g graph.Store, key string) []*graph.Node { var out []*graph.Node for _, n := range g.GetFileNodes(key) { if n.Kind == graph.KindFile { diff --git a/internal/mcp/combo_apply.go b/internal/mcp/combo_apply.go index c90cdf32..3dccc3e1 100644 --- a/internal/mcp/combo_apply.go +++ b/internal/mcp/combo_apply.go @@ -1,16 +1,18 @@ package mcp import ( + "time" + "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/search/rerank" ) -// applyRerankBoosts is the I13 entry point that runs the full -// 11-signal rerank.Pipeline over the candidate set with the -// session-aware Context wired in (locality, combo, frecency, -// feedback, churn, community). The structural signals (BM25 rank, -// fan-in / fan-out, MinHash similarity, signature match, recency) -// are computed off the graph + the candidate's current index. +// applyRerankBoostsTimed is the I13 entry point that runs the full +// 11-signal rerank.Pipeline over the candidate set with the session- +// aware Context wired in (locality, combo, frecency, feedback, churn, +// community). Structural signals (BM25 rank, fan-in / fan-out, +// MinHash similarity, signature match, recency) are computed off the +// graph + the candidate's current index. // // rerankCtx is the per-request Context built by the server; pass nil // and the pipeline falls back to a structural-only rerank using just @@ -18,13 +20,19 @@ import ( // candidate slice — when non-nil it carries per-signal contributions // out to the caller for debug / winnow surfacing; pass nil if the // caller only wants the sorted nodes. -func applyRerankBoosts(s *Server, nodes []*graph.Node, query string, rerankCtx *rerank.Context, lastResults *[]*rerank.Candidate) []*graph.Node { +// +// Returns the rerank's prepare and signals phase durations separately +// so the search_symbols handler's per-phase Debug log can attribute +// time honestly between the batched edge fetch (prepare) and the +// in-process scoring loop (signals). Zero durations when there's no +// work to do. +func applyRerankBoostsTimed(s *Server, nodes []*graph.Node, query string, rerankCtx *rerank.Context, lastResults *[]*rerank.Candidate) (result []*graph.Node, prepare time.Duration, signals time.Duration) { if len(nodes) < 2 || s == nil || s.engine == nil { - return nodes + return nodes, 0, 0 } pipeline := s.engine.Rerank() if pipeline == nil { - return nodes + return nodes, 0, 0 } cands := make([]*rerank.Candidate, 0, len(nodes)) for i, n := range nodes { @@ -38,15 +46,27 @@ func applyRerankBoosts(s *Server, nodes []*graph.Node, query string, rerankCtx * if rerankCtx.Graph == nil { rerankCtx.Graph = s.graph } + + // Phase 1: prepare — the batched in/out edge fetch + scratch fields. + // Exposed via the explicit Prepare call; Pipeline.Rerank detects the + // already-prepared slice and skips the duplicate work. + prepStart := time.Now() + rerankCtx.Prepare(cands) + prepare = time.Since(prepStart) + + // Phase 2: signals — the in-process scoring loop + final sort. + sigStart := time.Now() pipeline.Rerank(query, cands, rerankCtx) - out := make([]*graph.Node, 0, len(cands)) + signals = time.Since(sigStart) + + result = make([]*graph.Node, 0, len(cands)) for _, c := range cands { - out = append(out, c.Node) + result = append(result, c.Node) } if lastResults != nil { *lastResults = cands } - return out + return result, prepare, signals } // recordLastSearchFromNodes stores the query + top-limit IDs on the session diff --git a/internal/mcp/etag.go b/internal/mcp/etag.go index 055609bb..ab5d95e6 100644 --- a/internal/mcp/etag.go +++ b/internal/mcp/etag.go @@ -2,23 +2,75 @@ package mcp import ( "crypto/sha256" + "encoding/binary" "encoding/hex" "encoding/json" "sort" "strconv" "github.com/mark3labs/mcp-go/mcp" + "github.com/zzet/gortex/internal/query" ) // computeETag produces a short content hash suitable for conditional fetch. -// The hash is computed from the JSON serialization of the data. +// Streams the JSON serialization straight into the hash so we don't +// allocate the full marshaled byte slice (significant on large +// payloads — a 500-symbol SubGraph used to allocate ~100 KiB just to +// feed sha256). func computeETag(data any) string { - b, err := json.Marshal(data) - if err != nil { + h := sha256.New() + if err := json.NewEncoder(h).Encode(data); err != nil { return "" } - h := sha256.Sum256(b) - return hex.EncodeToString(h[:8]) // 16 hex chars — collision-safe for session use + sum := h.Sum(nil) + return hex.EncodeToString(sum[:8]) // 16 hex chars — collision-safe for session use +} + +// etagSubGraph is a fast structural ETag specialised for query.SubGraph +// payloads (the get_file_summary / get_editing_context hot path). +// Instead of going through json.Marshal on every node + edge + Meta map +// (which is the dominant cost for a 500-symbol file), it hashes a +// stable structural fingerprint: each node's id + line range, each +// edge's (from, to, kind), and the truncation / total counts. That +// keeps the invariant the callers depend on — "the etag changes when +// the file's listing changes" — without paying for the body of every +// Meta map on every call. +func etagSubGraph(sg *query.SubGraph) string { + if sg == nil { + return "" + } + h := sha256.New() + var buf [16]byte + for _, n := range sg.Nodes { + if n == nil { + continue + } + h.Write([]byte(n.ID)) + binary.BigEndian.PutUint32(buf[0:4], uint32(n.StartLine)) + binary.BigEndian.PutUint32(buf[4:8], uint32(n.EndLine)) + h.Write(buf[:8]) + h.Write([]byte{0}) + } + h.Write([]byte{1}) + for _, e := range sg.Edges { + if e == nil { + continue + } + h.Write([]byte(e.From)) + h.Write([]byte{31}) + h.Write([]byte(e.To)) + h.Write([]byte{31}) + h.Write([]byte(e.Kind)) + h.Write([]byte{0}) + } + binary.BigEndian.PutUint64(buf[0:8], uint64(sg.TotalNodes)) + binary.BigEndian.PutUint64(buf[8:16], uint64(sg.TotalEdges)) + h.Write(buf[:16]) + if sg.Truncated { + h.Write([]byte{1}) + } + sum := h.Sum(nil) + return hex.EncodeToString(sum[:8]) } // notModifiedResult returns a minimal "not modified" response with the matching etag. diff --git a/internal/mcp/gcx.go b/internal/mcp/gcx.go index c7f96aed..3c7db20a 100644 --- a/internal/mcp/gcx.go +++ b/internal/mcp/gcx.go @@ -497,13 +497,20 @@ func encodeSubGraph(tool string, sg *query.SubGraph) ([]byte, error) { } // encodeFileSummary emits one row per symbol in a file plus a trailing -// edge-distribution comment. +// edge-distribution comment. Pulls the edge total from sg.TotalEdges +// rather than len(sg.Edges) so the count-only handler path (which +// leaves the Edge slice nil to avoid materialising every adjacent +// edge over cgo) still reports the right number. func encodeFileSummary(sg *query.SubGraph, etag string) ([]byte, error) { var buf bytes.Buffer + totalEdges := sg.TotalEdges + if totalEdges == 0 { + totalEdges = len(sg.Edges) + } enc := newGCX(&buf, "get_file_summary", []string{"id", "kind", "name", "line", "sig"}, "total_nodes", fmt.Sprintf("%d", sg.TotalNodes), - "total_edges", fmt.Sprintf("%d", len(sg.Edges)), + "total_edges", fmt.Sprintf("%d", totalEdges), "truncated", boolString(sg.Truncated), "etag", etag, ) diff --git a/internal/mcp/notes.go b/internal/mcp/notes.go index f2bb5db4..e1b26586 100644 --- a/internal/mcp/notes.go +++ b/internal/mcp/notes.go @@ -602,7 +602,7 @@ func defaultAutoLinkOptions() autoLinkOptions { // // The function never panics — a nil graph or empty body just // returns no links. Results are deduplicated and capped. -func autoLinkBody(body string, g *graph.Graph, workspaceID string, opts autoLinkOptions) []string { +func autoLinkBody(body string, g graph.Store, workspaceID string, opts autoLinkOptions) []string { if g == nil || body == "" { return nil } @@ -628,9 +628,13 @@ func autoLinkBody(body string, g *graph.Graph, workspaceID string, opts autoLink } // (1) Direct ID matches — anything containing "::" is treated as - // a candidate ID. The regexp-free scan keeps this hot path cheap. - for _, candidate := range extractIDCandidates(body) { - node := g.GetNode(candidate) + // a candidate ID. Batch the lookup so even auto-linkers with many + // candidates on long notes only pay one backend round-trip on + // disk-backed stores. + candidates := extractIDCandidates(body) + candidateNodes := g.GetNodesByIDs(candidates) + for _, candidate := range candidates { + node := candidateNodes[candidate] if node == nil { continue } diff --git a/internal/mcp/overlay.go b/internal/mcp/overlay.go index db7c096b..3ce1d35b 100644 --- a/internal/mcp/overlay.go +++ b/internal/mcp/overlay.go @@ -11,6 +11,7 @@ import ( "github.com/mark3labs/mcp-go/mcp" mcpserver "github.com/mark3labs/mcp-go/server" + "go.uber.org/zap" "github.com/zzet/gortex/internal/daemon" ) @@ -73,7 +74,32 @@ func (s *Server) wrapToolHandler(h mcpserver.ToolHandlerFunc) mcpserver.ToolHand // Prompt-injection screening sits closest to the handler so it // sees the real arguments and the real result (see sanitize.go). h = s.sanitizeToolHandler(h) - return func(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + return func(ctx context.Context, req mcp.CallToolRequest) (res *mcp.CallToolResult, retErr error) { + // Last-resort panic firewall around EVERY tool handler. A Go + // panic in any handler (e.g. panicOnFatal when the ladybug + // store surfaces a fatal engine error such as "prepare: mutex + // lock failed: Invalid argument") would otherwise unwind past + // the mcp-go server loop and crash the whole daemon — dropping + // every session's MCP transport, not just the offending call. + // Convert it to a structured tool error so the panicking tool + // fails in isolation and the daemon survives. (A CGo-level + // *fatal error* like "semasleep on Darwin signal stack" is not + // a Go panic and cannot be recovered here — those must be + // fixed at the source by avoiding concurrent liblbug access.) + // This supersedes the per-handler recover that get_file_summary + // carried; every tool now gets the same protection. + defer func() { + if r := recover(); r != nil { + if s.logger != nil { + s.logger.Error("tool handler panic recovered", + zap.String("tool", req.Params.Name), + zap.Any("panic", r), + zap.Stack("stack")) + } + res = mcp.NewToolResultError(fmt.Sprintf("tool %q internal error: %v", req.Params.Name, r)) + retErr = nil + } + }() // Tolerate hallucinated / mistyped parameter names before the // handler reads arguments (e.g. "symbol" accepted as "id"). s.reconcileToolParams(&req) diff --git a/internal/mcp/overlay_view.go b/internal/mcp/overlay_view.go index 19402b8a..42f7da9e 100644 --- a/internal/mcp/overlay_view.go +++ b/internal/mcp/overlay_view.go @@ -447,7 +447,7 @@ func (s *Server) resolveOverlayEdges(layer *graph.OverlayLayer) { // in-place via AddEdge / removal pattern (layer is meant // to be append-only post-construction; the resolver pass runs // before the layer is handed to the View, so we still own it). - for from, edges := range layer.OutEdgesByFromAll() { + for _, edges := range layer.OutEdgesByFromAll() { for _, e := range edges { if !strings.HasPrefix(e.To, unresolvedPrefix) { continue @@ -464,13 +464,12 @@ func (s *Server) resolveOverlayEdges(layer *graph.OverlayLayer) { if target == "" { continue } - resolved := s.lookupOverlayTarget(layer, target, from) + resolved := s.lookupOverlayTarget(layer, target) if resolved == "" { continue } e.To = resolved } - _ = from } // Rebuild the layer's inEdges index now that targets may have // changed. The layer exposes a Rebuild helper so we don't have @@ -482,7 +481,7 @@ func (s *Server) resolveOverlayEdges(layer *graph.OverlayLayer) { // short name in (layer ∪ base). Returns the node ID on a unique // match, empty string otherwise. Tied matches return empty so the // edge stays as a placeholder rather than picking the wrong target. -func (s *Server) lookupOverlayTarget(layer *graph.OverlayLayer, name, _fromID string) string { +func (s *Server) lookupOverlayTarget(layer *graph.OverlayLayer, name string) string { overlay := layer.NodesByName(name) if len(overlay) == 1 { return overlay[0].ID diff --git a/internal/mcp/resources_analyzer.go b/internal/mcp/resources_analyzer.go index 89a12bd8..d6d189ee 100644 --- a/internal/mcp/resources_analyzer.go +++ b/internal/mcp/resources_analyzer.go @@ -113,7 +113,7 @@ func (s *Server) handleResourceReport(ctx context.Context, req mcp.ReadResourceR var hotspotCount int if len(scoped) >= 10 { - for _, h := range analysis.FindHotspots(s.graph, s.getCommunities(), 0) { + for _, h := range s.getHotspots() { if inScope == nil || inScope[h.ID] { hotspotCount++ } @@ -173,7 +173,7 @@ func (s *Server) handleResourceGodNodes(_ context.Context, req mcp.ReadResourceR }) } - entries := analysis.FindHotspots(s.graph, s.getCommunities(), 0) + entries := s.getHotspots() totalCount := len(entries) truncated := false if len(entries) > 20 { @@ -205,7 +205,7 @@ func (s *Server) handleResourceSurprises(_ context.Context, req mcp.ReadResource var topHubs []analysis.HotspotEntry if s.graph.NodeCount() >= 10 { - hot := analysis.FindHotspots(s.graph, communities, 0) + hot := s.getHotspots() // Top hubs == hotspots with at least one community crossing. for _, h := range hot { if h.CommunityCrossings > 0 { diff --git a/internal/mcp/search_equivalence.go b/internal/mcp/search_equivalence.go index f7f97f80..2b367b2d 100644 --- a/internal/mcp/search_equivalence.go +++ b/internal/mcp/search_equivalence.go @@ -54,6 +54,22 @@ func (m expandMode) allowsEquivalenceExpansion() bool { return m == expandBoth || m == expandEquivalenceOnly } +// isIdentifierClass reports whether the query class is one of the +// identifier-shape classes (symbol / path / signature) — the classes +// where the rerank's classWeightTable already proves the semantic +// channel contributes near-zero useful signal (0.65 / 0.45 / 0.80 vs +// the baseline 1.00 for concept). The handler routes these queries +// through the identifier-shape fast path: expansion off, vector +// channel off, fetch slack tightened. +func isIdentifierClass(c rerank.QueryClass) bool { + switch c { + case rerank.QueryClassSymbol, rerank.QueryClassPath, rerank.QueryClassSignature: + return true + default: + return false + } +} + // expandEquivalenceClasses returns the deterministic expansion terms // for a query: for every query token, its curated-equivalence-table // siblings and its per-repo auto-mined concept siblings. The result diff --git a/internal/mcp/server.go b/internal/mcp/server.go index ee4ac548..5026a5c1 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -85,7 +85,7 @@ func (sh *symbolHistory) All() map[string][]SymbolModification { type Server struct { mcpServer *server.MCPServer engine *query.Engine - graph *graph.Graph + graph graph.Store indexer *indexer.Indexer watcher watcherHistory multiIndexer *indexer.MultiIndexer @@ -118,6 +118,21 @@ type Server struct { // of the whole graph. nil until the first clusters request; // guarded by analysisMu. leidenCache *analysis.LeidenPartitionCache + // communitiesToken snapshots the graph identity that backed + // s.communities — (NodeCount, EdgeCount, EdgeIdentityRevisions). + // handleAnalyzeClusters reads this before calling the incremental + // detector: if the token still matches the live graph, the cached + // communities are reused without scanning AllNodes / AllEdges to + // fingerprint packages. On Ladybug the fingerprint scan alone is + // ~140s; the cache check is three scalar reads. + communitiesToken communityCacheToken + // hotspots is the default-threshold (mean + 2*stddev) hotspot + // ranking. FindHotspots' inner ComputeBetweenness pass dominates + // the wall clock of get_repo_outline / get_architecture / + // gortex_wakeup / the analyze(hotspots) resource — caching it + // once per RunAnalysis turn turns repeat calls into a map lookup. + // Rebuilt each RunAnalysis pass; guarded by analysisMu. + hotspots []analysis.HotspotEntry analysisMu sync.RWMutex // cochange caches the git-history co-change graph. cochangeByFile @@ -446,10 +461,7 @@ type tokenStats struct { // returned and fullFile are token counts (cl100k_base via internal/tokens). func (ts *tokenStats) record(node *graph.Node, tool string, returned, fullFile int64) { ts.mu.Lock() - saved := fullFile - returned - if saved < 0 { - saved = 0 - } + saved := max(fullFile-returned, 0) ts.tokensSaved += saved ts.tokensReturned += returned ts.callCount++ @@ -724,7 +736,7 @@ const serverInstructions = `Gortex is a code-intelligence graph server — it in - Pass format:"gcx" to list-shaped tools for a compact, round-trippable wire format (~27% fewer tokens).` // NewServer creates an MCP server with all Gortex tools registered. -func NewServer(engine *query.Engine, g *graph.Graph, idx *indexer.Indexer, watcher *indexer.Watcher, logger *zap.Logger, guardRules []config.GuardRule, opts ...MultiRepoOptions) *Server { +func NewServer(engine *query.Engine, g graph.Store, idx *indexer.Indexer, watcher *indexer.Watcher, logger *zap.Logger, guardRules []config.GuardRule, opts ...MultiRepoOptions) *Server { s := &Server{ engine: engine, graph: g, @@ -836,6 +848,8 @@ func NewServer(engine *query.Engine, g *graph.Graph, idx *indexer.Indexer, watch s.registerGenerateSkillTool() s.registerInspectionsTools() s.registerChurnRateTool() + s.registerEnrichChurnTool() + s.registerEnrichReleasesTool() s.registerCoChangeTool() s.registerArtifactTools() s.registerCouplingMetricsTool() @@ -1146,6 +1160,59 @@ func (s *Server) scopedNodes(ctx context.Context) []*graph.Node { return out } +// scopedNodesByKinds is the kind-pushdown sibling of scopedNodes for +// handlers that only need a specific kind set. When the backend +// implements graph.NodesByKindsScanner the kind predicate runs server- +// side (one Cypher MATCH (n:Node) WHERE n.kind IN $kinds) instead of +// the legacy AllNodes()-then-Go-side filter. The metadata analyzers +// (todos, stale_code, stale_flags, ownership, coverage_gaps, +// coverage_summary, cgo_users, wasm_users, orphan_tables, +// unreferenced_tables) each keep one or two kinds out of the whole +// node table; pushing that filter is the entire win. +// +// Workspace-bound sessions still narrow Go-side: the capability does +// not know about ScopeAllows, and adding workspace_id to every analyze +// query would tie the capability to the session-scope concept. The +// secondary filter is cheap because the kind pushdown already shrank +// the row count by 1-2 orders of magnitude. +// +// Empty kinds returns nil — defensive against caller bugs that would +// otherwise drop into the full-AllNodes fallback path. +func (s *Server) scopedNodesByKinds(ctx context.Context, kinds []graph.NodeKind) []*graph.Node { + if len(kinds) == 0 { + return nil + } + var nodes []*graph.Node + if scan, ok := s.graph.(graph.NodesByKindsScanner); ok { + nodes = scan.NodesByKinds(kinds) + } else { + // Fallback: same behaviour as scopedNodes, kind-filtered Go-side. + all := s.graph.AllNodes() + allowed := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + allowed[k] = struct{}{} + } + nodes = make([]*graph.Node, 0, len(all)) + for _, n := range all { + if _, ok := allowed[n.Kind]; ok { + nodes = append(nodes, n) + } + } + } + sessWS, _, bound := s.sessionScope(ctx) + if !bound { + return nodes + } + opts := query.QueryOptions{WorkspaceID: sessWS} + out := make([]*graph.Node, 0, len(nodes)) + for _, n := range nodes { + if opts.ScopeAllows(n) { + out = append(out, n) + } + } + return out +} + // scopedNodeSlice filters an existing node slice to the session's // workspace. Convenience for handlers that already hold a node list // (engine list methods that don't take QueryOptions). @@ -1395,6 +1462,25 @@ func (s *Server) ResolveToolScope(toolName string, repo any) (*ScopedRepos, *mcp return ResolveScopedRepos(scope, s.bind, repo) } +// communityCacheToken is the per-graph identity tuple +// handleAnalyzeClusters checks before re-running the incremental +// detector. EdgeIdentity moves on any structural mutation; NodeCount +// and EdgeCount cover pure additions / removals that leave the +// identity counter alone. A zero token is "never populated". +type communityCacheToken struct { + edgeIdentity int + nodeCount int + edgeCount int +} + +func (s *Server) currentCommunityToken() communityCacheToken { + return communityCacheToken{ + edgeIdentity: s.graph.EdgeIdentityRevisions(), + nodeCount: s.graph.NodeCount(), + edgeCount: s.graph.EdgeCount(), + } +} + // RunAnalysis performs community detection and process discovery on // the current graph, then pushes a `notifications/resources/updated` // for every bootstrap resource so subscribed clients can refresh @@ -1409,6 +1495,7 @@ func (s *Server) RunAnalysis() { communities, cache, _ := analysis.DetectCommunitiesLeidenIncremental(s.graph, s.leidenCache) s.communities = communities s.leidenCache = cache + s.communitiesToken = s.currentCommunityToken() s.processes = analysis.DiscoverProcesses(s.graph) s.pageRank = analysis.ComputePageRank(s.graph) // Auto-concept vocabulary: mine domain phrases from symbol names @@ -1418,6 +1505,10 @@ func (s *Server) RunAnalysis() { // HITS authority/hub scores -- fed into the search rerank as an // authority signal that complements raw fan-in. s.hits = analysis.ComputeHITS(s.graph) + // Default-threshold hotspot ranking — cached because FindHotspots + // triggers ComputeBetweenness which is the shared wall-clock + // floor for outline / architecture / wakeup / the resource view. + s.hotspots = analysis.FindHotspots(s.graph, communities, 0) s.analysisMu.Unlock() // Bootstrap-resource payloads (graph_stats, index_health, etc.) @@ -1444,11 +1535,59 @@ func (s *Server) getCommunities() *analysis.CommunityResult { // packages. The cache it returns is stored back under analysisMu so // the next clusters request can build on it. The accompanying stats // describe whether the fast path or a full recompute ran. +// +// Short-circuits when the cached communities are still valid for the +// live graph: the (NodeCount, EdgeCount, EdgeIdentityRevisions) token +// captured by the last detector run is compared against the current +// graph identity in three scalar reads. On Ladybug a match skips the +// AllNodes / AllEdges fingerprint scan that otherwise dominates the +// call (~140s on a fresh daemon) and serves the existing partition +// straight from the cache. The reported stats describe a no-op +// incremental run (no changed packages, no repartitioned nodes) so +// callers see the cache hit on the wire. func (s *Server) incrementalCommunities() (*analysis.CommunityResult, analysis.IncrementalCommunityStats) { s.analysisMu.Lock() defer s.analysisMu.Unlock() + cur := s.currentCommunityToken() + if s.communities != nil && s.communitiesToken == cur { + stats := analysis.IncrementalCommunityStats{ + Incremental: true, + } + if s.leidenCache != nil { + stats.TotalPackages = len(s.leidenCache.PackageFingerprints()) + } + if s.logger != nil { + s.logger.Debug("incrementalCommunities cache hit", + zap.Int("nodes", cur.nodeCount), + zap.Int("edges", cur.edgeCount), + zap.Int("edge_identity_rev", cur.edgeIdentity)) + } + return s.communities, stats + } + if s.logger != nil { + // INFO-level on the miss path so a regression that re-introduces + // a steady-state cache miss is visible without flipping the + // daemon to debug. The full token diff is here precisely to + // catch background-mutation regressions (some pass keeps drifting + // the edge count under the cache and the Leiden walk runs every + // call). A real first-call miss is a single line in the log. + s.logger.Info("incrementalCommunities cache miss", + zap.Bool("communities_nil", s.communities == nil), + zap.Int("cached_nodes", s.communitiesToken.nodeCount), + zap.Int("cur_nodes", cur.nodeCount), + zap.Int("cached_edges", s.communitiesToken.edgeCount), + zap.Int("cur_edges", cur.edgeCount), + zap.Int("cached_edge_rev", s.communitiesToken.edgeIdentity), + zap.Int("cur_edge_rev", cur.edgeIdentity)) + } result, cache, stats := analysis.DetectCommunitiesLeidenIncremental(s.graph, s.leidenCache) + s.communities = result s.leidenCache = cache + // Capture the token AFTER the algo finishes — if the graph mutated + // during the (potentially slow) detector run, the token reflects + // the state the result was actually computed against, and the next + // call's token comparison stays meaningful. + s.communitiesToken = s.currentCommunityToken() return result, stats } @@ -1482,6 +1621,17 @@ func (s *Server) getHITS() *analysis.HITSResult { return s.hits } +// getHotspots returns the default-threshold hotspot ranking computed +// by the most recent RunAnalysis pass. Nil/empty until the first +// pass; callers use the live FindHotspots(threshold) path when they +// need a non-default threshold. Returned slice is shared and must +// not be mutated by the caller. +func (s *Server) getHotspots() []analysis.HotspotEntry { + s.analysisMu.RLock() + defer s.analysisMu.RUnlock() + return s.hotspots +} + // SetArchitecture installs the declarative architecture-rules DSL so // check_guards evaluates layered violations alongside the flat guard // rules. Called by the server / daemon entrypoint right after diff --git a/internal/mcp/server_test.go b/internal/mcp/server_test.go index 186acb15..3b36c0eb 100644 --- a/internal/mcp/server_test.go +++ b/internal/mcp/server_test.go @@ -26,16 +26,17 @@ import ( func setupTestServer(t *testing.T) (*Server, string) { t.Helper() dir := t.TempDir() + // Fixture deliberately has zero external imports so the + // resolver's attributeGoExternalCalls pass doesn't auto-add a + // `module::go:*` node — that lets the external-calls analyser + // tests assert on an exact set of manually-added modules. _ = os.WriteFile(filepath.Join(dir, "main.go"), []byte(`package main -import "fmt" - type Config struct { Port int } func main() { - fmt.Println("hello") helper() } diff --git a/internal/mcp/streamable/transport.go b/internal/mcp/streamable/transport.go index 918b542e..2122c24d 100644 --- a/internal/mcp/streamable/transport.go +++ b/internal/mcp/streamable/transport.go @@ -438,14 +438,20 @@ func (t *Transport) localDispatch(r *http.Request, state SessionState, frame []b // roster, and proxy the call there. A return value of (_, _, false) // means "fall through to local dispatch". func (t *Transport) tryRouteToolCall(r *http.Request, state SessionState, frame []byte) ([]byte, int, bool) { + // Decode the JSON-RPC envelope keeping the inbound `arguments` + // object as raw bytes — we MUST forward every caller-supplied key + // (e.g. `query`, `limit`, etc.) to the downstream executor, not + // just the workspace+cwd peek fields. A previous version + // re-marshalled only the typed peek struct, which silently + // stripped every other argument and made every router-routed tool + // call see an empty args map ("X is required" failures). Mirror + // the daemon dispatcher's tryProxyToolCall: peek workspace+cwd + // without dropping the rest. var envelope struct { ID json.RawMessage `json:"id"` Params struct { - Name string `json:"name"` - Arguments struct { - Workspace string `json:"workspace"` - Cwd string `json:"cwd"` - } `json:"arguments"` + Name string `json:"name"` + Arguments json.RawMessage `json:"arguments"` } `json:"params"` } if err := json.Unmarshal(frame, &envelope); err != nil { @@ -454,23 +460,39 @@ func (t *Transport) tryRouteToolCall(r *http.Request, state SessionState, frame if envelope.Params.Name == "" { return nil, 0, false } - scope := envelope.Params.Arguments.Workspace + // Second decode is only used to peek the routing hints. + var peek struct { + Workspace string `json:"workspace"` + Cwd string `json:"cwd"` + } + if len(envelope.Params.Arguments) > 0 { + _ = json.Unmarshal(envelope.Params.Arguments, &peek) + } + scope := peek.Workspace if scope == "" { scope = state.Workspace } - cwd := envelope.Params.Arguments.Cwd + cwd := peek.Cwd if cwd == "" { cwd = state.CWD } if cwd == "" { cwd = strings.TrimSpace(r.Header.Get("X-Gortex-Cwd")) } - argsJSON, err := json.Marshal(envelope.Params.Arguments) + // Wrap the original raw arguments under `{"arguments": {...}}` so + // the local executor's nested-arguments unmarshal path (see + // cmd/gortex/server_router.go newLocalToolExecutor) finds them. + // This matches cmd/gortex/daemon_mcp.go:tryProxyToolCall exactly. + rawArgs := envelope.Params.Arguments + if len(rawArgs) == 0 { + rawArgs = json.RawMessage(`{}`) + } + body, err := json.Marshal(map[string]json.RawMessage{"arguments": rawArgs}) if err != nil { return nil, 0, false } out, status, rerr := t.router.RouteToolCall(r.Context(), - envelope.Params.Name, argsJSON, daemon.RouteContext{ + envelope.Params.Name, body, daemon.RouteContext{ ScopeOverride: scope, Cwd: cwd, }) diff --git a/internal/mcp/streamable/transport_test.go b/internal/mcp/streamable/transport_test.go index c483645f..b12d5bc7 100644 --- a/internal/mcp/streamable/transport_test.go +++ b/internal/mcp/streamable/transport_test.go @@ -16,6 +16,8 @@ import ( "github.com/mark3labs/mcp-go/mcp" mcpserver "github.com/mark3labs/mcp-go/server" + "github.com/zzet/gortex/internal/daemon" + "go.uber.org/zap" ) // newTestMCPServer mints an mcp-go server pre-loaded with an `echo` @@ -713,6 +715,91 @@ func TestMCPServerDispatcherNilFailsCleanly(t *testing.T) { } } +// TestRouterPreservesFullArguments pins the regression fix: when the +// streamable transport routes a tools/call through a daemon.Router +// whose local executor unmarshals to a map, the executor must see the +// caller's ORIGINAL arguments — not a stripped {workspace,cwd} peek. +// +// A previous version of tryRouteToolCall re-marshalled only the typed +// peek struct (workspace+cwd) and dropped every other key on the +// floor, breaking every real MCP usage with "X is required" because +// the args map was effectively empty. This test fails on that bug +// and passes on the fix. +func TestRouterPreservesFullArguments(t *testing.T) { + var seenBody []byte + router := daemon.NewRouter(daemon.RouterConfig{ + LocalExecute: func(_ context.Context, _ string, body []byte) ([]byte, int, error) { + seenBody = append([]byte(nil), body...) + // Mirror the production local executor: unwrap + // `{"arguments": {...}}` then assert every caller key + // survived the round-trip. + var nested struct { + Arguments map[string]any `json:"arguments"` + } + if err := json.Unmarshal(body, &nested); err != nil { + return nil, 500, err + } + if nested.Arguments == nil { + return []byte(`{"error":"no arguments"}`), 200, nil + } + out, _ := json.Marshal(map[string]any{"ok": true, "args": nested.Arguments}) + return out, 200, nil + }, + Logger: zap.NewNop(), + }) + + store := NewMemoryStore(time.Minute) + defer store.Close() + tr := New(Config{ + Dispatcher: MCPServerDispatcher{Server: newTestMCPServer()}, + Store: store, + Router: router, + }) + + // Seed an initialized session so the transport accepts the call. + sid, err := store.Create(SessionState{Initialized: true, ClientName: "test"}) + if err != nil { + t.Fatalf("seed Create: %v", err) + } + + callBody := jsonRPC(7, "tools/call", map[string]any{ + "name": "search_symbols", + "arguments": map[string]any{ + "query": "NewServer", + "limit": 10, + }, + }) + rec := doPOST(t, tr, callBody, map[string]string{HeaderSessionID: sid}) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d, body=%s", rec.Code, rec.Body.String()) + } + + // 1) The local executor must have seen the original args. + var nested struct { + Arguments map[string]any `json:"arguments"` + } + if err := json.Unmarshal(seenBody, &nested); err != nil { + t.Fatalf("local executor body not JSON: %v\nbody=%s", err, string(seenBody)) + } + if nested.Arguments == nil { + t.Fatalf("local executor saw nil arguments — args were stripped. body=%s", string(seenBody)) + } + if got, _ := nested.Arguments["query"].(string); got != "NewServer" { + t.Errorf("query = %q, want %q (args stripped before reaching executor). body=%s", + got, "NewServer", string(seenBody)) + } + // JSON numbers decode to float64 in interface{}; compare as such. + if got, _ := nested.Arguments["limit"].(float64); got != 10 { + t.Errorf("limit = %v, want 10 (args stripped before reaching executor). body=%s", + got, string(seenBody)) + } + + // 2) The wrapped tool result must reach the client too. + if !strings.Contains(rec.Body.String(), "NewServer") { + t.Errorf("client response missing forwarded args: %s", rec.Body.String()) + } +} + // TestHTTPRoundTripEndToEnd — fires the transport behind an // httptest.Server so the body actually flows through net/http; covers // the boundary the per-test recorder can't. diff --git a/internal/mcp/tools_analyze_annotation_users_test.go b/internal/mcp/tools_analyze_annotation_users_test.go index 65b573e8..099ee610 100644 --- a/internal/mcp/tools_analyze_annotation_users_test.go +++ b/internal/mcp/tools_analyze_annotation_users_test.go @@ -30,7 +30,7 @@ func callAnalyzeAnnotationUsers(t *testing.T, srv *Server, args map[string]any) return out } -func addAnnotationNode(g *graph.Graph, id, name string) { +func addAnnotationNode(g graph.Store, id, name string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindType, @@ -39,7 +39,7 @@ func addAnnotationNode(g *graph.Graph, id, name string) { }) } -func addAnnotatedEdge(g *graph.Graph, from, to, args string) { +func addAnnotatedEdge(g graph.Store, from, to, args string) { e := &graph.Edge{From: from, To: to, Kind: graph.EdgeAnnotated, FilePath: "x.go", Line: 1} if args != "" { e.Meta = map[string]any{"args": args} diff --git a/internal/mcp/tools_analyze_channel_ops_test.go b/internal/mcp/tools_analyze_channel_ops_test.go index d5e3cd15..9031f70e 100644 --- a/internal/mcp/tools_analyze_channel_ops_test.go +++ b/internal/mcp/tools_analyze_channel_ops_test.go @@ -30,7 +30,7 @@ func callAnalyzeChannelOps(t *testing.T, srv *Server, args map[string]any) map[s return out } -func addChannelEdge(g *graph.Graph, kind graph.EdgeKind, from, to, file string, line int) { +func addChannelEdge(g graph.Store, kind graph.EdgeKind, from, to, file string, line int) { g.AddEdge(&graph.Edge{ From: from, To: to, diff --git a/internal/mcp/tools_analyze_clusters.go b/internal/mcp/tools_analyze_clusters.go index 699162c2..e94320b4 100644 --- a/internal/mcp/tools_analyze_clusters.go +++ b/internal/mcp/tools_analyze_clusters.go @@ -63,12 +63,6 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ }) } - scoped := s.scopedNodes(ctx) - scopedSet := make(map[string]*graph.Node, len(scoped)) - for _, n := range scoped { - scopedSet[n.ID] = n - } - type clusterRow struct { ID string `json:"id"` Label string `json:"label"` @@ -82,8 +76,18 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ MemberSample []string `json:"member_sample,omitempty"` } - rows := make([]clusterRow, 0, len(cr.Communities)) - for _, c := range cr.Communities { + // First pass: keep only the clusters that survive size + path-prefix + // gates, then sort + truncate to the requested limit. The density, + // language-mix, and top-files work below is bounded by the truncated + // row count instead of every community in the partition — important + // on Ladybug where each member touches the graph store. + type pending struct { + c *analysis.Community + row clusterRow + } + survivors := make([]pending, 0, len(cr.Communities)) + for i := range cr.Communities { + c := &cr.Communities[i] if c.Size < minSize { continue } @@ -99,30 +103,77 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ continue } } - row := clusterRow{ ID: c.ID, Label: c.Label, Hub: c.Hub, Size: c.Size, Files: len(c.Files), Languages: map[string]int{}, } - - // File-spread = files-per-member; 1.0 means every member - // lives in its own file (boundary-heavy), close to 0 means - // many members per file (file-bound cluster). if c.Size > 0 { row.FileSpread = roundScore(float64(len(c.Files)) / float64(c.Size)) } + survivors = append(survivors, pending{c: c, row: row}) + } + sort.Slice(survivors, func(i, j int) bool { + if survivors[i].c.Size != survivors[j].c.Size { + return survivors[i].c.Size > survivors[j].c.Size + } + return survivors[i].c.ID < survivors[j].c.ID + }) + truncated := false + if len(survivors) > limit { + survivors = survivors[:limit] + truncated = true + } - // Density requires the intra-cluster edge count. Use the - // member set + graph in-place; cheap on cluster-sized - // node lists. - memberSet := make(map[string]bool, len(c.Members)) - for _, m := range c.Members { - memberSet[m] = true + // Batch every surviving cluster's member ids and pull their nodes + + // outgoing edges in two calls — one Cypher round-trip each on + // Ladybug, against the per-member GetNode / GetOutEdges loop the + // previous shape ran (N members × 2 cgo trips). Members from + // communities that didn't survive the truncate above never reach + // the store. + // + // Per-cluster member cap: communities can hold thousands of nodes + // each. On Ladybug, fetching tens of thousands of nodes + edges per + // call is several seconds of cgo cost — the rendered response only + // uses these to compute density / language mix / top files, all of + // which converge on a representative sample long before they need + // every member. With a default 50-cluster limit and ~200 sampled + // members per cluster, the IN-list stays under 10k IDs and the + // rendering stays sub-second. The exact `size` field still reflects + // the true cluster size because it comes from c.Size, not from the + // sampled set. + const sampleCap = 200 + sampleMemberIDs := make([]string, 0, len(survivors)*sampleCap) + sampleSets := make([]map[string]bool, 0, len(survivors)) + for _, p := range survivors { + members := p.c.Members + if len(members) > sampleCap { + members = members[:sampleCap] + } + set := make(map[string]bool, len(members)) + for _, m := range members { + set[m] = true } + sampleSets = append(sampleSets, set) + sampleMemberIDs = append(sampleMemberIDs, members...) + } + memberNodes := s.graph.GetNodesByIDs(sampleMemberIDs) + memberOutEdges := s.graph.GetOutEdgesByNodeIDs(sampleMemberIDs) + + rows := make([]clusterRow, 0, len(survivors)) + for i, p := range survivors { + c := p.c + row := p.row + memberSet := sampleSets[i] + sampleSize := len(memberSet) + + // Density on the sample, normalised against (sampleSize · + // (sampleSize-1)) to keep the ratio meaningful when only part + // of the cluster was inspected. Intra-sample edges restricted + // to the call / reference kinds the clusterer cares about. intra := 0 - for _, m := range c.Members { - for _, e := range s.graph.GetOutEdges(m) { + for m := range memberSet { + for _, e := range memberOutEdges[m] { if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { continue } @@ -131,16 +182,14 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ } } } - // Density = intra-edges / possible-directed-pairs. - if c.Size > 1 { - possible := c.Size * (c.Size - 1) + if sampleSize > 1 { + possible := sampleSize * (sampleSize - 1) row.Density = roundScore(float64(intra) / float64(possible)) } - // Language mix + top files. fileCounts := map[string]int{} - for _, m := range c.Members { - n := scopedSet[m] + for m := range memberSet { + n := memberNodes[m] if n == nil { continue } @@ -156,17 +205,6 @@ func (s *Server) handleAnalyzeClusters(ctx context.Context, req mcp.CallToolRequ rows = append(rows, row) } - sort.Slice(rows, func(i, j int) bool { - if rows[i].Size != rows[j].Size { - return rows[i].Size > rows[j].Size - } - return rows[i].ID < rows[j].ID - }) - truncated := false - if len(rows) > limit { - rows = rows[:limit] - truncated = true - } resp := map[string]any{ "clusters": rows, diff --git a/internal/mcp/tools_analyze_components.go b/internal/mcp/tools_analyze_components.go new file mode 100644 index 00000000..7dae5680 --- /dev/null +++ b/internal/mcp/tools_analyze_components.go @@ -0,0 +1,164 @@ +// wcc / scc — connected-component diagnostics. +// +// `analyze kind=wcc` returns the weakly connected components: pairs +// of symbols reachable from each other ignoring edge direction. A +// healthy index has a small number of large WCCs (the connected +// codebase) plus a long tail of singletons (isolated extracted +// symbols). A WCC count that explodes between reindexes signals +// extraction drift, not code change. +// +// `analyze kind=scc` returns the strongly connected components: +// pairs of symbols mutually reachable along directed edges. Every +// non-trivial SCC (size > 1) is a recursion ring — mutual +// recursion in calls, two-way references between data types, +// circular module dependencies. Useful for cycle audits beyond +// what kind=cycles surfaces today. +// +// Routing: +// +// - When the backing graph.Store implements graph.ComponentFinder +// (today only store_ladybug), both kinds delegate to the +// engine-native algorithm. +// +// - Otherwise the in-process analysis.ComputeWCC / +// analysis.ComputeSCC runs. SCC uses an iterative Tarjan so a +// deep call graph won't blow the goroutine stack. + +package mcp + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/analysis" + "github.com/zzet/gortex/internal/graph" +) + +// componentRow is the per-component shape the analyzer returns. +type componentRow struct { + ID int `json:"id"` + Size int `json:"size"` + Members []string `json:"members"` +} + +// handleAnalyzeConnectedComponents serves both `analyze kind=wcc` +// and `analyze kind=scc`. The directed flag picks SCC; unset picks +// WCC. +func (s *Server) handleAnalyzeConnectedComponents( + ctx context.Context, req mcp.CallToolRequest, directed bool, +) (*mcp.CallToolResult, error) { + args := req.GetArguments() + + limit := 50 + if v, ok := args["limit"].(float64); ok && v > 0 { + limit = int(v) + } + minSize := 0 + if v, ok := args["min_size"].(float64); ok && v > 0 { + minSize = int(v) + } + memberLimit := 100 + if v, ok := args["member_limit"].(float64); ok && v > 0 { + memberLimit = int(v) + } + + kindLabel := "wcc" + if directed { + kindLabel = "scc" + } + + results := s.runComponents(directed, analysis.ComponentOptions{MinSize: minSize}) + if limit > 0 && limit < len(results) { + results = results[:limit] + } + + rows := make([]componentRow, 0, len(results)) + for _, r := range results { + members := r.Members + if memberLimit > 0 && memberLimit < len(members) { + members = members[:memberLimit] + } + rows = append(rows, componentRow{ID: r.ID, Size: r.Size, Members: members}) + } + + if s.isGCX(ctx, req) { + return s.gcxResponseWithBudget(req)(encodeAnalyze(kindLabel, rows)) + } + if isCompact(req) { + var b strings.Builder + for _, r := range rows { + fmt.Fprintf(&b, "id=%d size=%d members=%v\n", r.ID, r.Size, r.Members) + } + return mcp.NewToolResultText(b.String()), nil + } + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "components": rows, + "total": len(rows), + "kind": kindLabel, + }) +} + +// runComponents picks the engine-native path when the backing +// store implements graph.ComponentFinder, otherwise falls back to +// the in-process analysis.ComputeWCC / ComputeSCC. +func (s *Server) runComponents(directed bool, opts analysis.ComponentOptions) []analysis.ComponentResult { + if store := s.backendStore(); store != nil { + if cf, ok := store.(graph.ComponentFinder); ok { + hits, err := callComponentFinder(cf, directed, graph.ComponentOpts{ + NodeKinds: opts.NodeKinds, + EdgeKinds: opts.EdgeKinds, + }) + if err == nil { + return collectHits(hits, opts.MinSize) + } + // Engine-native error falls through to the in-process + // path rather than returning a half-done result. + } + } + if directed { + return analysis.ComputeSCC(s.graph, opts) + } + return analysis.ComputeWCC(s.graph, opts) +} + +func callComponentFinder(cf graph.ComponentFinder, directed bool, opts graph.ComponentOpts) ([]graph.ComponentHit, error) { + if directed { + return cf.StronglyConnectedComponents(opts) + } + return cf.WeaklyConnectedComponents(opts) +} + +// collectHits groups CommunityHits by ID, applies MinSize, sorts +// for determinism, and renumbers — mirrors analysis.collectComponents +// without exporting that internal helper. +func collectHits(hits []graph.ComponentHit, minSize int) []analysis.ComponentResult { + groups := make(map[int64][]string) + for _, h := range hits { + groups[h.ComponentID] = append(groups[h.ComponentID], h.NodeID) + } + out := make([]analysis.ComponentResult, 0, len(groups)) + for _, members := range groups { + if minSize > 0 && len(members) < minSize { + continue + } + sort.Strings(members) + out = append(out, analysis.ComponentResult{Members: members, Size: len(members)}) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Size != out[j].Size { + return out[i].Size > out[j].Size + } + if len(out[i].Members) > 0 && len(out[j].Members) > 0 { + return out[i].Members[0] < out[j].Members[0] + } + return false + }) + for i := range out { + out[i].ID = i + } + return out +} diff --git a/internal/mcp/tools_analyze_concurrency.go b/internal/mcp/tools_analyze_concurrency.go index b57586ac..14a9de44 100644 --- a/internal/mcp/tools_analyze_concurrency.go +++ b/internal/mcp/tools_analyze_concurrency.go @@ -72,10 +72,7 @@ func (s *Server) handleAnalyzeRaceWrites(ctx context.Context, req mcp.CallToolRe } var rows []raceRow - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeWrites { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeWrites) { if !goroutineReachable[e.From] { continue } @@ -162,10 +159,7 @@ func (s *Server) handleAnalyzeRaceWrites(ctx context.Context, req mcp.CallToolRe func (s *Server) buildGoroutineReachableSet() map[string]bool { reach := map[string]bool{} var roots []string - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSpawns { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeSpawns) { if !reach[e.To] { reach[e.To] = true roots = append(roots, e.To) @@ -282,10 +276,7 @@ func (s *Server) handleAnalyzeUnclosedChannels(ctx context.Context, req mcp.Call // channel"; the channel arg isn't tracked so the membership test // is per-function, not per-channel. closesIn := map[string]bool{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeCalls { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeCalls) { if callTargetName(e) != "close" { continue } @@ -303,10 +294,7 @@ func (s *Server) handleAnalyzeUnclosedChannels(ctx context.Context, req mcp.Call Line int } byChannel := map[string]*channelInfo{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSends && e.Kind != graph.EdgeRecvs { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeSends, graph.EdgeRecvs) { info := byChannel[e.To] if info == nil { info = &channelInfo{ @@ -366,7 +354,7 @@ func (s *Server) handleAnalyzeUnclosedChannels(ctx context.Context, req mcp.Call if anyCloser { continue } - risk, reason := classifyUnclosed(info.Sends, len(info.Senders), info.Recvs) + risk, reason := classifyUnclosed(len(info.Senders), info.Recvs) rows = append(rows, unclosedRow{ Channel: info.Channel, FilePath: info.FilePath, @@ -434,7 +422,7 @@ func (s *Server) handleAnalyzeUnclosedChannels(ctx context.Context, req mcp.Call // receivers — the receiver may or may not range; without arg flow // we can't tell. Low: senders without receivers, almost always a // fire-and-forget signal. -func classifyUnclosed(sends, senders, recvs int) (string, string) { +func classifyUnclosed(senders, recvs int) (string, string) { switch { case senders >= 2 && recvs >= 1: return "high", "multiple senders with consumer(s) and no detected close — receivers will hang on range" diff --git a/internal/mcp/tools_analyze_concurrency_test.go b/internal/mcp/tools_analyze_concurrency_test.go index 466b57b1..b1db8739 100644 --- a/internal/mcp/tools_analyze_concurrency_test.go +++ b/internal/mcp/tools_analyze_concurrency_test.go @@ -34,15 +34,15 @@ func concurrencyServer(t *testing.T) *Server { return NewServer(eng, g, idx, nil, zap.NewNop(), nil) } -func addFn(g *graph.Graph, id, name, path string) { +func addFn(g graph.Store, id, name, path string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: name, FilePath: path, Language: "go"}) } -func addField(g *graph.Graph, id, name, path string) { +func addField(g graph.Store, id, name, path string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindField, Name: name, FilePath: path, Language: "go"}) } -func addEdge(g *graph.Graph, from, to string, kind graph.EdgeKind, path string, line int) { +func addEdge(g graph.Store, from, to string, kind graph.EdgeKind, path string, line int) { g.AddEdge(&graph.Edge{From: from, To: to, Kind: kind, FilePath: path, Line: line, Confidence: 1}) } @@ -328,15 +328,15 @@ func TestAnalyzeRaceWrites_GCXEncodesRow(t *testing.T) { // addMethod / addType / addTypedField build the node shapes the // concurrency classifier reads: a method linked to its receiver type // via EdgeMemberOf, and a typed field linked to its owning type. -func addMethod(g *graph.Graph, id, name, path string) { +func addMethod(g graph.Store, id, name, path string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindMethod, Name: name, FilePath: path, Language: "go"}) } -func addType(g *graph.Graph, id, name, path string) { +func addType(g graph.Store, id, name, path string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindType, Name: name, FilePath: path, Language: "go"}) } -func addTypedField(g *graph.Graph, id, name, fieldType, path string) { +func addTypedField(g graph.Store, id, name, fieldType, path string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindField, Name: name, FilePath: path, Language: "go", Meta: map[string]any{"field_type": fieldType}, diff --git a/internal/mcp/tools_analyze_config_readers_test.go b/internal/mcp/tools_analyze_config_readers_test.go index c53aed2a..e0a656a2 100644 --- a/internal/mcp/tools_analyze_config_readers_test.go +++ b/internal/mcp/tools_analyze_config_readers_test.go @@ -30,7 +30,7 @@ func callAnalyzeConfigReaders(t *testing.T, srv *Server, args map[string]any) ma return out } -func addConfigKeyNode(g *graph.Graph, id, name, source string) { +func addConfigKeyNode(g graph.Store, id, name, source string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindConfigKey, @@ -39,7 +39,7 @@ func addConfigKeyNode(g *graph.Graph, id, name, source string) { }) } -func addReadConfigEdge(g *graph.Graph, from, to string) { +func addReadConfigEdge(g graph.Store, from, to string) { g.AddEdge(&graph.Edge{From: from, To: to, Kind: graph.EdgeReadsConfig}) } diff --git a/internal/mcp/tools_analyze_coverage_gaps_test.go b/internal/mcp/tools_analyze_coverage_gaps_test.go index b2c0e441..a5f90e79 100644 --- a/internal/mcp/tools_analyze_coverage_gaps_test.go +++ b/internal/mcp/tools_analyze_coverage_gaps_test.go @@ -11,7 +11,7 @@ import ( // addCoveredNode wires a function node with synthetic // coverage_pct meta — emulating coverage.EnrichGraph output. -func addCoveredNode(g *graph.Graph, id, file string, pct float64, numStmt, hit int) { +func addCoveredNode(g graph.Store, id, file string, pct float64, numStmt, hit int) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindFunction, diff --git a/internal/mcp/tools_analyze_coverage_test.go b/internal/mcp/tools_analyze_coverage_test.go index b65e121f..c2b65917 100644 --- a/internal/mcp/tools_analyze_coverage_test.go +++ b/internal/mcp/tools_analyze_coverage_test.go @@ -19,12 +19,15 @@ func TestAnalyzeCoverage_StampsMeta(t *testing.T) { _ = os.WriteFile(filepath.Join(dir, "go.mod"), []byte("module example.test/repo\n\ngo 1.22\n"), 0o644) - // Synthetic cover profile: covers `main` (line 9), uncovered - // segment for `helper` (line 14). The file path is the - // module-qualified form Go's cover tool emits. + // Synthetic cover profile: covers `main` (line 7-9), uncovered + // segment for `helper` (line 11). Line numbers match the + // setupTestServer fixture in server_test.go — after the fmt + // import was dropped to keep external-call attribution clean, + // the function bodies shifted up by 2 lines. The file path is + // the module-qualified form Go's cover tool emits. profile := []byte(`mode: set -example.test/repo/main.go:9.13,11.2 1 1 -example.test/repo/main.go:14.13,14.16 1 0 +example.test/repo/main.go:7.13,9.2 1 1 +example.test/repo/main.go:11.13,11.16 1 0 `) profilePath := filepath.Join(dir, "cover.out") if err := os.WriteFile(profilePath, profile, 0o644); err != nil { diff --git a/internal/mcp/tools_analyze_cross_repo_test.go b/internal/mcp/tools_analyze_cross_repo_test.go index 4940c33c..b347593c 100644 --- a/internal/mcp/tools_analyze_cross_repo_test.go +++ b/internal/mcp/tools_analyze_cross_repo_test.go @@ -33,7 +33,7 @@ func callAnalyzeCrossRepo(t *testing.T, srv *Server, args map[string]any) map[st // seedCrossRepoGraph wires three repos with a handful of cross-repo // edges so the analyzer has something to group. -func seedCrossRepoGraph(g *graph.Graph) { +func seedCrossRepoGraph(g graph.Store) { add := func(id, repo string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindFunction, Name: id, FilePath: id, RepoPrefix: repo}) } diff --git a/internal/mcp/tools_analyze_edges.go b/internal/mcp/tools_analyze_edges.go index d4f8e844..48662845 100644 --- a/internal/mcp/tools_analyze_edges.go +++ b/internal/mcp/tools_analyze_edges.go @@ -20,6 +20,8 @@ package mcp import ( "context" "fmt" + "iter" + "slices" "sort" "strings" @@ -68,10 +70,9 @@ func (s *Server) handleAnalyzeChannelOps(ctx context.Context, req mcp.CallToolRe return row } - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSends && e.Kind != graph.EdgeRecvs { - continue - } + // One scan over Sends+Recvs only — replaces the legacy AllEdges() + // walk that pulled every edge over cgo just to keep two kinds. + for e := range edgesByKinds(s.graph, graph.EdgeSends, graph.EdgeRecvs) { if pathPrefix != "" && !strings.HasPrefix(e.FilePath, pathPrefix) { continue } @@ -156,10 +157,7 @@ func (s *Server) handleAnalyzeGoroutineSpawns(ctx context.Context, req mcp.CallT } byTarget := map[string]*spawnRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSpawns { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeSpawns) { mode, _ := e.Meta["mode"].(string) key := e.To + "|" + mode row, ok := byTarget[key] @@ -271,10 +269,7 @@ func (s *Server) handleAnalyzeFieldWriters(ctx context.Context, req mcp.CallTool } byField := map[string]*writerRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeWrites { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeWrites) { if idFilter != "" && e.To != idFilter { continue } @@ -379,8 +374,8 @@ func (s *Server) handleAnalyzeAnnotationUsers(ctx context.Context, req mcp.CallT Args string `json:"args,omitempty"` } var rows []annotatedRow - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeAnnotated || e.To != idFilter { + for e := range edgesByKinds(s.graph, graph.EdgeAnnotated) { + if e.To != idFilter { continue } argsStr, _ := e.Meta["args"].(string) @@ -433,10 +428,7 @@ func (s *Server) handleAnalyzeAnnotationUsers(ctx context.Context, req mcp.CallT Users int `json:"users"` } byID := map[string]*annoRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeAnnotated { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeAnnotated) { row, ok := byID[e.To] if !ok { n := s.graph.GetNode(e.To) @@ -523,10 +515,7 @@ func (s *Server) handleAnalyzeConfigReaders(ctx context.Context, req mcp.CallToo Reads int `json:"reads"` } byKey := map[string]*configRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeReadsConfig { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeReadsConfig) { row, ok := byKey[e.To] if !ok { n := s.graph.GetNode(e.To) @@ -636,10 +625,7 @@ func (s *Server) handleAnalyzeEnvVarUsers(ctx context.Context, req mcp.CallToolR Reads int `json:"reads"` } byKey := map[string]*envRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeReadsConfig { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeReadsConfig) { row, ok := byKey[e.To] if !ok { n := s.graph.GetNode(e.To) @@ -727,10 +713,7 @@ func (s *Server) handleAnalyzeEventEmitters(ctx context.Context, req mcp.CallToo Emitters []string `json:"emitters,omitempty"` } byEvent := map[string]*eventRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeEmits { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeEmits) { // Level filter: an emit edge stores the method on the edge // (e.g. "Errorf"); the event node may carry an event_kind. // We accept either source so both per-event and per-call @@ -880,7 +863,7 @@ func (s *Server) handleAnalyzePubsub(ctx context.Context, req mcp.CallToolReques return row } - for _, e := range s.graph.AllEdges() { + for e := range edgesByKinds(s.graph, graph.EdgeEmits, graph.EdgeListensOn) { switch e.Kind { case graph.EdgeEmits: row := ensureRow(e.To) @@ -987,61 +970,74 @@ func (s *Server) handleAnalyzeErrorSurface(ctx context.Context, req mcp.CallTool Errors []string `json:"errors"` ErrorMsgs []string `json:"error_msgs,omitempty"` } - byThrower := map[string]*throwerRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeThrows { - continue - } - if pathPrefix != "" && !strings.HasPrefix(e.FilePath, pathPrefix) { - continue - } - row, ok := byThrower[e.From] - if !ok { - n := s.graph.GetNode(e.From) - file := e.FilePath - line := e.Line - if n != nil { - if file == "" { - file = n.FilePath - } - if line == 0 { - line = n.StartLine - } + rows := make([]*throwerRow, 0) + if surfacer, ok := s.graph.(graph.ThrowerErrorSurfacer); ok { + // Server-side path: one Cypher GROUP BY for the per-thrower + // throws+targets dedup, one for the per-thrower error-msg + // attachment. No per-thrower GetOutEdges fanout. + for _, r := range surfacer.ThrowerErrorSurface(pathPrefix) { + row := &throwerRow{ + Symbol: r.ThrowerID, + File: r.FilePath, + Line: r.Line, + Throws: r.Throws, + Errors: append([]string(nil), r.ErrorTargets...), + ErrorMsgs: append([]string(nil), r.ErrorMsgs...), } - row = &throwerRow{Symbol: e.From, File: file, Line: line} - byThrower[e.From] = row - } - row.Throws++ - row.Errors = appendUnique(row.Errors, e.To) - } - // For every thrower, also surface the error_msg KindString - // literals it emits. EdgeThrows targets error types; the - // data-side companion (errors.New("…") → string::error_msg::…) - // carries the literal message. Joining both gives an agent both - // "what error types propagate" and "what literal messages - // originate here" in one row. - for thrower, row := range byThrower { - for _, e := range s.graph.GetOutEdges(thrower) { - if e == nil || e.Kind != graph.EdgeEmits { + sort.Strings(row.Errors) + sort.Strings(row.ErrorMsgs) + rows = append(rows, row) + } + } else { + byThrower := map[string]*throwerRow{} + for e := range edgesByKinds(s.graph, graph.EdgeThrows) { + if pathPrefix != "" && !strings.HasPrefix(e.FilePath, pathPrefix) { continue } - n := s.graph.GetNode(e.To) - if n == nil || n.Kind != graph.KindString { - continue + row, ok := byThrower[e.From] + if !ok { + n := s.graph.GetNode(e.From) + file := e.FilePath + line := e.Line + if n != nil { + if file == "" { + file = n.FilePath + } + if line == 0 { + line = n.StartLine + } + } + row = &throwerRow{Symbol: e.From, File: file, Line: line} + byThrower[e.From] = row } - ctxLabel, _ := n.Meta["context"].(string) - if ctxLabel != "error_msg" { - continue + row.Throws++ + row.Errors = appendUnique(row.Errors, e.To) + } + // For every thrower, also surface the error_msg KindString + // literals it emits. EdgeThrows targets error types; the + // data-side companion (errors.New("…") → string::error_msg::…) + // carries the literal message. + for thrower, row := range byThrower { + for _, e := range s.graph.GetOutEdges(thrower) { + if e == nil || e.Kind != graph.EdgeEmits { + continue + } + n := s.graph.GetNode(e.To) + if n == nil || n.Kind != graph.KindString { + continue + } + ctxLabel, _ := n.Meta["context"].(string) + if ctxLabel != "error_msg" { + continue + } + row.ErrorMsgs = appendUnique(row.ErrorMsgs, n.Name) } - row.ErrorMsgs = appendUnique(row.ErrorMsgs, n.Name) } - } - - rows := make([]*throwerRow, 0, len(byThrower)) - for _, r := range byThrower { - sort.Strings(r.Errors) - sort.Strings(r.ErrorMsgs) - rows = append(rows, r) + for _, r := range byThrower { + sort.Strings(r.Errors) + sort.Strings(r.ErrorMsgs) + rows = append(rows, r) + } } sort.Slice(rows, func(i, j int) bool { // Throwers with the most distinct error targets surface @@ -1163,7 +1159,11 @@ func (s *Server) handleAnalyzeCrossRepo(ctx context.Context, req mcp.CallToolReq return "" } - for _, e := range s.graph.AllEdges() { + for e := range edgesByKinds(s.graph, + graph.EdgeCrossRepoCalls, + graph.EdgeCrossRepoImplements, + graph.EdgeCrossRepoExtends, + ) { base, ok := graph.BaseKindForCrossRepo(e.Kind) if !ok { continue @@ -1262,6 +1262,41 @@ func (s *Server) handleAnalyzeCrossRepo(ctx context.Context, req mcp.CallToolReq // shared helpers // --------------------------------------------------------------------------- +// edgesByKinds streams every edge whose Kind is in the supplied set +// using the EdgesByKindsScanner capability when the backend +// implements it (one Cypher round-trip with a `kind IN $kinds` IN- +// list), or falls back to per-kind EdgesByKind iteration otherwise. +// +// The edge-driven analyzers below use it instead of `for _, e := range +// s.graph.AllEdges() { switch e.Kind … }` so the disk backends stop +// materialising the full edge table over cgo for a handful of kinds. +// Pass each kind as a separate argument — kinds typed inline as a +// variadic so call sites read as `edgesByKinds(g, EdgeEmits, +// EdgeListensOn)` rather than constructing a slice each time. +// +// Empty kinds yields nothing — matches both the capability contract +// and the original semantics (no kinds requested means no rows). +func edgesByKinds(g graph.Store, kinds ...graph.EdgeKind) iter.Seq[*graph.Edge] { + if len(kinds) == 0 { + return func(yield func(*graph.Edge) bool) {} + } + if scanner, ok := g.(graph.EdgesByKindsScanner); ok { + return scanner.EdgesByKinds(kinds) + } + return func(yield func(*graph.Edge) bool) { + for _, k := range kinds { + if k == "" { + continue + } + for e := range g.EdgesByKind(k) { + if !yield(e) { + return + } + } + } + } +} + // appendUnique returns dst with v added if not already present. // Used by every analyzer above to dedupe the From-side caller list // without falling back to a map (the lists are small per row, so a @@ -1270,10 +1305,8 @@ func appendUnique(dst []string, v string) []string { if v == "" { return dst } - for _, x := range dst { - if x == v { - return dst - } + if slices.Contains(dst, v) { + return dst } return append(dst, v) } diff --git a/internal/mcp/tools_analyze_error_surface_test.go b/internal/mcp/tools_analyze_error_surface_test.go index e8e3baa5..420255a9 100644 --- a/internal/mcp/tools_analyze_error_surface_test.go +++ b/internal/mcp/tools_analyze_error_surface_test.go @@ -30,7 +30,7 @@ func callAnalyzeErrorSurface(t *testing.T, srv *Server, args map[string]any) map return out } -func addThrowsEdge(g *graph.Graph, from, to, file string, line int) { +func addThrowsEdge(g graph.Store, from, to, file string, line int) { g.AddEdge(&graph.Edge{ From: from, To: to, diff --git a/internal/mcp/tools_analyze_event_emitters_test.go b/internal/mcp/tools_analyze_event_emitters_test.go index 54af6e20..fbfd357e 100644 --- a/internal/mcp/tools_analyze_event_emitters_test.go +++ b/internal/mcp/tools_analyze_event_emitters_test.go @@ -30,7 +30,7 @@ func callAnalyzeEventEmitters(t *testing.T, srv *Server, args map[string]any) ma return out } -func addEventNode(g *graph.Graph, id, name, kind string) { +func addEventNode(g graph.Store, id, name, kind string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindEvent, @@ -39,7 +39,7 @@ func addEventNode(g *graph.Graph, id, name, kind string) { }) } -func addEmitsEdge(g *graph.Graph, from, to, method string) { +func addEmitsEdge(g graph.Store, from, to, method string) { e := &graph.Edge{From: from, To: to, Kind: graph.EdgeEmits} if method != "" { e.Meta = map[string]any{"method": method} diff --git a/internal/mcp/tools_analyze_external_calls.go b/internal/mcp/tools_analyze_external_calls.go index 77e03618..429f7cdf 100644 --- a/internal/mcp/tools_analyze_external_calls.go +++ b/internal/mcp/tools_analyze_external_calls.go @@ -247,7 +247,7 @@ func suffixVersion(v string) string { // countCallersToExternal counts every incoming non-EdgeDependsOnModule // edge to an external symbol node — those are the calls / references // that goanalysis attributed. -func countCallersToExternal(g *graph.Graph, nodeID string) int { +func countCallersToExternal(g graph.Store, nodeID string) int { n := 0 for _, e := range g.GetInEdges(nodeID) { if e.Kind == graph.EdgeDependsOnModule { @@ -260,7 +260,7 @@ func countCallersToExternal(g *graph.Graph, nodeID string) int { // tallyExternalCallers returns (totalCallEdges, distinctCallers) — the // detail surface for the per-module symbol listing. -func tallyExternalCallers(g *graph.Graph, nodeID string) (int, int) { +func tallyExternalCallers(g graph.Store, nodeID string) (int, int) { calls := 0 seen := map[string]struct{}{} for _, e := range g.GetInEdges(nodeID) { diff --git a/internal/mcp/tools_analyze_external_calls_test.go b/internal/mcp/tools_analyze_external_calls_test.go index 956ea6de..cfd86cd1 100644 --- a/internal/mcp/tools_analyze_external_calls_test.go +++ b/internal/mcp/tools_analyze_external_calls_test.go @@ -30,7 +30,7 @@ func callAnalyzeExternalCalls(t *testing.T, srv *Server, args map[string]any) ma return out } -func addExternalModuleNode(g *graph.Graph, id, path, version, kind string) { +func addExternalModuleNode(g graph.Store, id, path, version, kind string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindModule, @@ -44,7 +44,7 @@ func addExternalModuleNode(g *graph.Graph, id, path, version, kind string) { }) } -func addExternalSymbolNode(g *graph.Graph, id, name, importPath, moduleID string, kind graph.NodeKind) { +func addExternalSymbolNode(g graph.Store, id, name, importPath, moduleID string, kind graph.NodeKind) { g.AddNode(&graph.Node{ ID: id, Kind: kind, @@ -63,7 +63,7 @@ func addExternalSymbolNode(g *graph.Graph, id, name, importPath, moduleID string }) } -func addExternalCall(g *graph.Graph, from, to string) { +func addExternalCall(g graph.Store, from, to string) { g.AddEdge(&graph.Edge{ From: from, To: to, diff --git a/internal/mcp/tools_analyze_field_writers_test.go b/internal/mcp/tools_analyze_field_writers_test.go index 4c17c4a0..e98ca1ba 100644 --- a/internal/mcp/tools_analyze_field_writers_test.go +++ b/internal/mcp/tools_analyze_field_writers_test.go @@ -30,11 +30,11 @@ func callAnalyzeFieldWriters(t *testing.T, srv *Server, args map[string]any) map return out } -func addFieldNode(g *graph.Graph, id, name string) { +func addFieldNode(g graph.Store, id, name string) { g.AddNode(&graph.Node{ID: id, Kind: graph.KindField, Name: name}) } -func addWriteEdge(g *graph.Graph, from, to string) { +func addWriteEdge(g graph.Store, from, to string) { g.AddEdge(&graph.Edge{From: from, To: to, Kind: graph.EdgeWrites}) } diff --git a/internal/mcp/tools_analyze_framework.go b/internal/mcp/tools_analyze_framework.go index 566b68e6..300e55e9 100644 --- a/internal/mcp/tools_analyze_framework.go +++ b/internal/mcp/tools_analyze_framework.go @@ -39,10 +39,7 @@ func (s *Server) handleAnalyzeRoutes(ctx context.Context, req mcp.CallToolReques Line int `json:"line"` } var rows []*routeRow - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeHandlesRoute { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeHandlesRoute) { contractNode := s.graph.GetNode(e.To) if contractNode == nil { continue @@ -154,10 +151,7 @@ func (s *Server) handleAnalyzeModels(ctx context.Context, req mcp.CallToolReques Line int `json:"line"` } var rows []*modelRow - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeModelsTable { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeModelsTable) { modelNode := s.graph.GetNode(e.From) if modelNode == nil { continue @@ -269,10 +263,7 @@ func (s *Server) componentsRollup(ctx context.Context, req mcp.CallToolRequest, stats[id] = row return row } - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeRendersChild { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeRendersChild) { parent := get(e.From) parent.FanOut++ // Skip the child if it never resolved to a real node — leaving @@ -454,7 +445,7 @@ func (s *Server) handleAnalyzeDbtModels(ctx context.Context, req mcp.CallToolReq // Second pass: tally columns (EdgeMemberOf → model) and lineage // (EdgeDependsOn between two model nodes) in one walk of AllEdges. - for _, e := range s.graph.AllEdges() { + for e := range edgesByKinds(s.graph, graph.EdgeMemberOf, graph.EdgeDependsOn) { switch e.Kind { case graph.EdgeMemberOf: if r := rowByID[e.To]; r != nil { diff --git a/internal/mcp/tools_analyze_framework_test.go b/internal/mcp/tools_analyze_framework_test.go index 00f5f285..365f3a87 100644 --- a/internal/mcp/tools_analyze_framework_test.go +++ b/internal/mcp/tools_analyze_framework_test.go @@ -30,7 +30,7 @@ func callAnalyzeFramework(t *testing.T, srv *Server, kind string, args map[strin return out } -func addContractNode(g *graph.Graph, id, ctype string, meta map[string]any) { +func addContractNode(g graph.Store, id, ctype string, meta map[string]any) { full := map[string]any{"type": ctype, "role": "provider"} for k, v := range meta { full[k] = v @@ -40,7 +40,7 @@ func addContractNode(g *graph.Graph, id, ctype string, meta map[string]any) { }) } -func addHandlesRouteEdge(g *graph.Graph, from, to, file string, line int) { +func addHandlesRouteEdge(g graph.Store, from, to, file string, line int) { g.AddEdge(&graph.Edge{ From: from, To: to, Kind: graph.EdgeHandlesRoute, FilePath: file, Line: line, @@ -97,7 +97,7 @@ func TestAnalyzeRoutes_FilterByKind(t *testing.T) { } } -func addModelTableEdge(g *graph.Graph, from, to, orm, table, derivation string) { +func addModelTableEdge(g graph.Store, from, to, orm, table, derivation string) { g.AddNode(&graph.Node{ID: to, Kind: graph.KindTable, Name: table, Language: "go", Meta: map[string]any{"dialect": "orm"}}) g.AddEdge(&graph.Edge{ From: from, To: to, Kind: graph.EdgeModelsTable, @@ -151,7 +151,7 @@ func TestAnalyzeModels_FilterByTableSubstring(t *testing.T) { } } -func addRendersChildEdge(g *graph.Graph, from, to, name string, line int) { +func addRendersChildEdge(g graph.Store, from, to, name string, line int) { g.AddEdge(&graph.Edge{ From: from, To: to, Kind: graph.EdgeRendersChild, Line: line, @@ -224,7 +224,7 @@ func TestAnalyzeComponents_EmptyOnNoEdges(t *testing.T) { } } -func addDbtModelNode(g *graph.Graph, id, name, framework, resourceType, materialized string) { +func addDbtModelNode(g graph.Store, id, name, framework, resourceType, materialized string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindTable, Name: name, Language: "sql", FilePath: name + ".sql", StartLine: 1, @@ -235,7 +235,7 @@ func addDbtModelNode(g *graph.Graph, id, name, framework, resourceType, material }) } -func addDbtColumn(g *graph.Graph, modelID, col string) { +func addDbtColumn(g graph.Store, modelID, col string) { colID := modelID + "::" + col g.AddNode(&graph.Node{ID: colID, Kind: graph.KindColumn, Name: col, Language: "sql"}) g.AddEdge(&graph.Edge{From: colID, To: modelID, Kind: graph.EdgeMemberOf}) diff --git a/internal/mcp/tools_analyze_goroutine_spawns_test.go b/internal/mcp/tools_analyze_goroutine_spawns_test.go index e70113ef..69df7f4a 100644 --- a/internal/mcp/tools_analyze_goroutine_spawns_test.go +++ b/internal/mcp/tools_analyze_goroutine_spawns_test.go @@ -34,7 +34,7 @@ func callAnalyzeGoroutineSpawns(t *testing.T, srv *Server, args map[string]any) // site is unique under the graph's edge-dedup key. Meta is dropped // when mode is empty so the analyzer's "modeless spawn" path is // exercisable. -func addSpawnEdge(g *graph.Graph, from, to, mode string, line int) { +func addSpawnEdge(g graph.Store, from, to, mode string, line int) { e := &graph.Edge{From: from, To: to, Kind: graph.EdgeSpawns, FilePath: "f.go", Line: line} if mode != "" { e.Meta = map[string]any{"mode": mode} diff --git a/internal/mcp/tools_analyze_health_score.go b/internal/mcp/tools_analyze_health_score.go index a61c4e58..8ea03421 100644 --- a/internal/mcp/tools_analyze_health_score.go +++ b/internal/mcp/tools_analyze_health_score.go @@ -10,6 +10,7 @@ import ( mcp "github.com/mark3labs/mcp-go/mcp" + "github.com/zzet/gortex/internal/analysis" "github.com/zzet/gortex/internal/graph" ) @@ -156,25 +157,52 @@ func (s *Server) handleAnalyzeHealthScore(ctx context.Context, req mcp.CallToolR allowedKinds = parseAnalyzeKindsFilter(k) } - // Build fan-in / fan-out / community-crossing maps in one edge - // pass. Same arithmetic shape as FindHotspots — we read the - // raw axes here rather than calling FindHotspots so the per- - // node fan-in is available for symbols below its threshold. + // Build fan-in / fan-out / community-crossing maps. Same + // arithmetic shape as FindHotspots -- we read the raw axes here + // rather than calling FindHotspots so the per-node fan-in is + // available for symbols below its threshold. + // + // Fan-in / fan-out go through analysis.CollectFanCounts, which + // uses the NodeFanAggregator capability when the backend + // supports it (one bulk Cypher per direction over the candidate + // id set) and falls back to a per-kind EdgesByKind stream + // otherwise. Crossings still need per-edge (from, to) for the + // Calls + References kinds -- streamed via EdgesByKind so even + // the fallback path never materialises the full edge set. nodeToComm := map[string]string{} if c := s.getCommunities(); c != nil { nodeToComm = c.NodeToComm } - fanIn := map[string]int{} - fanOut := map[string]int{} - crossings := map[string]int{} - for _, e := range s.graph.AllEdges() { - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { - fanIn[e.To]++ + + // Pull only the candidate kinds from the store — most workspaces + // keep ~5-15% of nodes as functions/methods, so the kind pushdown + // drops the AllNodes materialisation by 1-2 orders of magnitude. + kindList := make([]graph.NodeKind, 0, len(allowedKinds)) + for k := range allowedKinds { + kindList = append(kindList, k) + } + scoped := s.scopedNodesByKinds(ctx, kindList) + candidateIDs := make([]string, 0, len(scoped)) + for _, n := range scoped { + if n == nil { + continue } - if e.Kind == graph.EdgeCalls { - fanOut[e.From]++ + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue } - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { + candidateIDs = append(candidateIDs, n.ID) + } + fanIn, fanOut := analysis.CollectFanCounts(s.graph, candidateIDs, + []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, + []graph.EdgeKind{graph.EdgeCalls}, + ) + + crossings := map[string]int{} + for _, kind := range []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} { + for e := range s.graph.EdgesByKind(kind) { + if e == nil { + continue + } from := nodeToComm[e.From] to := nodeToComm[e.To] if from != "" && to != "" && from != to { @@ -191,8 +219,8 @@ func (s *Server) handleAnalyzeHealthScore(ctx context.Context, req mcp.CallToolR now := time.Now() rows := make([]healthScoreRow, 0, 128) - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { + for _, n := range scoped { + if n == nil { continue } if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { @@ -244,10 +272,7 @@ func (s *Server) handleAnalyzeHealthScore(ctx context.Context, req mcp.CallToolR // (30..365d) = 100→50; stale-zone (365..1095d) = 50→0; // dead (>1095d) = 0. if ts, ok := extractTimestamp(n.Meta); ok { - ageDays := int(now.Sub(time.Unix(ts, 0)).Hours() / 24) - if ageDays < 0 { - ageDays = 0 - } + ageDays := max(int(now.Sub(time.Unix(ts, 0)).Hours()/24), 0) row.AgeDays = &ageDays recHealth := recencyScore(ageDays) row.RecencyPct = &recHealth diff --git a/internal/mcp/tools_analyze_health_score_test.go b/internal/mcp/tools_analyze_health_score_test.go index 05b54853..e42eea04 100644 --- a/internal/mcp/tools_analyze_health_score_test.go +++ b/internal/mcp/tools_analyze_health_score_test.go @@ -38,7 +38,7 @@ func callAnalyzeHealth(t *testing.T, srv *Server, extra map[string]any) map[stri // addHealthFn drops one function node into the graph with the given // id/file. Avoids re-using `addFn` from tools_analyze_concurrency_test.go // to keep this test file self-contained. -func addHealthFn(g *graph.Graph, id, file string, meta map[string]any) *graph.Node { +func addHealthFn(g graph.Store, id, file string, meta map[string]any) *graph.Node { n := &graph.Node{ ID: id, Kind: graph.KindFunction, Name: id, FilePath: file, StartLine: 1, EndLine: 5, diff --git a/internal/mcp/tools_analyze_history.go b/internal/mcp/tools_analyze_history.go index 20cd30b0..3f872cd7 100644 --- a/internal/mcp/tools_analyze_history.go +++ b/internal/mcp/tools_analyze_history.go @@ -170,3 +170,43 @@ func (s *Server) symbolNamesInFile(filePath string) []string { sort.Strings(names) return names } + +// symbolNamesByFiles is the batched sibling of symbolNamesInFile. +// Returns a map filePath → sorted distinct names for every input +// path in one backend round-trip when the store implements +// FileSymbolNamesByPaths; falls back to the per-file loop otherwise. +// Used by find_co_changing_symbols and analyze fixes_history where +// the row count after truncation is bounded but each per-row name +// lookup was a separate Cypher query before — multiple thousand +// query-engine entry points per call on Ladybug. +func (s *Server) symbolNamesByFiles(paths []string) map[string][]string { + if len(paths) == 0 { + return nil + } + kinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod, graph.KindType, graph.KindInterface} + out := make(map[string][]string, len(paths)) + if scanner, ok := s.graph.(graph.FileSymbolNamesByPaths); ok { + rows := scanner.FileSymbolNamesByPaths(paths, kinds) + seenPerFile := make(map[string]map[string]bool, len(paths)) + for _, r := range rows { + seen := seenPerFile[r.FilePath] + if seen == nil { + seen = make(map[string]bool) + seenPerFile[r.FilePath] = seen + } + if r.Name == "" || seen[r.Name] { + continue + } + seen[r.Name] = true + out[r.FilePath] = append(out[r.FilePath], r.Name) + } + for f := range out { + sort.Strings(out[f]) + } + return out + } + for _, p := range paths { + out[p] = s.symbolNamesInFile(p) + } + return out +} diff --git a/internal/mcp/tools_analyze_hotspot_modes.go b/internal/mcp/tools_analyze_hotspot_modes.go index 2783c4e1..4592ebc4 100644 --- a/internal/mcp/tools_analyze_hotspot_modes.go +++ b/internal/mcp/tools_analyze_hotspot_modes.go @@ -30,7 +30,7 @@ import ( // We don't fail when the meta is absent — the analyzer treats this // as a soft ranker, not a strict filter, so callers get *some* // ranking even on un-enriched graphs (the unweighted baseline). -func rerankHotspots(entries []analysis.HotspotEntry, g *graph.Graph, mode, direction string, windowDays int) []analysis.HotspotEntry { +func rerankHotspots(entries []analysis.HotspotEntry, g graph.Store, mode, direction string, windowDays int) []analysis.HotspotEntry { if windowDays <= 0 { windowDays = 30 } diff --git a/internal/mcp/tools_analyze_hotspot_modes_test.go b/internal/mcp/tools_analyze_hotspot_modes_test.go index e9492425..528d7ef2 100644 --- a/internal/mcp/tools_analyze_hotspot_modes_test.go +++ b/internal/mcp/tools_analyze_hotspot_modes_test.go @@ -12,7 +12,7 @@ import ( // buildHotspotRerankFixture seeds three function nodes with deterministic // complexity scores AND varying blame / releases metadata so the // novelty / directional modes can reorder them in predictable ways. -func buildHotspotRerankFixture(t *testing.T, now time.Time) (*graph.Graph, []analysis.HotspotEntry) { +func buildHotspotRerankFixture(t *testing.T, now time.Time) (graph.Store, []analysis.HotspotEntry) { t.Helper() g := graph.New() diff --git a/internal/mcp/tools_analyze_impact.go b/internal/mcp/tools_analyze_impact.go index 4235c695..8db320b2 100644 --- a/internal/mcp/tools_analyze_impact.go +++ b/internal/mcp/tools_analyze_impact.go @@ -9,6 +9,7 @@ import ( mcp "github.com/mark3labs/mcp-go/mcp" + "github.com/zzet/gortex/internal/analysis" "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/reach" ) @@ -135,14 +136,61 @@ func (s *Server) handleAnalyzeImpactComposite(ctx context.Context, req mcp.CallT nodeToComm = c.NodeToComm } - // One edge pass builds direct fan-in plus, per symbol, the set of - // distinct communities its call/reference neighbours belong to. - fanIn := map[string]int{} + // Build the candidate id set up front so both the fan-in + // aggregator and the per-edge community walk stay bounded by + // the kinds / path / ids the caller actually asked for. Without + // this, the analyzer paid for an unfiltered AllEdges() + // materialisation per call -- ~500k edges over cgo on the gortex + // workspace, the bulk of the wall-clock cost on Ladybug. + scoped := s.scopedNodes(ctx) + candidateIDs := make([]string, 0, len(scoped)) + candidateSet := make(map[string]struct{}, len(scoped)) + for _, n := range scoped { + if n == nil { + continue + } + if allowedKinds != nil { + if _, ok := allowedKinds[n.Kind]; !ok { + continue + } + } + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue + } + if len(idFilter) > 0 { + if _, ok := idFilter[n.ID]; !ok { + continue + } + } + candidateIDs = append(candidateIDs, n.ID) + candidateSet[n.ID] = struct{}{} + } + + // fan-in: uses the NodeFanAggregator capability when the + // backend supports it (one bulk Cypher per direction over the + // candidate id set) and falls back to a per-kind EdgesByKind + // stream otherwise. fanOutKinds is empty -- impact only reads + // fan-in. + fanIn, _ := analysis.CollectFanCounts(s.graph, candidateIDs, + []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}, + nil, + ) + + // neighborComms[n] = set of distinct communities of n's call / + // reference neighbours (both directions). Streamed via + // EdgesByKind per kind so neither backend pays for an + // unfiltered AllEdges walk; the per-kind MATCH on disk backends + // is the same plan EdgesByKind feeds every other analyzer. + // Membership is restricted to candidate ids -- a node outside + // the result set has nowhere to receive a span count. neighborComms := map[string]map[string]struct{}{} addComm := func(node, comm string) { if comm == "" { return } + if _, ok := candidateSet[node]; !ok { + return + } set := neighborComms[node] if set == nil { set = map[string]struct{}{} @@ -150,29 +198,23 @@ func (s *Server) handleAnalyzeImpactComposite(ctx context.Context, req mcp.CallT } set[comm] = struct{}{} } - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { - continue - } - fanIn[e.To]++ - addComm(e.From, nodeToComm[e.To]) - addComm(e.To, nodeToComm[e.From]) - } - - rows := make([]impactRow, 0, 128) - for _, n := range s.scopedNodes(ctx) { - if allowedKinds != nil { - if _, ok := allowedKinds[n.Kind]; !ok { + for _, kind := range []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} { + for e := range s.graph.EdgesByKind(kind) { + if e == nil { continue } + addComm(e.From, nodeToComm[e.To]) + addComm(e.To, nodeToComm[e.From]) } - if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + } + + rows := make([]impactRow, 0, len(candidateIDs)) + for _, n := range scoped { + if n == nil { continue } - if len(idFilter) > 0 { - if _, ok := idFilter[n.ID]; !ok { - continue - } + if _, ok := candidateSet[n.ID]; !ok { + continue } prVal := pr.ScoreOf(n.ID) diff --git a/internal/mcp/tools_analyze_infra.go b/internal/mcp/tools_analyze_infra.go index f15b1427..5537e3aa 100644 --- a/internal/mcp/tools_analyze_infra.go +++ b/internal/mcp/tools_analyze_infra.go @@ -67,12 +67,14 @@ func (s *Server) handleAnalyzeK8sResources(ctx context.Context, req mcp.CallTool c.usesEnv++ } } - for _, e := range s.graph.AllEdges() { - switch e.Kind { - case graph.EdgeDependsOn, graph.EdgeConfigures, graph.EdgeMounts, - graph.EdgeExposes, graph.EdgeUsesEnv: - bump(e.From, e.Kind) - } + for e := range edgesByKinds(s.graph, + graph.EdgeDependsOn, + graph.EdgeConfigures, + graph.EdgeMounts, + graph.EdgeExposes, + graph.EdgeUsesEnv, + ) { + bump(e.From, e.Kind) } var rows []*resourceRow @@ -148,10 +150,7 @@ func (s *Server) handleAnalyzeImages(ctx context.Context, req mcp.CallToolReques } consumers := make(map[string]int) - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeDependsOn { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeDependsOn) { consumers[e.To]++ } @@ -227,11 +226,8 @@ func (s *Server) handleAnalyzeKustomize(ctx context.Context, req mcp.CallToolReq c.res++ } } - for _, e := range s.graph.AllEdges() { - switch e.Kind { - case graph.EdgeDependsOn, graph.EdgeReferences: - bump(e.From, e.Kind) - } + for e := range edgesByKinds(s.graph, graph.EdgeDependsOn, graph.EdgeReferences) { + bump(e.From, e.Kind) } var rows []*overlayRow diff --git a/internal/mcp/tools_analyze_infra_test.go b/internal/mcp/tools_analyze_infra_test.go index fe13942b..2a78550e 100644 --- a/internal/mcp/tools_analyze_infra_test.go +++ b/internal/mcp/tools_analyze_infra_test.go @@ -33,7 +33,7 @@ func callAnalyzeInfra(t *testing.T, srv *Server, kind string, args map[string]an return out } -func seedK8sFixture(g *graph.Graph) { +func seedK8sFixture(g graph.Store) { deploy := &graph.Node{ ID: "k8s::Deployment::prod::api", Kind: graph.KindResource, Name: "api", FilePath: "k8s/api.yaml", StartLine: 1, diff --git a/internal/mcp/tools_analyze_kcore.go b/internal/mcp/tools_analyze_kcore.go new file mode 100644 index 00000000..5efaf971 --- /dev/null +++ b/internal/mcp/tools_analyze_kcore.go @@ -0,0 +1,142 @@ +// kcore — find the densely connected core of the graph. +// +// k-core decomposition assigns every node a k-degree: the largest +// k for which the node remains in the k-core after iteratively +// pruning nodes with degree < k. Nodes with high k-degree sit at +// the densely connected centre of the graph — useful for "what's +// the core infrastructure every other layer depends on", and as a +// complement to PageRank (which weights by random-walk authority, +// not local density). +// +// Routing: +// +// - When the backing graph.Store implements graph.KCorer (today +// only store_ladybug), the analyzer delegates to the engine- +// native parallel implementation. +// +// - Otherwise analysis.ComputeKCore runs in-process. The +// implementation is the classic Batagelj & Zaversnik bucket +// algorithm — O(V + E), no recursion. + +package mcp + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/analysis" + "github.com/zzet/gortex/internal/graph" +) + +// kcoreRow is the per-symbol shape the analyzer returns. +type kcoreRow struct { + ID string `json:"id"` + Name string `json:"name,omitempty"` + Kind string `json:"kind,omitempty"` + FilePath string `json:"file_path,omitempty"` + Line int `json:"line,omitempty"` + KDegree int `json:"k_degree"` +} + +func (s *Server) handleAnalyzeKCore(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + args := req.GetArguments() + + limit := 20 + if v, ok := args["limit"].(float64); ok && v > 0 { + limit = int(v) + } + minDegree := 0 + if v, ok := args["min_degree"].(float64); ok && v > 0 { + minDegree = int(v) + } + + hits := s.runKCore(graph.KCoreOpts{ + NodeKinds: parseKindFilter(stringArg(args, "node_kinds")), + }) + + // Filter by min_degree (drop trivial low-core nodes), then cap. + if minDegree > 0 { + filtered := hits[:0] + for _, h := range hits { + if h.KDegree >= int64(minDegree) { + filtered = append(filtered, h) + } + } + hits = filtered + } + if limit > 0 && limit < len(hits) { + hits = hits[:limit] + } + + // Batch-materialise hit nodes in one backend round-trip — same + // rationale as analyze(pagerank). Preserves the descending + // k-degree order from runKCore. + ids := make([]string, 0, len(hits)) + for _, h := range hits { + if h.NodeID != "" { + ids = append(ids, h.NodeID) + } + } + nodeByID := s.graph.GetNodesByIDs(ids) + + rows := make([]kcoreRow, 0, len(hits)) + for _, h := range hits { + row := kcoreRow{ID: h.NodeID, KDegree: int(h.KDegree)} + if n := nodeByID[h.NodeID]; n != nil { + row.Name = n.Name + row.Kind = string(n.Kind) + row.FilePath = n.FilePath + row.Line = n.StartLine + } + rows = append(rows, row) + } + + if s.isGCX(ctx, req) { + return s.gcxResponseWithBudget(req)(encodeAnalyze("kcore", rows)) + } + if isCompact(req) { + var b strings.Builder + for _, r := range rows { + fmt.Fprintf(&b, "%s %s %s:%d k=%d\n", r.Kind, r.ID, r.FilePath, r.Line, r.KDegree) + } + return mcp.NewToolResultText(b.String()), nil + } + return s.respondJSONOrTOON(ctx, req, map[string]any{"kcore": rows, "count": len(rows)}) +} + +// runKCore picks the engine-native KCorer when available, +// otherwise falls back to the in-process implementation. Returns +// hits sorted by k-degree descending (the engine-native CALL +// returns them unordered; the in-process ComputeKCore returns +// already sorted — normalise both here so the handler doesn't +// have to re-sort). +func (s *Server) runKCore(opts graph.KCoreOpts) []graph.KCoreHit { + if store := s.backendStore(); store != nil { + if kc, ok := store.(graph.KCorer); ok { + hits, err := kc.KCoreDecomposition(opts) + if err == nil { + sort.Slice(hits, func(i, j int) bool { + if hits[i].KDegree != hits[j].KDegree { + return hits[i].KDegree > hits[j].KDegree + } + return hits[i].NodeID < hits[j].NodeID + }) + return hits + } + // Engine-native error falls through. + } + } + res := analysis.ComputeKCore(s.graph, analysis.KCoreOptions{ + NodeKinds: opts.NodeKinds, + EdgeKinds: opts.EdgeKinds, + }) + out := make([]graph.KCoreHit, len(res)) + for i, h := range res { + out[i] = graph.KCoreHit{NodeID: h.NodeID, KDegree: int64(h.KDegree)} + } + return out +} diff --git a/internal/mcp/tools_analyze_orphan_tables_test.go b/internal/mcp/tools_analyze_orphan_tables_test.go index 9ad3295e..6c57fb1b 100644 --- a/internal/mcp/tools_analyze_orphan_tables_test.go +++ b/internal/mcp/tools_analyze_orphan_tables_test.go @@ -33,7 +33,7 @@ func callAnalyzeOrphanTables(t *testing.T, srv *Server, args map[string]any) map // addTable + addQuery + addMigration are tiny helpers that mirror the // shape the indexer produces. Kept inside the test so it doesn't grow // production-side scaffolding. -func addTable(g *graph.Graph, id, table, dialect string) { +func addTable(g graph.Store, id, table, dialect string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindTable, @@ -45,7 +45,7 @@ func addTable(g *graph.Graph, id, table, dialect string) { }) } -func addQueryEdge(g *graph.Graph, fromID, toID string) { +func addQueryEdge(g graph.Store, fromID, toID string) { g.AddEdge(&graph.Edge{ From: fromID, To: toID, @@ -53,7 +53,7 @@ func addQueryEdge(g *graph.Graph, fromID, toID string) { }) } -func addMigrationEdge(g *graph.Graph, fromID, toID string) { +func addMigrationEdge(g graph.Store, fromID, toID string) { g.AddEdge(&graph.Edge{ From: fromID, To: toID, diff --git a/internal/mcp/tools_analyze_ownership_test.go b/internal/mcp/tools_analyze_ownership_test.go index b5042b7e..a6496b73 100644 --- a/internal/mcp/tools_analyze_ownership_test.go +++ b/internal/mcp/tools_analyze_ownership_test.go @@ -33,7 +33,7 @@ func callAnalyzeOwnership(t *testing.T, srv *Server, args map[string]any) map[st // addBlameNode wires a function node with synthetic last_authored // meta keyed off email + timestamp. -func addBlameNode(g *graph.Graph, id, file, email string, ts int64) { +func addBlameNode(g graph.Store, id, file, email string, ts int64) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindFunction, diff --git a/internal/mcp/tools_analyze_pagerank.go b/internal/mcp/tools_analyze_pagerank.go new file mode 100644 index 00000000..14cf7ed9 --- /dev/null +++ b/internal/mcp/tools_analyze_pagerank.go @@ -0,0 +1,277 @@ +// pagerank — graph-EXTRACTION-flavoured centrality analysis. +// +// analyze kind=pagerank ranks symbols by PageRank authority: a +// symbol is "central" when central symbols depend on it, so a +// rarely-called API that's invoked from every domain layer ranks +// higher than a heavily-called test helper. This is qualitatively +// different from the degree-based `hotspots` analyzer — random-walk +// authority weights influence by reach, not by raw fan-in count. +// +// Routing: +// +// - When the backing graph.Store implements graph.PageRanker +// (today only store_ladybug), the analyzer delegates to the +// engine-native parallel implementation (Ligra-based). Saves +// the per-call cost of a fresh Go-side power iteration. +// +// - Otherwise (in-memory store), falls back to +// analysis.ComputePageRank — the same pure-Go implementation +// the search rerank pipeline consumes via the cached +// Server.pageRank field. + +package mcp + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/analysis" + "github.com/zzet/gortex/internal/graph" +) + +// pageRankRow is the per-symbol shape the analyzer returns. +type pageRankRow struct { + ID string `json:"id"` + Name string `json:"name,omitempty"` + Kind string `json:"kind,omitempty"` + FilePath string `json:"file_path,omitempty"` + Line int `json:"line,omitempty"` + Rank float64 `json:"rank"` +} + +func (s *Server) handleAnalyzePageRank(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + args := req.GetArguments() + + limit := 20 + if v, ok := args["limit"].(float64); ok && v > 0 { + limit = int(v) + } + damping := 0.0 + if v, ok := args["damping"].(float64); ok && v > 0 && v < 1 { + damping = v + } + maxIter := 0 + if v, ok := args["max_iterations"].(float64); ok && v > 0 { + maxIter = int(v) + } + tolerance := 0.0 + if v, ok := args["tolerance"].(float64); ok && v > 0 { + tolerance = v + } + nodeKinds := parseKindFilter(stringArg(args, "node_kinds")) + + hits := s.runPageRank(graph.PageRankOpts{ + NodeKinds: nodeKinds, + DampingFactor: damping, + MaxIterations: maxIter, + Tolerance: tolerance, + Limit: limit, + }) + + // Batch-materialise hit nodes in one backend round-trip instead + // of per-id GetNode. On Ladybug each GetNode is a cgo Cypher + // call; on the default limit (20) the per-id path issued 20 + // cgo round-trips per pagerank invocation. Single GetNodesByIDs + // collapses that into one bulk query while preserving rank order + // (the local map lookup is keyed by NodeID). + ids := make([]string, 0, len(hits)) + for _, h := range hits { + if h.NodeID != "" { + ids = append(ids, h.NodeID) + } + } + nodeByID := s.graph.GetNodesByIDs(ids) + + rows := make([]pageRankRow, 0, len(hits)) + for _, h := range hits { + row := pageRankRow{ID: h.NodeID, Rank: h.Rank} + if n := nodeByID[h.NodeID]; n != nil { + row.Name = n.Name + row.Kind = string(n.Kind) + row.FilePath = n.FilePath + row.Line = n.StartLine + } + rows = append(rows, row) + } + + if s.isGCX(ctx, req) { + return s.gcxResponseWithBudget(req)(encodeAnalyze("pagerank", rows)) + } + if isCompact(req) { + var b strings.Builder + for _, r := range rows { + fmt.Fprintf(&b, "%s %s %s:%d rank=%.6f\n", r.Kind, r.ID, r.FilePath, r.Line, r.Rank) + } + return mcp.NewToolResultText(b.String()), nil + } + return s.respondJSONOrTOON(ctx, req, map[string]any{"pagerank": rows, "count": len(rows)}) +} + +// runPageRank picks the engine-native PageRanker when the +// backing store implements it, otherwise falls back to the +// in-process power iteration. +func (s *Server) runPageRank(opts graph.PageRankOpts) []graph.PageRankHit { + if store := s.backendStore(); store != nil { + if pr, ok := store.(graph.PageRanker); ok { + hits, err := pr.PageRank(opts) + if err == nil { + return hits + } + // Fall through to the in-process path on backend + // error rather than surface a half-completed + // result; engine-native is a hot path optimisation, + // not the source of truth. + } + } + // Fallback: pure-Go power iteration on the in-memory mirror. + // analysis.ComputePageRank doesn't accept the same options + // as the engine-native call yet — it uses fixed damping / + // iteration constants — so opts.DampingFactor / MaxIterations + // / Tolerance are silently ignored on the fallback path. The + // NodeKinds filter is honoured by post-filtering the result. + res := analysis.ComputePageRank(s.graph) + if res == nil || len(res.Scores) == 0 { + return nil + } + allow := makeKindAllow(opts.NodeKinds) + hits := make([]graph.PageRankHit, 0, len(res.Scores)) + for id, rank := range res.Scores { + if !allow(s.graph.GetNode(id)) { + continue + } + hits = append(hits, graph.PageRankHit{NodeID: id, Rank: rank}) + } + sort.Slice(hits, func(i, j int) bool { return hits[i].Rank > hits[j].Rank }) + if opts.Limit > 0 && opts.Limit < len(hits) { + hits = hits[:opts.Limit] + } + return hits +} + +// backendStore returns the underlying graph.Store the indexer +// writes to — which is what implements the capability interfaces +// (PageRanker, CommunityDetector, …). Falls back to s.graph when +// no indexer is wired so test fixtures keep working. +func (s *Server) backendStore() graph.Store { + if s.indexer != nil { + return s.indexer.Graph() + } + return s.graph +} + +// parseKindFilter parses a comma-separated list of graph node +// kinds (e.g. "function,method,type") into a typed slice. Empty +// input → empty slice (caller treats that as "no filter"). +func parseKindFilter(in string) []graph.NodeKind { + in = strings.TrimSpace(in) + if in == "" { + return nil + } + parts := strings.Split(in, ",") + out := make([]graph.NodeKind, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p == "" { + continue + } + out = append(out, graph.NodeKind(p)) + } + return out +} + +// handleAnalyzeLouvain returns the Louvain partitioning of the +// graph. When the backing store implements graph.CommunityDetector +// (today only store_ladybug), the partitioning is delegated to the +// engine-native implementation and threaded through the existing +// label / hub / cohesion / parent post-processing +// (analysis.DetectCommunitiesLouvainBackend) so the response is +// shape-identical to the in-process path. Otherwise the in-process +// DetectCommunitiesLouvain runs. +// +// Distinct from `analyze kind=clusters` which uses the Leiden +// algorithm (the Server's cached communities). Louvain produces +// different — typically more granular — partitions; this kind +// exposes it as a first-class result for clients that want the +// Louvain shape specifically. +func (s *Server) handleAnalyzeLouvain(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + limit := 50 + if v, ok := req.GetArguments()["limit"].(float64); ok && v > 0 { + limit = int(v) + } + + result := s.runLouvain() + if result == nil { + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "communities": []any{}, + "modularity": 0.0, + }) + } + + communities := result.Communities + if limit > 0 && limit < len(communities) { + communities = communities[:limit] + } + + if s.isGCX(ctx, req) { + return s.gcxResponseWithBudget(req)(encodeAnalyze("louvain", map[string]any{ + "communities": communities, + "modularity": result.Modularity, + })) + } + if isCompact(req) { + var b strings.Builder + fmt.Fprintf(&b, "modularity=%.4f communities=%d\n", result.Modularity, len(result.Communities)) + for _, c := range communities { + fmt.Fprintf(&b, " %s size=%d cohesion=%.3f label=%s hub=%s\n", + c.ID, c.Size, c.Cohesion, c.Label, c.Hub) + } + return mcp.NewToolResultText(b.String()), nil + } + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "communities": communities, + "modularity": result.Modularity, + "total": len(result.Communities), + }) +} + +// runLouvain picks the engine-native CommunityDetector when the +// backing store implements it, otherwise falls back to the +// pure-Go in-process Louvain. The output shape is identical +// either way (analysis.DetectCommunitiesLouvainBackend threads +// the engine-native partition through the same post-processing). +func (s *Server) runLouvain() *analysis.CommunityResult { + if store := s.backendStore(); store != nil { + if cd, ok := store.(graph.CommunityDetector); ok { + if r := analysis.DetectCommunitiesLouvainBackend(s.graph, cd); r != nil { + return r + } + // Engine-native error path falls through to the + // in-process implementation rather than surfacing + // a half-completed result. + } + } + return analysis.DetectCommunitiesLouvain(s.graph) +} + +// makeKindAllow returns a predicate that reports whether a node's +// kind passes the filter. nil node is always rejected (defensive). +func makeKindAllow(kinds []graph.NodeKind) func(*graph.Node) bool { + if len(kinds) == 0 { + return func(n *graph.Node) bool { return n != nil } + } + set := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + set[k] = struct{}{} + } + return func(n *graph.Node) bool { + if n == nil { + return false + } + _, ok := set[n.Kind] + return ok + } +} diff --git a/internal/mcp/tools_analyze_pubsub_test.go b/internal/mcp/tools_analyze_pubsub_test.go index 1675cb49..d860bc87 100644 --- a/internal/mcp/tools_analyze_pubsub_test.go +++ b/internal/mcp/tools_analyze_pubsub_test.go @@ -30,7 +30,7 @@ func callAnalyzePubsub(t *testing.T, srv *Server, args map[string]any) map[strin return out } -func addPubsubTopic(g *graph.Graph, id, name, transport string) { +func addPubsubTopic(g graph.Store, id, name, transport string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindEvent, @@ -39,7 +39,7 @@ func addPubsubTopic(g *graph.Graph, id, name, transport string) { }) } -func addListensOnEdge(g *graph.Graph, from, to string) { +func addListensOnEdge(g graph.Store, from, to string) { g.AddEdge(&graph.Edge{From: from, To: to, Kind: graph.EdgeListensOn}) } diff --git a/internal/mcp/tools_analyze_role.go b/internal/mcp/tools_analyze_role.go index 7d7c1ee2..a07ac165 100644 --- a/internal/mcp/tools_analyze_role.go +++ b/internal/mcp/tools_analyze_role.go @@ -103,7 +103,7 @@ func (s *Server) handleAnalyzeRole(ctx context.Context, req mcp.CallToolRequest) // the first matching label. Rules are deliberately conservative; // false-negatives (defaulting to "core") are preferable to noisy // false-positives on a label that pretends to be authoritative. -func classifyRole(n *graph.Node, fanIn, fanOut int, g *graph.Graph, nodeToComm map[string]string) string { +func classifyRole(n *graph.Node, fanIn, fanOut int, g graph.Store, nodeToComm map[string]string) string { switch { case fanIn == 0 && fanOut == 0: return "dead" diff --git a/internal/mcp/tools_analyze_stale_code_test.go b/internal/mcp/tools_analyze_stale_code_test.go index c9ca9143..9e185a82 100644 --- a/internal/mcp/tools_analyze_stale_code_test.go +++ b/internal/mcp/tools_analyze_stale_code_test.go @@ -13,7 +13,7 @@ import ( // addBlameEnrichedNode wires a function node with synthetic // last_authored meta — emulating what blame.EnrichGraph would have // produced after a real run. -func addBlameEnrichedNode(g *graph.Graph, id, file string, line int, email, commit string, ageDays int) { +func addBlameEnrichedNode(g graph.Store, id, file string, line int, email, commit string, ageDays int) { ts := time.Now().Add(-time.Duration(ageDays*24) * time.Hour).Unix() g.AddNode(&graph.Node{ ID: id, diff --git a/internal/mcp/tools_analyze_stale_flags_test.go b/internal/mcp/tools_analyze_stale_flags_test.go index a0eab3c9..59d44f29 100644 --- a/internal/mcp/tools_analyze_stale_flags_test.go +++ b/internal/mcp/tools_analyze_stale_flags_test.go @@ -33,7 +33,7 @@ func callAnalyzeStaleFlags(t *testing.T, srv *Server, args map[string]any) map[s // addFlagWithCallers wires a flag node + N caller functions, each // stamped with last_authored.timestamp = ageDays ago. -func addFlagWithCallers(g *graph.Graph, flagID, provider, name string, callers map[string]int /* callerID → ageDays */) { +func addFlagWithCallers(g graph.Store, flagID, provider, name string, callers map[string]int /* callerID → ageDays */) { g.AddNode(&graph.Node{ ID: flagID, Kind: graph.KindFlag, diff --git a/internal/mcp/tools_analyze_string_downstream.go b/internal/mcp/tools_analyze_string_downstream.go index 9941c005..faf96bc3 100644 --- a/internal/mcp/tools_analyze_string_downstream.go +++ b/internal/mcp/tools_analyze_string_downstream.go @@ -52,10 +52,7 @@ func (s *Server) handleAnalyzeLogEvents(ctx context.Context, req mcp.CallToolReq Emitters []string `json:"emitters,omitempty"` } byString := map[string]*logRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeEmits { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeEmits) { n := s.graph.GetNode(e.To) if n == nil || n.Kind != graph.KindString { continue @@ -224,10 +221,7 @@ func (s *Server) handleAnalyzeSQLCallSites(ctx context.Context, req mcp.CallTool Writes int `json:"writes"` } bySite := map[string]*sqlCallSite{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeQueries { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeQueries) { row, ok := bySite[e.From] if !ok { name, file := e.From, "" diff --git a/internal/mcp/tools_analyze_string_downstream_test.go b/internal/mcp/tools_analyze_string_downstream_test.go index e7bbc1f1..8fc6f568 100644 --- a/internal/mcp/tools_analyze_string_downstream_test.go +++ b/internal/mcp/tools_analyze_string_downstream_test.go @@ -36,7 +36,7 @@ func callAnalyze(t *testing.T, srv *Server, kind string, extra map[string]any) m // addEmitToKindString builds a (caller, KindString) emit pair with // the given context and meta. Used by the registry-downstream // analyzers' tests. -func addEmitToKindString(g *graph.Graph, caller, strID, value, ctx string, nodeMeta, edgeMeta map[string]any) { +func addEmitToKindString(g graph.Store, caller, strID, value, ctx string, nodeMeta, edgeMeta map[string]any) { meta := map[string]any{ "context": ctx, "value": value, diff --git a/internal/mcp/tools_analyze_string_emitters.go b/internal/mcp/tools_analyze_string_emitters.go index d96c8e58..6b51087d 100644 --- a/internal/mcp/tools_analyze_string_emitters.go +++ b/internal/mcp/tools_analyze_string_emitters.go @@ -34,10 +34,7 @@ func (s *Server) handleAnalyzeStringEmitters(ctx context.Context, req mcp.CallTo Emitters []string `json:"emitters,omitempty"` } byString := map[string]*stringRow{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeEmits { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeEmits) { n := s.graph.GetNode(e.To) if n == nil || n.Kind != graph.KindString { continue diff --git a/internal/mcp/tools_analyze_string_emitters_test.go b/internal/mcp/tools_analyze_string_emitters_test.go index 4406bdad..ca3aa829 100644 --- a/internal/mcp/tools_analyze_string_emitters_test.go +++ b/internal/mcp/tools_analyze_string_emitters_test.go @@ -30,7 +30,7 @@ func callAnalyzeStringEmitters(t *testing.T, srv *Server, args map[string]any) m return out } -func addStringNode(g *graph.Graph, id, value, ctx string) { +func addStringNode(g graph.Store, id, value, ctx string) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindString, @@ -39,7 +39,7 @@ func addStringNode(g *graph.Graph, id, value, ctx string) { }) } -func addStringEmitEdge(g *graph.Graph, from, to, ctx, method string) { +func addStringEmitEdge(g graph.Store, from, to, ctx, method string) { g.AddEdge(&graph.Edge{ From: from, To: to, diff --git a/internal/mcp/tools_analyze_tests.go b/internal/mcp/tools_analyze_tests.go index 6e24d98d..40e3a0ef 100644 --- a/internal/mcp/tools_analyze_tests.go +++ b/internal/mcp/tools_analyze_tests.go @@ -57,10 +57,7 @@ func (s *Server) handleAnalyzeTestsAsEdges(ctx context.Context, req mcp.CallTool testsBySymbol := make(map[string][]string) symbolsByTest := make(map[string][]string) edgeCount := 0 - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeTests { - continue - } + for e := range edgesByKinds(s.graph, graph.EdgeTests) { edgeCount++ testsBySymbol[e.To] = append(testsBySymbol[e.To], e.From) symbolsByTest[e.From] = append(symbolsByTest[e.From], e.To) @@ -71,9 +68,27 @@ func (s *Server) handleAnalyzeTestsAsEdges(ctx context.Context, req mcp.CallTool primary = symbolsByTest } + // Batch-fetch every primary key and every related ID in one bulk + // round-trip. On a repo with thousands of EdgeTests edges the old + // per-id GetNode pattern burned one cgo Cypher call per row plus + // one per related ID on Ladybug — easily 5-10k round-trips per + // analyze kind=tests_as_edges call. + idSet := make(map[string]struct{}, len(primary)) + for id, relatedIDs := range primary { + idSet[id] = struct{}{} + for _, rid := range relatedIDs { + idSet[rid] = struct{}{} + } + } + allIDs := make([]string, 0, len(idSet)) + for id := range idSet { + allIDs = append(allIDs, id) + } + nodeByID := s.graph.GetNodesByIDs(allIDs) + rows := make([]testEdgeRow, 0, len(primary)) for id, relatedIDs := range primary { - n := s.graph.GetNode(id) + n := nodeByID[id] if n == nil { continue } @@ -88,7 +103,7 @@ func (s *Server) handleAnalyzeTestsAsEdges(ctx context.Context, req mcp.CallTool } seen[rid] = true name := rid - if rn := s.graph.GetNode(rid); rn != nil { + if rn := nodeByID[rid]; rn != nil { name = rn.Name } related = append(related, testEdgeRef{ID: rid, Name: name}) diff --git a/internal/mcp/tools_analyze_todos_test.go b/internal/mcp/tools_analyze_todos_test.go index e2960fcd..2eaff6fe 100644 --- a/internal/mcp/tools_analyze_todos_test.go +++ b/internal/mcp/tools_analyze_todos_test.go @@ -12,7 +12,7 @@ import ( // addTodoNode is a small helper for these tests — wires a KindTodo // node directly into the graph without going through the indexer's // per-file pipeline. -func addTodoNode(g *graph.Graph, id string, line int, meta map[string]any) { +func addTodoNode(g graph.Store, id string, line int, meta map[string]any) { g.AddNode(&graph.Node{ ID: id, Kind: graph.KindTodo, diff --git a/internal/mcp/tools_architecture.go b/internal/mcp/tools_architecture.go index 78887a2c..d52e9f2a 100644 --- a/internal/mcp/tools_architecture.go +++ b/internal/mcp/tools_architecture.go @@ -2,6 +2,7 @@ package mcp import ( "context" + "maps" "sort" "strings" @@ -62,23 +63,37 @@ func (s *Server) handleGetArchitecture(ctx context.Context, req mcp.CallToolRequ topEntryPoints := max(req.GetInt("top_entry_points", 10), 1) pathPrefix := strings.TrimSpace(req.GetString("path_prefix", "")) - scoped := s.scopedNodes(ctx) - inScope := make(map[string]*graph.Node, len(scoped)) - for _, n := range scoped { - if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { - continue + // scoped + inScope are only needed when the session is bound or + // the caller supplied a path-prefix narrowing. Otherwise every + // node is in scope and downstream membership tests are tautologies + // the helpers handle via nil inScope. + _, _, bound := s.sessionScope(ctx) + needScoped := bound || pathPrefix != "" + var scoped []*graph.Node + var inScope map[string]bool + var totalNodesScoped int + if needScoped { + scoped = s.scopedNodes(ctx) + inScope = make(map[string]bool, len(scoped)) + for _, n := range scoped { + if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { + continue + } + inScope[n.ID] = true } - inScope[n.ID] = n + totalNodesScoped = len(inScope) + } else { + totalNodesScoped = s.graph.NodeCount() } // 1. Summary — language mix + node/edge counts. - summary := architectureSummary(scoped, inScope, s.graph) + summary := architectureSummary(scoped, inScope, totalNodesScoped, s.graph) // 2. Communities — same shape as the outline tool, capped here. communitiesSection := architectureCommunities(s.getCommunities(), inScope, topCommunities) // 3. Hotspots — load-bearing symbols, scoped + capped. - hotspots := architectureHotspots(s.graph, s.getCommunities(), inScope, topHotspots) + hotspots := architectureHotspots(s.getHotspots(), inScope, topHotspots) // 4. Entry points — functions with zero in-edges that have // out-edges (called by no one, calls into the system). Sorted @@ -127,7 +142,7 @@ func (s *Server) handleGetArchitecture(ctx context.Context, req mcp.CallToolRequ // unrecognised tier returns ("", message) so the handler can surface a // clean error. Otherwise it rolls the base graph up to the requested // tier via analysis.BuildHierarchy and returns the wire shape. -func architectureHierarchy(g *graph.Graph, cr *analysis.CommunityResult, resolution string) (map[string]any, string) { +func architectureHierarchy(g graph.Store, cr *analysis.CommunityResult, resolution string) (map[string]any, string) { resolution = strings.ToLower(strings.TrimSpace(resolution)) if resolution == "" { return nil, "" @@ -169,11 +184,23 @@ func architectureHierarchy(g *graph.Graph, cr *analysis.CommunityResult, resolut // architectureSummary builds the language mix + node/edge count // header. Edges are bounded to the scoped subgraph so multi-repo -// callers don't see cross-workspace numbers. -func architectureSummary(allScoped []*graph.Node, inScope map[string]*graph.Node, g *graph.Graph) map[string]any { +// callers don't see cross-workspace numbers. nil inScope is the +// signal that every node is in scope — the helper short-circuits +// the lang count through Stats() and the edge count through +// EdgeCount() rather than materialising the whole graph over cgo. +func architectureSummary(allScoped []*graph.Node, inScope map[string]bool, totalNodes int, g graph.Store) map[string]any { langCounts := map[string]int{} - for _, n := range inScope { - if n.Language != "" { + if inScope == nil { + // Unbound session + no path-prefix — pull the aggregate from + // the backend's cached stats. One indexed groupby vs a + // whole-table scan over cgo. + stats := g.Stats() + maps.Copy(langCounts, stats.ByLanguage) + } else { + for _, n := range allScoped { + if !inScope[n.ID] || n.Language == "" { + continue + } langCounts[n.Language]++ } } @@ -192,15 +219,23 @@ func architectureSummary(allScoped []*graph.Node, inScope map[string]*graph.Node return languages[i].Name < languages[j].Name }) - totalEdges := 0 - for _, e := range g.AllEdges() { - if _, ok := inScope[e.From]; !ok { - continue - } - if _, ok := inScope[e.To]; !ok { - continue + // Common case — unbound session + no path-prefix — every node + // is in scope so the edge count is exactly the backend's + // EdgeCount(), which is an O(1) lookup. Skips materialising + // every edge over cgo just to count them. + var totalEdges int + if inScope == nil { + totalEdges = g.EdgeCount() + } else { + for _, e := range g.AllEdges() { + if !inScope[e.From] { + continue + } + if !inScope[e.To] { + continue + } + totalEdges++ } - totalEdges++ } primary := "" @@ -208,32 +243,40 @@ func architectureSummary(allScoped []*graph.Node, inScope map[string]*graph.Node primary = languages[0].Name } + unscopedCount := totalNodes + if inScope != nil { + unscopedCount = len(allScoped) + } return map[string]any{ - "total_nodes": len(inScope), - "total_nodes_unscoped": len(allScoped), + "total_nodes": totalNodes, + "total_nodes_unscoped": unscopedCount, "total_edges": totalEdges, "primary_language": primary, "languages": languages, } } -func architectureCommunities(cr *analysis.CommunityResult, inScope map[string]*graph.Node, top int) map[string]any { +func architectureCommunities(cr *analysis.CommunityResult, inScope map[string]bool, top int) map[string]any { out := map[string]any{"count": 0} if cr == nil { return out } kept := make([]analysis.Community, 0, len(cr.Communities)) for _, c := range cr.Communities { - // Drop communities with no members in scope. - match := false - for _, m := range c.Members { - if _, ok := inScope[m]; ok { - match = true - break + // nil inScope means "every node is in scope" — keep the + // community unconditionally. Otherwise drop the community + // when no member lands inside the session's workspace. + if inScope != nil { + match := false + for _, m := range c.Members { + if inScope[m] { + match = true + break + } + } + if !match { + continue } - } - if !match { - continue } kept = append(kept, c) } @@ -261,13 +304,13 @@ func architectureCommunities(cr *analysis.CommunityResult, inScope map[string]*g return out } -func architectureHotspots(g *graph.Graph, cr *analysis.CommunityResult, inScope map[string]*graph.Node, top int) []map[string]any { +func architectureHotspots(hotspots []analysis.HotspotEntry, inScope map[string]bool, top int) []map[string]any { out := []map[string]any{} - for _, h := range analysis.FindHotspots(g, cr, 0) { + for _, h := range hotspots { if len(out) >= top { break } - if _, ok := inScope[h.ID]; !ok { + if inScope != nil && !inScope[h.ID] { continue } out = append(out, map[string]any{ @@ -284,24 +327,87 @@ func architectureHotspots(g *graph.Graph, cr *analysis.CommunityResult, inScope return out } -func architectureEntryPoints(inScope map[string]*graph.Node, g *graph.Graph, top int) []map[string]any { +// architectureEntryPoints returns functions/methods with zero +// incoming edges and at least one outgoing edge — the "called by +// no one, calls into the system" pattern. +// +// The candidate pool is either the kind-filtered subset of an in-scope +// node map (bound session / path-prefix narrowing) or — when inScope +// is nil — the function+method slice pulled directly from the storage +// layer via NodesByKindsScanner. The legacy code path walked the full +// scoped-nodes slice every call just to keep the callable subset. +// +// Uses NodeDegreeAggregator when the backend implements it (one +// batched in/out count instead of 2N GetInEdges/GetOutEdges cgo +// round-trips on Ladybug — the per-node loop was the entire +// wall-clock cost of this section on large repos). +func architectureEntryPoints(inScope map[string]bool, g graph.Store, top int) []map[string]any { type entryCandidate struct { node *graph.Node fanOut int } - cands := make([]entryCandidate, 0, len(inScope)) - for _, n := range inScope { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue + // Pre-filter on kind Go-side first. When inScope is nil pull + // only function/method via the kind scanner; otherwise project + // the same subset out of the supplied scope set. + var pool []*graph.Node + if inScope == nil { + if scan, ok := g.(graph.NodesByKindsScanner); ok { + pool = scan.NodesByKinds([]graph.NodeKind{graph.KindFunction, graph.KindMethod}) + } else { + all := g.AllNodes() + pool = make([]*graph.Node, 0, len(all)) + for _, n := range all { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + pool = append(pool, n) + } + } } - if len(g.GetInEdges(n.ID)) > 0 { - continue + } else { + // Materialise the callable subset out of the in-scope node + // id set. The caller's scoped slice already lives in memory, + // so this stays cheap — but the inScope map carries bools, + // not nodes, so we re-resolve via GetNode for each id. + pool = make([]*graph.Node, 0, len(inScope)) + for id := range inScope { + n := g.GetNode(id) + if n == nil { + continue + } + if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { + continue + } + pool = append(pool, n) } - out := len(g.GetOutEdges(n.ID)) - if out == 0 { - continue + } + cands := make([]entryCandidate, 0, len(pool)) + if agg, ok := g.(graph.NodeDegreeAggregator); ok && len(pool) > 0 { + ids := make([]string, 0, len(pool)) + byID := make(map[string]*graph.Node, len(pool)) + for _, n := range pool { + ids = append(ids, n.ID) + byID[n.ID] = n + } + for _, r := range agg.NodeDegreeCounts(ids, nil) { + if r.InCount > 0 || r.OutCount == 0 { + continue + } + n := byID[r.NodeID] + if n == nil { + continue + } + cands = append(cands, entryCandidate{node: n, fanOut: r.OutCount}) + } + } else { + for _, n := range pool { + if len(g.GetInEdges(n.ID)) > 0 { + continue + } + out := len(g.GetOutEdges(n.ID)) + if out == 0 { + continue + } + cands = append(cands, entryCandidate{node: n, fanOut: out}) } - cands = append(cands, entryCandidate{node: n, fanOut: out}) } sort.Slice(cands, func(i, j int) bool { if cands[i].fanOut != cands[j].fanOut { @@ -324,13 +430,13 @@ func architectureEntryPoints(inScope map[string]*graph.Node, g *graph.Graph, top return out } -func architectureProcesses(pr *analysis.ProcessResult, inScope map[string]*graph.Node, top int) []architectureProcess { +func architectureProcesses(pr *analysis.ProcessResult, inScope map[string]bool, top int) []architectureProcess { if pr == nil { return []architectureProcess{} } kept := make([]analysis.Process, 0, len(pr.Processes)) for _, p := range pr.Processes { - if _, ok := inScope[p.EntryPoint]; !ok { + if inScope != nil && !inScope[p.EntryPoint] { continue } kept = append(kept, p) @@ -361,22 +467,34 @@ func architectureProcesses(pr *analysis.ProcessResult, inScope map[string]*graph // architectureCrossRepo bundles every cross_repo_* edge into a // (from_repo, to_repo, kind) → count rollup. Empty list when no // cross-repo edges exist (single-repo mode). -func architectureCrossRepo(g *graph.Graph) []crossRepoRow { +// +// Picks the CrossRepoEdgeAggregator capability when the backend +// implements it (one Cypher GROUP BY replaces the AllEdges + +// per-edge GetNode pair — typically ~286k cgo edge rows + thousands +// of GetNode round-trips on Ladybug for <100 rows of output). Falls +// back to the AllEdges-driven loop on backends that don't. +func architectureCrossRepo(g graph.Store) []crossRepoRow { type key struct { kind, fromRepo, toRepo string } counts := map[key]int{} - for _, e := range g.AllEdges() { - if _, isCross := graph.BaseKindForCrossRepo(e.Kind); !isCross { - continue + if ag, ok := g.(graph.CrossRepoEdgeAggregator); ok { + for _, r := range ag.CrossRepoEdgeCounts() { + counts[key{kind: string(r.Kind), fromRepo: r.FromRepo, toRepo: r.ToRepo}] = r.Count } - from := g.GetNode(e.From) - to := g.GetNode(e.To) - if from == nil || to == nil { - continue + } else { + for _, e := range g.AllEdges() { + if _, isCross := graph.BaseKindForCrossRepo(e.Kind); !isCross { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + k := key{kind: string(e.Kind), fromRepo: from.RepoPrefix, toRepo: to.RepoPrefix} + counts[k]++ } - k := key{kind: string(e.Kind), fromRepo: from.RepoPrefix, toRepo: to.RepoPrefix} - counts[k]++ } rows := make([]crossRepoRow, 0, len(counts)) for k, c := range counts { diff --git a/internal/mcp/tools_ast.go b/internal/mcp/tools_ast.go index 427c2e43..af8b83ac 100644 --- a/internal/mcp/tools_ast.go +++ b/internal/mcp/tools_ast.go @@ -178,8 +178,14 @@ func (s *Server) buildASTTargets(language, pathPrefix string, allowedRepos map[s return nil, fmt.Errorf("search_ast: no graph available") } out := make([]astquery.Target, 0, 256) - for _, n := range s.graph.AllNodes() { - if n.Kind != graph.KindFile { + // File nodes are a fraction of the node table; iterating the + // KindFile bucket via NodesByKind lets the backend stream only + // those rows instead of materialising the full table over cgo. + // Repo / language / path filters compose AND, so they stay Go- + // side — they can't be projected onto the bucket index without + // duplicating the predicate set across both call sites. + for n := range s.graph.NodesByKind(graph.KindFile) { + if n == nil { continue } if allowedRepos != nil && n.RepoPrefix != "" && !allowedRepos[n.RepoPrefix] { @@ -227,7 +233,7 @@ func (s *Server) buildASTTargets(language, pathPrefix string, allowedRepos map[s // than `min` incoming edges. Without an enclosing symbol, the // match is preserved (we'd otherwise silently swallow file-level // matches that legitimately have no caller graph). -func filterByMinFanIn(g *graph.Graph, matches []astquery.Match, min int) []astquery.Match { +func filterByMinFanIn(g graph.Store, matches []astquery.Match, min int) []astquery.Match { if g == nil || min <= 0 { return matches } diff --git a/internal/mcp/tools_check_references.go b/internal/mcp/tools_check_references.go index 28080a44..f5329a8d 100644 --- a/internal/mcp/tools_check_references.go +++ b/internal/mcp/tools_check_references.go @@ -81,14 +81,38 @@ func (s *Server) handleCheckReferences(ctx context.Context, req mcp.CallToolRequ callers := map[string]bool{} totalEdges := 0 if target != nil { - for _, e := range s.graph.GetInEdges(target.ID) { + // Pre-filter the in-edges and batch-fetch the surviving + // `From` nodes in one round-trip. On Ladybug the per-edge + // GetNode pattern was a cgo Cypher call per inbound edge — + // for heavily-referenced symbols (hundreds of callers) the + // cost was dominant. One GetNodesByIDs gives us the same + // data in a single bulk query. + inEdges := s.graph.GetInEdges(target.ID) + fromIDs := make([]string, 0, len(inEdges)) + seenFrom := make(map[string]struct{}, len(inEdges)) + for _, e := range inEdges { if !isCheckRefEdge(e.Kind) { continue } if minTier != "" && !atOrAboveTier(string(e.Origin), minTier) { continue } - from := s.graph.GetNode(e.From) + if _, dup := seenFrom[e.From]; dup { + continue + } + seenFrom[e.From] = struct{}{} + fromIDs = append(fromIDs, e.From) + } + fromByID := s.graph.GetNodesByIDs(fromIDs) + + for _, e := range inEdges { + if !isCheckRefEdge(e.Kind) { + continue + } + if minTier != "" && !atOrAboveTier(string(e.Origin), minTier) { + continue + } + from := fromByID[e.From] if from != nil && excludeTests && isTestPath(from.FilePath) { continue } @@ -149,39 +173,13 @@ func (s *Server) handleCheckReferences(ctx context.Context, req mcp.CallToolRequ } } - // Importing-files scan — every node whose FilePath imports the - // target's FilePath. Today the graph encodes file-level imports - // via EdgeImports between file/import nodes; we walk those to - // answer "is the home package consumed at all?". - importingFiles := []string{} - if target != nil && target.FilePath != "" { - seen := map[string]bool{} - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } - toNode := s.graph.GetNode(e.To) - if toNode == nil { - continue - } - if toNode.FilePath != target.FilePath && toNode.ID != target.FilePath { - continue - } - fromNode := s.graph.GetNode(e.From) - if fromNode == nil { - continue - } - if excludeTests && isTestPath(fromNode.FilePath) { - continue - } - if seen[fromNode.FilePath] { - continue - } - seen[fromNode.FilePath] = true - importingFiles = append(importingFiles, fromNode.FilePath) - } - sort.Strings(importingFiles) - } + // Importing-files scan — every file whose nodes carry an + // EdgeImports edge into the target's FilePath. Backends that + // implement graph.FileImporters serve this from one Cypher join + // (no AllEdges() materialisation, no per-edge GetNode round- + // trip). The legacy AllEdges + per-edge GetNode loop stays as + // the fallback for backends that don't ship the capability. + importingFiles := s.collectImportingFiles(target, excludeTests) referenced := totalEdges > 0 || len(sameName) > 0 || len(importingFiles) > 0 @@ -199,6 +197,67 @@ func (s *Server) handleCheckReferences(ctx context.Context, req mcp.CallToolRequ }) } +// collectImportingFiles answers "which files import the file that +// holds target?". Prefers the graph.FileImporters capability when +// the backend implements it — that path runs one Cypher join +// instead of an AllEdges() scan plus 2× per-edge GetNode round-trip. +// Returns a sorted, deduplicated, optionally test-filtered slice +// of file paths. +// +// When target is nil or has no FilePath the question is undefined; +// returns an empty slice (consistent with the legacy behaviour). +func (s *Server) collectImportingFiles(target *graph.Node, excludeTests bool) []string { + importingFiles := []string{} + if target == nil || target.FilePath == "" { + return importingFiles + } + seen := map[string]bool{} + add := func(fromFile string) { + if fromFile == "" { + return + } + if excludeTests && isTestPath(fromFile) { + return + } + if seen[fromFile] { + return + } + seen[fromFile] = true + importingFiles = append(importingFiles, fromFile) + } + + if fi, ok := s.graph.(graph.FileImporters); ok { + for _, row := range fi.FileImporters(target.FilePath) { + add(row.FromFile) + } + sort.Strings(importingFiles) + return importingFiles + } + + // Fallback: pull every edge and filter Go-side. Identical + // pre-capability behaviour — only the cgo-heavy backend ever + // reaches this path. + for _, e := range s.graph.AllEdges() { + if e.Kind != graph.EdgeImports { + continue + } + toNode := s.graph.GetNode(e.To) + if toNode == nil { + continue + } + if toNode.FilePath != target.FilePath && toNode.ID != target.FilePath { + continue + } + fromNode := s.graph.GetNode(e.From) + if fromNode == nil { + continue + } + add(fromNode.FilePath) + } + sort.Strings(importingFiles) + return importingFiles +} + // isCheckRefEdge identifies edges that mean "this symbol is being // used". Mirrors safe_delete_symbol's referencing-edge filter so // the two tools agree on what "referenced" means. diff --git a/internal/mcp/tools_churn.go b/internal/mcp/tools_churn.go index 5c6aa027..68f0a2b4 100644 --- a/internal/mcp/tools_churn.go +++ b/internal/mcp/tools_churn.go @@ -4,27 +4,25 @@ import ( "context" "sort" "strings" - "time" "github.com/mark3labs/mcp-go/mcp" - "github.com/zzet/gortex/internal/blame" + "github.com/zzet/gortex/internal/graph" ) -// registerChurnRateTool wires get_churn_rate — a standalone MCP tool -// that exposes per-symbol git-commit density. The metric is already -// implicit in `analyze hotspots` (composite); this tool surfaces the -// raw number so refactor planning, code review, and bus-factor work -// can read it directly. +// registerChurnRateTool wires get_churn_rate — a pure graph scan over +// per-symbol churn metadata pre-computed by `gortex enrich churn`. // -// Computation: walk the scoped subgraph for function/method nodes, -// group by file_path, run `git blame -p` once per unique file, count -// distinct commits whose blame range intersects the symbol's line -// range. Bounded by file count, not symbol count. +// At read time the handler does NOT shell out to git. Every value it +// returns lives in n.Meta["churn"] on the node, populated either by +// the CLI/git-hook (which writes through the LadyBug backend) or by +// an in-process call to the enrich_churn MCP tool. When no node in +// scope has the data, the response is a structured error pointing +// the agent at the enrich command. func (s *Server) registerChurnRateTool() { s.addTool( mcp.NewTool("get_churn_rate", - mcp.WithDescription("Per-symbol git-commit density. For each function/method in scope, runs `git blame -p` once per unique file and counts distinct commits intersecting the symbol's line range. Returns {symbol_id, name, file, churn_rate (commits per active day), commit_count, age_days, last_author, last_commit_at}. Sort and filter by churn_rate or commit_count to find unstable abstractions, hidden coupling, and bus-factor risks. Pairs with `analyze hotspots` — that returns the composite; this returns the raw signal."), + mcp.WithDescription("Per-symbol git-commit density, read from pre-computed graph data. For each function/method in scope returns {symbol_id, name, file, churn_rate (commits per active day), commit_count, age_days, last_author, last_commit_at}. Sort and filter by churn_rate or commit_count to find unstable abstractions, hidden coupling, and bus-factor risks. Data is populated by `gortex enrich churn` (or the enrich_churn MCP tool); when nothing in scope has churn meta the tool returns a structured error with the suggested next command. No git subprocess at request time — sub-second on indexed repos."), mcp.WithString("path_prefix", mcp.Description("Scope analysis to nodes under this file-path prefix.")), mcp.WithNumber("min_commits", mcp.Description("Only return symbols with at least this many commits (default: 1).")), mcp.WithString("kinds", mcp.Description("Comma-separated kinds (default: function,method). Pass 'all' for every symbol.")), @@ -65,11 +63,10 @@ func (s *Server) handleGetChurnRate(ctx context.Context, req mcp.CallToolRequest allowed = nil } - // Resolve the repo root once so blame.Run can be called with a - // fixed cwd. In multi-repo mode each file lives under one of the - // MultiIndexer repos; we resolve per-file with resolveFilePath. scoped := s.scopedNodes(ctx) - byFile := map[string][]*graph.Node{} + rows := make([]churnRow, 0, 64) + seenFiles := map[string]struct{}{} + sawMeta := false for _, n := range scoped { if allowed != nil { if _, ok := allowed[n.Kind]; !ok { @@ -79,88 +76,30 @@ func (s *Server) handleGetChurnRate(ctx context.Context, req mcp.CallToolRequest if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } - if n.StartLine == 0 { - continue - } - byFile[n.FilePath] = append(byFile[n.FilePath], n) - } - - rows := make([]churnRow, 0, len(scoped)) - scannedFiles := 0 - for filePath, nodes := range byFile { - abs, _, err := s.resolveFilePath(filePath) - if err != nil { - continue - } - workTree := repoRootContaining(abs) - if workTree == "" { + row, ok := churnRowFromMeta(n) + if !ok { continue } - // Convert absolute path back to a path relative to the git - // work tree — git blame takes tree-relative paths. - gitRel := abs - if rel, err := stripPathPrefix(abs, workTree+"/"); err == nil { - gitRel = rel - } - lines, err := blame.Run(workTree, gitRel) - if err != nil || len(lines) == 0 { + sawMeta = true + if row.CommitCount < minCommits { continue } - scannedFiles++ + rows = append(rows, row) + seenFiles[n.FilePath] = struct{}{} + } - for _, n := range nodes { - endLine := n.EndLine - if endLine == 0 { - endLine = n.StartLine - } - commits := map[string]bool{} - oldest, newest := time.Time{}, time.Time{} - latestEmail := "" - for line := n.StartLine; line <= endLine; line++ { - a, ok := lines[line] - if !ok { - continue - } - if !commits[a.Commit] { - commits[a.Commit] = true - } - if oldest.IsZero() || a.Timestamp.Before(oldest) { - oldest = a.Timestamp - } - if newest.IsZero() || a.Timestamp.After(newest) { - newest = a.Timestamp - latestEmail = a.Email - } - } - if len(commits) == 0 || len(commits) < minCommits { - continue - } - ageDays := 0 - if !oldest.IsZero() { - ageDays = int(time.Since(oldest).Hours() / 24) - } - // Churn rate: commits per active day. A symbol active for - // 1 day with 3 commits gets churn_rate=3.0; one active for - // 100 days with the same 3 commits gets 0.03. The minimum - // denominator of 1 day stops a fresh symbol from looking - // infinitely churny. - activeDays := ageDays - if activeDays < 1 { - activeDays = 1 - } - row := churnRow{ - ID: n.ID, Name: n.Name, File: n.FilePath, - StartLine: n.StartLine, EndLine: endLine, - CommitCount: len(commits), - AgeDays: ageDays, - ChurnRate: roundScore(float64(len(commits)) / float64(activeDays)), - LastAuthor: latestEmail, - } - if !newest.IsZero() { - row.LastCommitAt = newest.UTC().Format(time.RFC3339) - } - rows = append(rows, row) - } + if !sawMeta { + // No node in scope carries meta.churn — the agent needs to + // run the enricher before this tool can answer. We surface + // the gap loudly rather than returning an empty result that + // looks like "nothing churns" (which is misleading). + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "error": "no churn data in scope; run `gortex enrich churn` (or call the enrich_churn MCP tool) to populate meta.churn", + "suggestion": "gortex enrich churn", + "symbols": []churnRow{}, + "total": 0, + "truncated": false, + }) } sort.Slice(rows, func(i, j int) bool { @@ -187,23 +126,84 @@ func (s *Server) handleGetChurnRate(ctx context.Context, req mcp.CallToolRequest } return s.respondJSONOrTOON(ctx, req, map[string]any{ - "symbols": rows, - "total": len(rows), - "truncated": truncated, - "scanned_files": scannedFiles, - "sort_by": sortBy, - "min_commits": minCommits, + "symbols": rows, + "total": len(rows), + "truncated": truncated, + "scanned_files": len(seenFiles), + "sort_by": sortBy, + "min_commits": minCommits, }) } -// stripPathPrefix returns path with prefix stripped iff path begins -// with prefix. Used to convert absolute paths back to git-tree-relative. -func stripPathPrefix(path, prefix string) (string, error) { - if strings.HasPrefix(path, prefix) { - return path[len(prefix):], nil +// churnRowFromMeta projects a node's meta.churn payload into the +// response row. Returns (zero, false) when the node has no churn +// metadata — the caller distinguishes "missing data" from +// "filtered out". The Meta layout matches what +// internal/churn.EnrichGraph writes: +// +// meta.churn = { +// commit_count: int, +// age_days: int, +// churn_rate: float64, +// last_author: string, +// last_commit_at: RFC3339 string, +// } +// +// Numeric fields tolerate both int and float64 because Meta round- +// trips through gob (LadyBug) or JSON (snapshots), which can widen +// ints to floats. Missing fields default to zero — they're stamped +// together so partial payloads are unexpected, but a defensive read +// is cheaper than asserting and crashing on an old snapshot. +func churnRowFromMeta(n *graph.Node) (churnRow, bool) { + if n == nil || n.Meta == nil { + return churnRow{}, false + } + raw, ok := n.Meta["churn"].(map[string]any) + if !ok || len(raw) == 0 { + return churnRow{}, false + } + endLine := n.EndLine + if endLine == 0 { + endLine = n.StartLine + } + row := churnRow{ + ID: n.ID, Name: n.Name, File: n.FilePath, + StartLine: n.StartLine, EndLine: endLine, + CommitCount: intFromAny(raw["commit_count"]), + AgeDays: intFromAny(raw["age_days"]), + ChurnRate: floatFromAny(raw["churn_rate"]), } - if path == strings.TrimSuffix(prefix, "/") { - return "", nil + if v, ok := raw["last_author"].(string); ok { + row.LastAuthor = v + } + if v, ok := raw["last_commit_at"].(string); ok { + row.LastCommitAt = v + } + return row, true +} + +func intFromAny(v any) int { + switch x := v.(type) { + case int: + return x + case int64: + return int(x) + case float64: + return int(x) + } + return 0 +} + +func floatFromAny(v any) float64 { + switch x := v.(type) { + case float64: + return x + case float32: + return float64(x) + case int: + return float64(x) + case int64: + return float64(x) } - return path, errPathUnresolved + return 0 } diff --git a/internal/mcp/tools_churn_test.go b/internal/mcp/tools_churn_test.go index ce84e286..2ff45537 100644 --- a/internal/mcp/tools_churn_test.go +++ b/internal/mcp/tools_churn_test.go @@ -3,85 +3,57 @@ package mcp import ( "context" "encoding/json" - "os" - "os/exec" - "path/filepath" "testing" "time" "github.com/mark3labs/mcp-go/mcp" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/zzet/gortex/internal/graph" ) -// seedChurnRepo creates a real git repo at dir, with several commits -// touching different parts of foo.go so blame returns distinct -// authors and timestamps per line range. Returns absolute path. -func seedChurnRepo(t *testing.T) string { - t.Helper() - dir := t.TempDir() - - gitInit := func(args ...string) { - cmd := exec.Command("git", args...) - cmd.Dir = dir - if out, err := cmd.CombinedOutput(); err != nil { - t.Fatalf("git %v: %v\n%s", args, err, out) - } - } - gitInit("init", "-q") - gitInit("config", "user.email", "alice@example.com") - gitInit("config", "user.name", "alice") - gitInit("config", "commit.gpgsign", "false") - - write := func(content string) { - require.NoError(t, os.WriteFile(filepath.Join(dir, "foo.go"), []byte(content), 0o644)) - } - - // Commit 1: initial file. dead and live each at one line range. - write(`package foo - -func dead() int { - return 1 -} - -func live() int { - return 1 -} -`) - gitInit("add", "foo.go") - gitInit("commit", "-q", "-m", "init") - - // Commits 2-4: modify live() body three times, dead() once. - for i := 2; i <= 4; i++ { - write(`package foo - -func dead() int { - return ` + string(rune('1'+i)) + ` -} - -func live() int { - return ` + string(rune('1'+i)) + ` -} -`) - gitInit("commit", "-aq", "-m", "edit "+string(rune('1'+i))+"") - } - - return dir -} - -func newChurnTestServer(t *testing.T, dir string) *Server { +// seedChurnGraph builds a small graph with two function nodes whose +// meta.churn data the read-side handler is supposed to surface. We +// stamp the metadata directly instead of running the enricher — the +// read path is what's under test here; the enrich pass has its own +// tests in internal/churn. +func seedChurnGraph(t *testing.T) *Server { t.Helper() g := graph.New() - absFoo := filepath.Join(dir, "foo.go") + now := time.Now().UTC() g.AddNode(&graph.Node{ - ID: absFoo + "::dead", Name: "dead", Kind: graph.KindFunction, - FilePath: absFoo, StartLine: 3, EndLine: 5, Language: "go", + ID: "foo.go::dead", + Kind: graph.KindFunction, + Name: "dead", + FilePath: "foo.go", + StartLine: 3, EndLine: 5, + Meta: map[string]any{ + "churn": map[string]any{ + "commit_count": 1, + "age_days": 0, + "churn_rate": 1.0, + "last_author": "alice@example.com", + "last_commit_at": now.Format(time.RFC3339), + }, + }, }) g.AddNode(&graph.Node{ - ID: absFoo + "::live", Name: "live", Kind: graph.KindFunction, - FilePath: absFoo, StartLine: 7, EndLine: 9, Language: "go", + ID: "foo.go::live", + Kind: graph.KindFunction, + Name: "live", + FilePath: "foo.go", + StartLine: 7, EndLine: 9, + Meta: map[string]any{ + "churn": map[string]any{ + "commit_count": 4, + "age_days": 2, + "churn_rate": 2.0, + "last_author": "bob@example.com", + "last_commit_at": now.Format(time.RFC3339), + }, + }, }) return &Server{ @@ -112,45 +84,35 @@ func callChurnHandler(t *testing.T, s *Server, args map[string]any) map[string]a } func TestChurnRate_BothFunctionsSurface(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{}) symbols, _ := out["symbols"].([]any) require.Len(t, symbols, 2, "both dead and live should surface") } -func TestChurnRate_LiveHasHigherCommitCount(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - +func TestChurnRate_SortByCommitCount(t *testing.T) { + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{"sort_by": "commit_count"}) symbols, _ := out["symbols"].([]any) require.Len(t, symbols, 2) first := symbols[0].(map[string]any) second := symbols[1].(map[string]any) - // Both functions get edited by the same 4 commits — blame attribution - // will treat the entire file's lines as touched in each commit. The - // ordering should at least be stable; the count should be ≥1. - assert.GreaterOrEqual(t, int(first["commit_count"].(float64)), 1) - assert.GreaterOrEqual(t, int(second["commit_count"].(float64)), 1) + assert.Greater(t, int(first["commit_count"].(float64)), int(second["commit_count"].(float64))) + assert.Equal(t, "live", first["name"], "live has 4 commits, should rank above dead's 1") } func TestChurnRate_MinCommitsFilter(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - - // Very high threshold should drop everything. - out := callChurnHandler(t, s, map[string]any{"min_commits": 100}) + s := seedChurnGraph(t) + // dead has 1, live has 4 — threshold of 3 keeps only live. + out := callChurnHandler(t, s, map[string]any{"min_commits": 3}) symbols, _ := out["symbols"].([]any) - assert.Empty(t, symbols) + require.Len(t, symbols, 1) + assert.Equal(t, "live", symbols[0].(map[string]any)["name"]) } func TestChurnRate_LimitTruncates(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{"limit": 1}) symbols, _ := out["symbols"].([]any) assert.Len(t, symbols, 1) @@ -158,47 +120,27 @@ func TestChurnRate_LimitTruncates(t *testing.T) { } func TestChurnRate_PathPrefixFilter(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - - // Use a prefix that won't match anything. + s := seedChurnGraph(t) + // Prefix that matches none of the nodes' file paths. out := callChurnHandler(t, s, map[string]any{"path_prefix": "/no/such/path"}) - symbols, _ := out["symbols"].([]any) - assert.Empty(t, symbols) + // With no in-scope nodes carrying meta we hit the structured + // error path — assert the suggestion is present. + assert.Equal(t, "gortex enrich churn", out["suggestion"]) } func TestChurnRate_ScannedFilesCount(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{}) // One file (foo.go) — scanned once even with two symbols. assert.EqualValues(t, 1, out["scanned_files"].(float64)) } -func TestChurnRate_AgeDaysWithinFreshRepo(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - - out := callChurnHandler(t, s, map[string]any{}) - symbols, _ := out["symbols"].([]any) - require.NotEmpty(t, symbols) - first := symbols[0].(map[string]any) - // Fresh repo — age_days < 1 most of the time. Allow some slack. - age := int(first["age_days"].(float64)) - assert.LessOrEqual(t, age, 1, "fresh repo: symbol age should be 0 or 1 day") -} - -func TestChurnRate_RejectsNonGitDirectory(t *testing.T) { - dir := t.TempDir() - // Create a file but no git repo. - abs := filepath.Join(dir, "foo.go") - require.NoError(t, os.WriteFile(abs, []byte("package foo\nfunc x() {}\n"), 0o644)) - +func TestChurnRate_ErrorsWhenNoMeta(t *testing.T) { + // Graph with a function node but no meta.churn → error response. g := graph.New() g.AddNode(&graph.Node{ - ID: abs + "::x", Name: "x", Kind: graph.KindFunction, - FilePath: abs, StartLine: 2, EndLine: 2, + ID: "bar.go::x", Name: "x", Kind: graph.KindFunction, + FilePath: "bar.go", StartLine: 2, EndLine: 2, }) s := &Server{ graph: g, @@ -208,16 +150,13 @@ func TestChurnRate_RejectsNonGitDirectory(t *testing.T) { sessions: newSessionMap(), toolScopes: newScopeRegistry(), } - out := callChurnHandler(t, s, map[string]any{}) - symbols, _ := out["symbols"].([]any) - assert.Empty(t, symbols, "non-git directories return zero rows, not an error") + require.NotEmpty(t, out["error"], "expected structured error when no meta.churn is present") + assert.Equal(t, "gortex enrich churn", out["suggestion"]) } func TestChurnRate_SortByOptions(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) for _, sortBy := range []string{"churn_rate", "commit_count", "age_days"} { out := callChurnHandler(t, s, map[string]any{"sort_by": sortBy}) assert.Equal(t, sortBy, out["sort_by"], "sort_by echoed") @@ -226,20 +165,8 @@ func TestChurnRate_SortByOptions(t *testing.T) { } } -func TestStripPathPrefix(t *testing.T) { - got, err := stripPathPrefix("/a/b/c.go", "/a/") - require.NoError(t, err) - assert.Equal(t, "b/c.go", got) - - _, err = stripPathPrefix("/x/y.go", "/a/") - assert.Error(t, err) -} - -// Smoke test: roundtrip Unix timestamp through time.Time matches RFC3339. func TestChurnRate_TimestampShape(t *testing.T) { - dir := seedChurnRepo(t) - s := newChurnTestServer(t, dir) - + s := seedChurnGraph(t) out := callChurnHandler(t, s, map[string]any{}) symbols, _ := out["symbols"].([]any) require.NotEmpty(t, symbols) @@ -249,3 +176,37 @@ func TestChurnRate_TimestampShape(t *testing.T) { _, err := time.Parse(time.RFC3339, ts) require.NoError(t, err) } + +func TestChurnRate_TolerantMetaTypes(t *testing.T) { + // gob → JSON → Go round-trip can widen ints to float64. Verify the + // projection handles both forms transparently. + g := graph.New() + g.AddNode(&graph.Node{ + ID: "f.go::a", Name: "a", Kind: graph.KindFunction, + FilePath: "f.go", StartLine: 1, EndLine: 1, + Meta: map[string]any{ + "churn": map[string]any{ + "commit_count": float64(7), // came back from JSON + "age_days": int64(3), // came back from gob int64 + "churn_rate": float64(2.33), + "last_author": "x@y", + "last_commit_at": "2026-05-01T00:00:00Z", + }, + }, + }) + s := &Server{ + graph: g, + session: newSessionState(), + tokenStats: &tokenStats{}, + symHistory: &symbolHistory{entries: make(map[string][]SymbolModification)}, + sessions: newSessionMap(), + toolScopes: newScopeRegistry(), + } + out := callChurnHandler(t, s, map[string]any{}) + symbols, _ := out["symbols"].([]any) + require.Len(t, symbols, 1) + row := symbols[0].(map[string]any) + assert.EqualValues(t, 7, row["commit_count"]) + assert.EqualValues(t, 3, row["age_days"]) + assert.InDelta(t, 2.33, row["churn_rate"].(float64), 0.001) +} diff --git a/internal/mcp/tools_clones.go b/internal/mcp/tools_clones.go index 4fc1ccbd..b2bd6394 100644 --- a/internal/mcp/tools_clones.go +++ b/internal/mcp/tools_clones.go @@ -83,10 +83,16 @@ func (s *Server) handleFindClones(ctx context.Context, req mcp.CallToolRequest) // Walk EdgeSimilarTo edges. The graph holds them symmetrically // (fA→fB and fB→fA); canonicalise to A(...) + // instead of the full AllEdges scan we used to pay for. ~500k edge + // rows materialised over cgo dropped to the SimilarTo-bearing + // subset (~hundreds-to-thousands on a normal workspace). seen := make(map[[2]string]struct{}) var pairs []clones.Pair - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeSimilarTo { + for e := range s.graph.EdgesByKind(graph.EdgeSimilarTo) { + if e == nil { continue } a, b := e.From, e.To diff --git a/internal/mcp/tools_cochange.go b/internal/mcp/tools_cochange.go index 45d31be2..278e9d8f 100644 --- a/internal/mcp/tools_cochange.go +++ b/internal/mcp/tools_cochange.go @@ -63,29 +63,49 @@ func (s *Server) handleFindCoChangingSymbols(ctx context.Context, req mcp.CallTo scores := s.coChangeScores(targetFile) counts := s.coChangeCounts(targetFile) - rows := make([]coChangeRow, 0, len(scores)) + // Two-phase build: first collect (file, score, count) tuples that + // survive the minScore gate, then sort + truncate to the requested + // limit, then batch-resolve the per-file symbol names. The Symbols + // lookup is the only graph-touching work in this handler — pulling + // it through one capability call instead of N GetFileNodes round- + // trips is the entire ladybug win. + type pending struct { + file string + score float64 + count int + } + pendings := make([]pending, 0, len(scores)) for file, score := range scores { if score < minScore { continue } - rows = append(rows, coChangeRow{ - File: file, - Score: roundScore(score), - Count: counts[file], - Symbols: s.symbolNamesInFile(file), - }) + pendings = append(pendings, pending{file: file, score: score, count: counts[file]}) } - sort.Slice(rows, func(i, j int) bool { - if rows[i].Score != rows[j].Score { - return rows[i].Score > rows[j].Score + sort.Slice(pendings, func(i, j int) bool { + if pendings[i].score != pendings[j].score { + return pendings[i].score > pendings[j].score } - return rows[i].File < rows[j].File + return pendings[i].file < pendings[j].file }) truncated := false - if len(rows) > limit { - rows = rows[:limit] + if len(pendings) > limit { + pendings = pendings[:limit] truncated = true } + keepFiles := make([]string, 0, len(pendings)) + for _, p := range pendings { + keepFiles = append(keepFiles, p.file) + } + symbolsByFile := s.symbolNamesByFiles(keepFiles) + rows := make([]coChangeRow, 0, len(pendings)) + for _, p := range pendings { + rows = append(rows, coChangeRow{ + File: p.file, + Score: roundScore(p.score), + Count: p.count, + Symbols: symbolsByFile[p.file], + }) + } result := map[string]any{ "target_file": targetFile, @@ -96,19 +116,90 @@ func (s *Server) handleFindCoChangingSymbols(ctx context.Context, req mcp.CallTo if symbolID != "" { result["symbol_id"] = symbolID } + // When the cache is empty AND the background mine has not finished + // yet, surface an in-progress marker so the caller can distinguish + // "this file has no co-change data" from "the daemon hasn't built + // the data yet". The mine is fired at daemon-ready by RunAnalysis; + // a fresh Ladybug daemon takes tens of seconds before the cache is + // populated. + if len(rows) == 0 && !s.coChangeReady() { + result["mining_in_progress"] = true + result["note"] = "co-change graph is still being mined; retry shortly" + } return s.respondJSONOrTOON(ctx, req, result) } -// ensureCoChange mines the co-change graph exactly once per daemon -// lifetime. Safe for concurrent callers — later callers block until -// the first mine completes, then return immediately. +// ensureCoChange triggers the co-change mine if it has not run yet +// and returns IMMEDIATELY — the mine itself runs asynchronously. +// +// Why async? On a disk backend (Ladybug) with no pre-existing +// EdgeCoChange edges, mineCoChange spends 60+ seconds in +// cochange.AddEdges: an AllNodes full-table scan plus thousands of +// per-pair AddEdge cgo round-trips. Wrapping that in sync.Once.Do +// turned every queued tool call into a blocked-for-60s caller. The +// async shape keeps the request path off the slow path. +// +// PrewarmCoChange (called from RunAnalysis at daemon-ready) fires +// the mine ahead of any user-visible call so the cache is already +// populated by the time the first find_co_changing_symbols arrives. +// +// Returning immediately means the first user call may see an empty +// cache when the prewarm goroutine has not yet completed. That is +// the deliberate trade-off — the alternative is a 60s blocked tool +// call. The handler surfaces an `in_progress` flag when the cache is +// empty so callers know to retry rather than treating the file as +// genuinely uncoupled. func (s *Server) ensureCoChange() { - s.cochangeOnce.Do(s.mineCoChange) + s.cochangeOnce.Do(func() { + go s.mineCoChange() + }) +} + +// PrewarmCoChange triggers the co-change mine in the background so a +// later find_co_changing_symbols / search rerank call sees a +// populated cache without blocking. Safe to call multiple times — the +// underlying sync.Once still gates the work to one execution. +// +// Returns immediately whether mining is in progress, completed, or +// freshly started. +func (s *Server) PrewarmCoChange() { + go s.cochangeOnce.Do(s.mineCoChange) +} + +// coChangeReady reports whether the mine has completed and the cache +// is populated. Used by the handler to set an `in_progress` flag +// when the cache is empty but mining is still running. +func (s *Server) coChangeReady() bool { + s.cochangeMu.RLock() + defer s.cochangeMu.RUnlock() + return s.cochangeByFile != nil } // mineCoChange populates the co-change caches. It prefers EdgeCoChange // edges already present in the graph (an enriched snapshot); only when -// none exist does it mine `git log` and materialise the edges. +// none exist does it mine `git log`. +// +// The mine writes ONLY the in-memory caches — it deliberately does +// not materialise EdgeCoChange edges back into the graph store. +// Persisting tens of thousands of EdgeCoChange edges via AddEdge on a +// disk backend (Ladybug) is several minutes of cgo INSERTs, and every +// such insert grows the live edge count. The analyze[clusters] +// partition cache is keyed on (NodeCount, EdgeCount, +// EdgeIdentityRevisions); a background edge-count drift invalidates +// it on every check, forcing a 40s Leiden recompute on each call. +// +// What we LOSE by skipping the persist: +// - A subsequent daemon start can no longer take the +// coChangeFromEdges fast path; it re-mines `git log` (typically +// 5-15s) on every restart. +// +// What we KEEP: +// - find_co_changing_symbols reads the in-memory cache directly. +// - The search rerank's CoChangeOf hook reads the in-memory cache +// (not EdgeCoChange edges). +// - cochange.EnrichGraph (the CLI / external enrichment path) is +// untouched — that's a separate code path that explicitly opts +// into the AddEdges persist when the operator wants it. func (s *Server) mineCoChange() { scores := map[string]map[string]float64{} counts := map[string]map[string]int{} @@ -123,7 +214,6 @@ func (s *Server) mineCoChange() { if len(res.Pairs) == 0 { continue } - cochange.AddEdges(s.graph, res.Pairs, prefix) for _, p := range res.Pairs { fa, fb := p.FileA, p.FileB if prefix != "" { @@ -141,18 +231,27 @@ func (s *Server) mineCoChange() { // edges already in the graph. Returns true when at least one edge was // found — the signal that an enriched snapshot is loaded and no fresh // git mine is needed. +// +// EdgesByKind streams only the CoChange edges; the endpoint nodes are +// fetched in one batched GetNodesByIDs call instead of two GetNode +// round-trips per edge. On disk backends (Ladybug) that drops the +// whole-graph AllEdges materialisation plus the per-edge cgo +// GetNode trips that loaded the file paths. func (s *Server) coChangeFromEdges(scores map[string]map[string]float64, counts map[string]map[string]int) bool { - found := false - for _, e := range s.graph.AllEdges() { - if e.Kind != graph.EdgeCoChange { - continue - } - from := s.graph.GetNode(e.From) - to := s.graph.GetNode(e.To) - if from == nil || to == nil { + // First pass: collect CoChange edges + the set of node IDs they + // reference. Both can stream from EdgesByKind in one Cypher + // round-trip on disk backends. + type ccEdge struct { + from, to string + score float64 + count int + } + var edges []ccEdge + idSet := make(map[string]struct{}) + for e := range s.graph.EdgesByKind(graph.EdgeCoChange) { + if e == nil { continue } - found = true score := e.Confidence if e.Meta != nil { if v, ok := e.Meta["score"].(float64); ok { @@ -170,9 +269,35 @@ func (s *Server) coChangeFromEdges(scores map[string]map[string]float64, counts count = int(v) } } - addCoChangeLink(scores, counts, from.FilePath, to.FilePath, score, count) + edges = append(edges, ccEdge{from: e.From, to: e.To, score: score, count: count}) + idSet[e.From] = struct{}{} + idSet[e.To] = struct{}{} + } + if len(edges) == 0 { + return false + } + + // Batched endpoint resolution — one Cypher WHERE id IN $ids vs. + // 2 * len(edges) per-row GetNode trips. On a workspace with + // thousands of co-change edges this is the bulk of the latency. + ids := make([]string, 0, len(idSet)) + for id := range idSet { + ids = append(ids, id) + } + nodes := s.graph.GetNodesByIDs(ids) + + for _, e := range edges { + from, ok := nodes[e.from] + if !ok || from == nil { + continue + } + to, ok := nodes[e.to] + if !ok || to == nil { + continue + } + addCoChangeLink(scores, counts, from.FilePath, to.FilePath, e.score, e.count) } - return found + return true } // addCoChangeLink records one directed co-change relationship. diff --git a/internal/mcp/tools_coding.go b/internal/mcp/tools_coding.go index d02c258a..86e709fb 100644 --- a/internal/mcp/tools_coding.go +++ b/internal/mcp/tools_coding.go @@ -264,6 +264,34 @@ func resolveKeepPredicate(keep string, symbols []*graph.Node) (func(elide.Decl) return pred, resolved } +// editingContextSymbolNodes reconstructs the *graph.Node slice the +// elide.KeepAny predicate needs from the editing-context Defines +// rows. We carry the node IDs only on the wire, but a `keep` token +// can target a node by id, name, or kind — so we re-resolve every +// defines row to a node here. Used only when compress_bodies=true. +func (s *Server) editingContextSymbolNodes(filePath string, defines []map[string]any) []*graph.Node { + if len(defines) == 0 { + return nil + } + ids := make([]string, 0, len(defines)) + for _, d := range defines { + if id, _ := d["id"].(string); id != "" { + ids = append(ids, id) + } + } + if len(ids) == 0 { + return nil + } + nodes := s.graph.GetNodesByIDs(ids) + out := make([]*graph.Node, 0, len(ids)) + for _, id := range ids { + if n, ok := nodes[id]; ok && n != nil { + out = append(out, n) + } + } + return out +} + func (s *Server) handleGetEditingContext(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { fp, err := req.RequireString("path") if err != nil { @@ -274,99 +302,164 @@ func (s *Server) handleGetEditingContext(ctx context.Context, req mcp.CallToolRe s.ensureFresh([]string{fp}) s.sessionFor(ctx).recordFile(fp) - sg := s.engineFor(ctx).GetFileSymbols(fp) - if len(sg.Nodes) == 0 { - return mcp.NewToolResultError("no symbols found for file: " + fp), nil - } - // A file outside the session's workspace is reported as not found - // — its symbols all share one repo, so the first node decides. - if !s.nodeInSessionScope(ctx, sg.Nodes[0]) { - return mcp.NewToolResultError("no symbols found for file: " + fp), nil - } - // Confine the caller/callee neighbourhoods below to the session - // workspace so editing context never reaches across the boundary. - sessWS, _, _ := s.sessionScope(ctx) - // Frecency: a file-level editing context is effectively an access to - // every symbol defined in that file. Credit each of them — this is - // the signal that "the agent is working in this area right now." - for _, n := range sg.Nodes { - if n.Kind == graph.KindFile { - continue - } - s.frecency.Record(n.ID) - } - out := editingContext{} - - // File info. - for _, n := range sg.Nodes { - if n.Kind == graph.KindFile { - out.File = map[string]any{"id": n.ID, "language": n.Language} - break + var fileNodeForScope *graph.Node + callerCap := 20 + calleeCap := 20 + + // Fast path: when the backend implements FileEditingContext we + // take all five projections in a small fixed number of Cypher + // round-trips instead of the per-symbol GetCallers / GetCallChain + // loop. The fallback retains the previous engine-based shape so + // the in-memory backend is unaffected. + if fc, ok := s.graph.(graph.FileEditingContext); ok { + bundle := fc.FileEditingContext(fp, []graph.NodeKind{graph.KindFunction, graph.KindMethod}) + if bundle == nil || (bundle.FileNode == nil && len(bundle.Defines) == 0) { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + fileNodeForScope = bundle.FileNode + if fileNodeForScope == nil && len(bundle.Defines) > 0 { + fileNodeForScope = bundle.Defines[0] + } + if !s.nodeInSessionScope(ctx, fileNodeForScope) { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + for _, n := range bundle.Defines { + s.frecency.Record(n.ID) + } + if bundle.FileNode != nil { + out.File = map[string]any{"id": bundle.FileNode.ID, "language": bundle.FileNode.Language} + } + for _, n := range bundle.Defines { + entry := map[string]any{ + "id": n.ID, + "kind": n.Kind, + "name": n.Name, + "start_line": n.StartLine, + } + if n.Meta != nil { + if sig, ok := n.Meta["signature"]; ok { + entry["signature"] = sig + } + } + out.Defines = append(out.Defines, entry) } - } - - // Defines: all non-file symbols in this file. - for _, n := range sg.Nodes { - if n.Kind == graph.KindFile { - continue + for _, e := range bundle.Imports { + out.Imports = append(out.Imports, map[string]any{ + "id": e.To, + "external": strings.HasPrefix(e.To, "external::"), + }) } - entry := map[string]any{ - "id": n.ID, - "kind": n.Kind, - "name": n.Name, - "start_line": n.StartLine, + // Workspace-scope post-filter mirrors the legacy GetCallers / + // GetCallChain WorkspaceID gate. + sessWS, _, bound := s.sessionScope(ctx) + var opts query.QueryOptions + if bound { + opts.WorkspaceID = sessWS } - if sig, ok := n.Meta["signature"]; ok { - entry["signature"] = sig + for _, n := range bundle.CalledBy { + if bound && !opts.ScopeAllows(n) { + continue + } + if len(out.CalledBy) >= callerCap { + break + } + out.CalledBy = append(out.CalledBy, map[string]any{ + "id": n.ID, + "name": n.Name, + "file_path": n.FilePath, + "start_line": n.StartLine, + }) } - out.Defines = append(out.Defines, entry) - } - - // Imports: outgoing import edges from the file node. - for _, e := range sg.Edges { - if e.Kind == graph.EdgeImports { - importInfo := map[string]any{ - "id": e.To, - "external": strings.HasPrefix(e.To, "external::"), + for _, n := range bundle.Calls { + if bound && !opts.ScopeAllows(n) { + continue + } + if len(out.Calls) >= calleeCap { + break } - out.Imports = append(out.Imports, importInfo) + out.Calls = append(out.Calls, map[string]any{ + "id": n.ID, + "name": n.Name, + "file_path": n.FilePath, + "start_line": n.StartLine, + }) } - } - - // CalledBy: who calls symbols in this file (depth 1). - callerSeen := make(map[string]bool) - for _, n := range sg.Nodes { - if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { - callers := s.engineFor(ctx).GetCallers(n.ID, query.QueryOptions{Depth: 1, Limit: 20, Detail: "brief", WorkspaceID: sessWS}) - for _, cn := range callers.Nodes { - if cn.FilePath != fp && !callerSeen[cn.ID] { - callerSeen[cn.ID] = true - out.CalledBy = append(out.CalledBy, map[string]any{ - "id": cn.ID, - "name": cn.Name, - "file_path": cn.FilePath, - "start_line": cn.StartLine, - }) + } else { + sg := s.engineFor(ctx).GetFileSymbols(fp) + if len(sg.Nodes) == 0 { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + if !s.nodeInSessionScope(ctx, sg.Nodes[0]) { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + sessWS, _, _ := s.sessionScope(ctx) + for _, n := range sg.Nodes { + if n.Kind == graph.KindFile { + continue + } + s.frecency.Record(n.ID) + } + for _, n := range sg.Nodes { + if n.Kind == graph.KindFile { + out.File = map[string]any{"id": n.ID, "language": n.Language} + break + } + } + for _, n := range sg.Nodes { + if n.Kind == graph.KindFile { + continue + } + entry := map[string]any{ + "id": n.ID, + "kind": n.Kind, + "name": n.Name, + "start_line": n.StartLine, + } + if sig, ok := n.Meta["signature"]; ok { + entry["signature"] = sig + } + out.Defines = append(out.Defines, entry) + } + for _, e := range sg.Edges { + if e.Kind == graph.EdgeImports { + out.Imports = append(out.Imports, map[string]any{ + "id": e.To, + "external": strings.HasPrefix(e.To, "external::"), + }) + } + } + callerSeen := make(map[string]bool) + for _, n := range sg.Nodes { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + callers := s.engineFor(ctx).GetCallers(n.ID, query.QueryOptions{Depth: 1, Limit: callerCap, Detail: "brief", WorkspaceID: sessWS}) + for _, cn := range callers.Nodes { + if cn.FilePath != fp && !callerSeen[cn.ID] { + callerSeen[cn.ID] = true + out.CalledBy = append(out.CalledBy, map[string]any{ + "id": cn.ID, + "name": cn.Name, + "file_path": cn.FilePath, + "start_line": cn.StartLine, + }) + } } } } - } - - // Calls: what symbols in this file call (depth 1). - callSeen := make(map[string]bool) - for _, n := range sg.Nodes { - if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { - chain := s.engineFor(ctx).GetCallChain(n.ID, query.QueryOptions{Depth: 1, Limit: 20, Detail: "brief", WorkspaceID: sessWS}) - for _, cn := range chain.Nodes { - if cn.FilePath != fp && !callSeen[cn.ID] { - callSeen[cn.ID] = true - out.Calls = append(out.Calls, map[string]any{ - "id": cn.ID, - "name": cn.Name, - "file_path": cn.FilePath, - "start_line": cn.StartLine, - }) + callSeen := make(map[string]bool) + for _, n := range sg.Nodes { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + chain := s.engineFor(ctx).GetCallChain(n.ID, query.QueryOptions{Depth: 1, Limit: calleeCap, Detail: "brief", WorkspaceID: sessWS}) + for _, cn := range chain.Nodes { + if cn.FilePath != fp && !callSeen[cn.ID] { + callSeen[cn.ID] = true + out.Calls = append(out.Calls, map[string]any{ + "id": cn.ID, + "name": cn.Name, + "file_path": cn.FilePath, + "start_line": cn.StartLine, + }) + } } } } @@ -388,18 +481,20 @@ func (s *Server) handleGetEditingContext(ctx context.Context, req mcp.CallToolRe } } if language != "" && elide.IsSupported(language) { - // Use the first non-file node to find the on-disk path. + // Use the file node (cached above from the editing-context + // bundle) to find the on-disk path. Falls back to the first + // defines node if no file node materialised (defensive — the + // FileEditingContext implementation always returns one when + // the file is indexed). var fileBytes []byte - for _, n := range sg.Nodes { - if n.Kind == graph.KindFile { - if absPath, rerr := s.resolveNodePath(n); rerr == nil { - if content, ok := s.overlayContentFor(ctx, absPath); ok { - fileBytes = []byte(content) - } else if b, ferr := os.ReadFile(absPath); ferr == nil { - fileBytes = b - } + anchor := fileNodeForScope + if anchor != nil { + if absPath, rerr := s.resolveNodePath(anchor); rerr == nil { + if content, ok := s.overlayContentFor(ctx, absPath); ok { + fileBytes = []byte(content) + } else if b, ferr := os.ReadFile(absPath); ferr == nil { + fileBytes = b } - break } } if len(fileBytes) > 0 { @@ -407,7 +502,8 @@ func (s *Server) handleGetEditingContext(ctx context.Context, req mcp.CallToolRe // verbatim bodies while the rest of the file is still // stubbed — keep the functions being edited at full // source and compress everything else. - keepPred, resolved := resolveKeepPredicate(req.GetString("keep", ""), sg.Nodes) + keepNodes := s.editingContextSymbolNodes(fp, out.Defines) + keepPred, resolved := resolveKeepPredicate(req.GetString("keep", ""), keepNodes) keptSymbols = resolved if compressed, cerr := elide.CompressWith(fileBytes, language, elide.Options{Keep: keepPred}); cerr == nil { sourceCompressed = string(compressed) diff --git a/internal/mcp/tools_core.go b/internal/mcp/tools_core.go index 21dc896f..2ecb8a08 100644 --- a/internal/mcp/tools_core.go +++ b/internal/mcp/tools_core.go @@ -7,6 +7,7 @@ import ( "path/filepath" "sort" "strings" + "time" "github.com/mark3labs/mcp-go/mcp" toon "github.com/toon-format/toon-go" @@ -563,11 +564,22 @@ func filterSubGraph(sg *query.SubGraph, allowed map[string]bool) *query.SubGraph edges = append(edges, e) } } + totalEdges := len(edges) + // Counts-only payloads arrive with Edges == nil and TotalEdges + // pre-populated — preserve the upstream count instead of zeroing + // it. Inexact in the presence of a non-trivial filter (we'd need + // the edges to know which belong to filtered-out nodes), but the + // gcx output that asks for the count-only path runs with the + // session's workspace scope already applied at the store, so the + // filter pass is typically a no-op. + if len(sg.Edges) == 0 && sg.TotalEdges > 0 { + totalEdges = sg.TotalEdges + } return &query.SubGraph{ Nodes: nodes, Edges: edges, TotalNodes: len(nodes), - TotalEdges: len(edges), + TotalEdges: totalEdges, Truncated: sg.Truncated, } } @@ -619,6 +631,69 @@ func enrichSubGraphEdges(sg *query.SubGraph) { } } +// isNonDefinitionNode reports whether a node kind is NOT a file-level +// definition and should be dropped from a get_file_summary view. It +// excludes the file node itself, imports, and the function-body-internal +// nodes (locals, params, closures, generic params, builtins) that the +// file_path lookup pulls in but that the "symbols a file defines" +// contract never wanted. Without this filter the summary floods with +// hundreds of locals/params (the old defines-edge query excluded them by +// construction; the GetFileNodes-based path does not). +func isNonDefinitionNode(k graph.NodeKind) bool { + switch k { + case graph.KindFile, graph.KindImport, graph.KindLocal, + graph.KindParam, graph.KindClosure, graph.KindGenericParam, + graph.KindBuiltin: + return true + } + return false +} + +// stripNonDefinitionNodes returns a copy of sg with non-definition nodes +// nodes removed (and edges that reference them dropped). Used by +// handleGetFileSummary to keep its output focused on the symbols a +// file *defines* — the file node and per-statement import nodes are +// useful internals (e.g. for the file-neighbourhood walk that drives +// the Ladybug-side pushdown) but noise in the agent-visible payload. +func stripNonDefinitionNodes(sg *query.SubGraph) *query.SubGraph { + if sg == nil { + return nil + } + keep := make(map[string]bool, len(sg.Nodes)) + nodes := make([]*graph.Node, 0, len(sg.Nodes)) + for _, n := range sg.Nodes { + if n == nil || isNonDefinitionNode(n.Kind) { + continue + } + nodes = append(nodes, n) + keep[n.ID] = true + } + edges := make([]*graph.Edge, 0, len(sg.Edges)) + for _, e := range sg.Edges { + if e == nil || !keep[e.From] || !keep[e.To] { + continue + } + edges = append(edges, e) + } + totalEdges := len(edges) + // Counts-only payloads arrive with Edges == nil and TotalEdges + // already populated by the store. Keep that count — the file + + // import nodes we're stripping pulled some edges with them so it's + // a slight overcount, but the gcx callers that take this path + // only render it as a header scalar, not as anything load-bearing. + if len(sg.Edges) == 0 && sg.TotalEdges > 0 { + totalEdges = sg.TotalEdges + } + return &query.SubGraph{ + Nodes: nodes, + Edges: edges, + TotalNodes: len(nodes), + TotalEdges: totalEdges, + Truncated: sg.Truncated, + CallerNotes: sg.CallerNotes, + } +} + // compactSubGraph formats a SubGraph as compact text. func compactSubGraph(sg *query.SubGraph) string { var b strings.Builder @@ -726,7 +801,7 @@ func (s *Server) registerCoreTools() { mcp.WithString("assist", mcp.Description("LLM assist mode: \"auto\" (default — engages on natural-language queries, skips identifier lookups), \"on\" (force engage), \"off\" (bypass), \"deep\" (on + a body-grounded verification pass that reads candidate code and HONESTLY drops irrelevant matches — slower, may return empty results when nothing genuinely matches). Requires an LLM provider configured via `llm.provider` (local / anthropic / openai / ollama / claudecli / gemini / bedrock / deepseek); behaves as \"off\" when none is available.")), mcp.WithBoolean("debug", mcp.Description("When true, attach a `rerank` block to the response carrying per-candidate scores and per-signal contributions from the 11-signal rerank pipeline (bm25, semantic, fan_in, hits, fan_out, churn, community, minhash, api_signature, type_signature, recency, feedback) plus the active per-signal weight map. Off by default; enable to inspect ranking decisions or tune `.gortex.yaml::search::weights`.")), mcp.WithString("query_class", mcp.Description("Advisory hint that tunes the bm25-vs-semantic balance of the rerank: \"auto\" (default — detect from query shape), \"symbol\" (identifier / API lookup — BM25-heavy), \"concept\" (natural-language description — balanced), \"path\" (file-path query — most BM25-heavy), \"signature\" (type/function-signature fragment — BM25-leaning), \"keyword_soup\" (a degenerate boolean OR-list \u2014 suppresses LLM expansion and splits the soup into per-disjunct BM25 fetches; a `query_advice` nudge rides on the response). The class actually used is echoed back as `query_class` in the response.")), - mcp.WithString("expand", mcp.Description("Query-expansion channels: \"both\" (default \u2014 LLM expansion when the assist gate engages, plus the deterministic equivalence-class table), \"equivalence\" (only the LLM-free curated synonym table + per-repo auto-mined concepts), \"llm\" (only LLM expansion), \"off\" (pure BM25, no expansion). Equivalence expansion bridges query vocabulary to the words a symbol uses (auth->login, delete->remove) and runs even with no LLM provider configured.")), + mcp.WithString("expand", mcp.Description("Query-expansion channels: \"both\" (default \u2014 LLM expansion when the assist gate engages, plus the deterministic equivalence-class table), \"equivalence\" (only the LLM-free curated synonym table + per-repo auto-mined concepts), \"llm\" (only LLM expansion), \"off\" (pure BM25, no expansion). Equivalence expansion bridges query vocabulary to the words a symbol uses (auth->login, delete->remove) and runs even with no LLM provider configured. For identifier queries (query_class symbol / path / signature) the server auto-disables expansion + vector even when expand is set \u2014 these classes match best on BM25 + exact-name alone.")), mcp.WithString("corpus", mcp.Description("Which corpus to search: \"code\" (default \u2014 code symbols only), \"docs\" (only Markdown prose-section nodes \u2014 the heading-delimited documentation sections), \"all\" (both). With docs/all a prose query matches the right README / guide section by its body text.")), mcp.WithNumber("max_per_file", mcp.Description("Cap how many results a single source file may contribute to the diverse head of the result set (default 3). Hits beyond the cap are demoted below not-yet-capped results — never dropped — so the top of the list spans more files. Set 0 to disable diversification.")), ), @@ -735,7 +810,7 @@ func (s *Server) registerCoreTools() { s.addTool( mcp.NewTool("get_file_summary", - mcp.WithDescription("Use instead of Read to understand a file's role: returns all its symbols and imports without reading source lines."), + mcp.WithDescription("Use instead of Read to understand a file's role: returns the symbols a file defines (functions, methods, types, fields, …) without reading source lines. The file node itself and import nodes are excluded — use find_import_path or get_dependencies for import-shape queries."), mcp.WithString("path", mcp.Required(), mcp.Description("Relative file path")), mcp.WithBoolean("compact", mcp.Description("One-line-per-symbol text output (saves 50-70% tokens)")), mcp.WithString("format", mcp.Description("Output format: json (default), gcx (GCX1 compact wire format), or toon")), @@ -1103,7 +1178,15 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques projectArg = fq.Project } scopeWS, scopeProj := s.resolveQueryScope(ctx, workspaceArg, projectArg) - scope := query.QueryOptions{WorkspaceID: scopeWS, ProjectID: scopeProj} + // Per-phase timing for the search hot path. The struct is populated + // across the engine boundary (BM25 backend call wall-clock attributes + // to BM25*MS in fetchAndMergeBM25Timed; GetNodes / FindName / Fallback + // land here from inside Engine.gatherBackendCandidates) and surfaced + // at the end as a single debug log line. Nil-safe: callers without + // debug logging pay zero overhead. + timings := &query.SearchTimings{} + phaseStart := time.Now() + scope := query.QueryOptions{WorkspaceID: scopeWS, ProjectID: scopeProj, SearchTimings: timings} // Keyword-soup defense: a degenerate boolean / OR-list query // ("A OR B OR 'no access'") defeats ordinary retrieval. Detect it @@ -1120,6 +1203,37 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques soupReason = "query reads as a boolean OR-list; search ranks best on a single concept or symbol name -- run one query per disjunct, or describe the intent in plain words" } + // Identifier-shape fast path. ClassifyQuery is the structural + // detector the rerank uses; QueryClassSymbol / Path / Signature + // are queries where the rerank's classWeightTable already proves + // the semantic channel contributes near-zero signal (0.65 / 0.45 / + // 0.80 vs the baseline 1.00) — see internal/search/rerank/ + // query_kind.go::classWeightTable. For these classes the handler + // forces expansion off and tells the engine to skip the vector + // channel entirely; the rest of the pipeline (BM25 + bundle + + // rerank) is the only path that matters. An explicit + // query_class arg pin on one of these three classes engages the + // fast path too. A soup query never engages the fast path — + // keyword_soup has its own split-disjunct treatment. + // + // Validation of the query_class arg happens here so the early + // gating uses the same validated value the rerank below uses; + // invalid input is rejected before the engine runs. + queryClass := rerank.ClassifyQuery(q) + if qcArg := strings.TrimSpace(req.GetString("query_class", "")); qcArg != "" { + parsed, ok := rerank.ParseQueryClass(qcArg) + if !ok { + return mcp.NewToolResultError("invalid query_class: " + qcArg + " (want auto, symbol, concept, path, signature, or keyword_soup)"), nil + } + if parsed != rerank.QueryClassUnknown { + queryClass = parsed + } + } + identifierFastPath := !isSoup && isIdentifierClass(queryClass) + if identifierFastPath { + scope.SkipVectorChannel = true + } + // LLM assist gate: decides whether the expansion + rerank passes // run for this query. The service-enabled check is layered inside // the helpers so a stub build is a clean bypass. A soup query @@ -1129,6 +1243,14 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques // expand mode picks which query-expansion channels run -- LLM, // the deterministic equivalence table, both (default), or off. expand := parseExpandMode(req) + // Identifier-shape queries skip every expansion channel — the + // rerank's classWeightTable shows BM25 is near-perfect for these + // classes; expansion would only add the combined-OR fan-out's + // extra Cypher call without lifting recall on a literal-token + // query. The explicit arg pin still wins for soup / concept. + if identifierFastPath { + expand = expandOff + } engage := shouldEngageAssist(assist, q) && s.llmService != nil && s.llmService.Enabled() if isSoup || !expand.allowsLLMExpansion() { engage = false @@ -1139,6 +1261,14 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques // Slightly widen the BM25 over-fetch when we're going to // rerank: more head candidates means a more useful reorder. fetchLimit = offset + limit + rerankCap + } else if identifierFastPath { + // Identifier-shape fast path: no expansion, no vector channel, + // no LLM rerank — the only down-stream consumer is the + // structural rerank pipeline scoring a single FTS-ranked head. + // A wide head is wasted work; every extra candidate drags an + // in/out edge pair through the bundle phase. Tighten to + // +5 so the post-filter slack still leaves a full page. + fetchLimit = offset + limit + 5 } // Expansion terms feeding the BM25 OR-merge: LLM-derived synonyms @@ -1162,14 +1292,28 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques } expandedTerms := mergeExpansionTerms(soupFragments, llmTerms, equivTerms) + // Build the rerank context BEFORE the BM25 fetch so the engine's + // bundle path can seed its edge caches as the BM25 calls land. + // The handler-side applyRerankBoostsTimed reuses this same rctx, + // so the merged candidate set's edges are already cached when + // prepare() runs against the post-filter slice. Without this + // pre-fetch construction the engine's bundle would build a + // throwaway cache on each BM25 call and the handler's later + // rerank would still fetch every candidate's edges itself. + rctx := s.buildRerankContext(ctx, q) + scope.RerankContext = rctx + var nodes []*graph.Node var primaryCount int if len(expandedTerms) > 0 { - nodes, primaryCount = fetchAndMergeBM25(s.engineFor(ctx), q, expandedTerms, fetchLimit, scope) + nodes, primaryCount = fetchAndMergeBM25Timed(s.engineFor(ctx), q, expandedTerms, fetchLimit, scope, timings) } else { + bm25Start := time.Now() nodes = s.engineFor(ctx).SearchSymbolsScoped(q, fetchLimit, scope) + timings.BM25PrimaryMS += time.Since(bm25Start).Milliseconds() primaryCount = len(nodes) } + candsAfterGather := len(nodes) mergedCount := len(nodes) // pre-filter; comparable to primaryCount // Apply repo/project/ref filter. @@ -1253,34 +1397,45 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques // feedback, churn) layer on top once the agent has spent time // in the codebase. Cold queries with no session data fall back // to a structural-only pass. - rctx := s.buildRerankContext(ctx, q) - // Per-class rerank weighting: detect the query class (or honour an - // explicit query_class hint) and pin it on the rerank Context so - // the pipeline scales the bm25 / semantic blend accordingly. - queryClass := rerank.ClassifyQuery(q) - if qcArg := strings.TrimSpace(req.GetString("query_class", "")); qcArg != "" { - parsed, ok := rerank.ParseQueryClass(qcArg) - if !ok { - return mcp.NewToolResultError("invalid query_class: " + qcArg + " (want auto, symbol, concept, path, signature, or keyword_soup)"), nil - } - if parsed != rerank.QueryClassUnknown { - queryClass = parsed - } - } - // A detected soup query reports the keyword_soup class even when - // the caller did not pin it, so the response surfaces the class - // the handler actually treated the query as. + // + // rctx was built above (before the BM25 fetch) so the engine's + // bundle path could seed its edge caches into the same rctx the + // handler-side rerank will read from. + // queryClass was classified + validated at the top of the handler + // so the identifier-shape fast path could read it. Re-apply the + // soup override here — soup detection happens after classification + // and reports keyword_soup regardless of what the structural + // detector thought the query looked like. if isSoup { queryClass = rerank.QueryClassKeywordSoup } - rctx.QueryClass = queryClass + if rctx != nil { + rctx.QueryClass = queryClass + } + candsAfterFilter := len(nodes) + // Capture the post-filter candidate ID set so we can ask the rctx + // what fraction of these candidates' edges were already cached by + // the bundle pre-seed (vs needing prepare's own batched fetch). + // Hit-rate is reported on the debug log as cache_hit_rate. + if rctx != nil { + preIDs := make([]string, 0, len(nodes)) + for _, n := range nodes { + if n != nil { + preIDs = append(preIDs, n.ID) + } + } + timings.CacheHitRate = rctx.EdgeCacheHitRate(preIDs) + } var rerankBreakdown []*rerank.Candidate - nodes = applyRerankBoosts(s, nodes, q, rctx, &rerankBreakdown) + var rerankPrepare, rerankSignals time.Duration + nodes, rerankPrepare, rerankSignals = applyRerankBoostsTimed(s, nodes, q, rctx, &rerankBreakdown) // Per-file diversification: keep one file's many symbols from // monopolising the head of the result set. Runs after the rerank // so demotion acts on final scores; nothing is dropped. + diversifyStart := time.Now() nodes, rerankBreakdown = diversifyByFile(nodes, rerankBreakdown, req.GetInt("max_per_file", defaultMaxPerFile)) + diversifyMS := time.Since(diversifyStart).Milliseconds() // Remember the returned IDs for attribution on later consume calls. // Cap at top limit so unseen "overflow" results don't get credited. @@ -1392,6 +1547,53 @@ func (s *Server) handleSearchSymbols(ctx context.Context, req mcp.CallToolReques } resp["rerank"] = encodeRerankBreakdown(pageBreakdown, s.engineFor(ctx).Rerank()) } + + // Per-phase Debug log line — single zap.Debug call carrying every + // timing field for this search_symbols invocation. The bench harness + // greps for the "search_symbols phases" message at --log-level + // debug; production runs at info level pay nothing. Tracked phases: + // BM25 primary / expansion calls (wall-clock around the engine), + // the inner GetNodesByIDs / FindNodesByName / Fallback hops (from + // the engine), rerank prepare (batched edge fetch) and signals + // (in-process scoring), diversify, and the candidate counts at + // gather → filter → final. + if s.logger != nil { + totalMS := time.Since(phaseStart).Milliseconds() + // "BM25 backend" cost = the BM25 wall-clock minus the inner + // phases the engine also accumulated under that call. Negative + // values are clamped to 0 (clock granularity / contention). + // BundleMS is subtracted too — it's a fold of the FTS + nodes + // + edge fetches that, on the legacy path, would have shown up + // in TextBackend / GetNodes / (no field for edges) separately. + bm25Backend := timings.BM25PrimaryMS + timings.BM25ExpansionMS - timings.GetNodesMS - timings.FindNameMS - timings.FallbackMS - timings.BundleMS + if bm25Backend < 0 { + bm25Backend = 0 + } + s.logger.Debug("search_symbols phases", + zap.String("query", q), + zap.Int("expansion_terms", len(expandedTerms)), + zap.Int64("bm25_primary_ms", timings.BM25PrimaryMS), + zap.Int64("bm25_expansion_ms", timings.BM25ExpansionMS), + zap.Int64("bm25_backend_ms", bm25Backend), + zap.Int64("text_backend_ms", timings.TextBackendMS), + zap.Int64("embed_ms", timings.EmbedMS), + zap.Int64("vector_search_ms", timings.VectorSearchMS), + zap.Int64("engine_rerank_ms", timings.EngineRerankMS), + zap.Int64("bundle_ms", timings.BundleMS), + zap.Float64("cache_hit_rate", timings.CacheHitRate), + zap.Int64("get_nodes_ms", timings.GetNodesMS), + zap.Int64("find_name_ms", timings.FindNameMS), + zap.Int64("fallback_ms", timings.FallbackMS), + zap.Duration("rerank_prepare_ms", rerankPrepare), + zap.Duration("rerank_signals_ms", rerankSignals), + zap.Int64("diversify_ms", diversifyMS), + zap.Int64("total_ms", totalMS), + zap.Int("cands_after_gather", candsAfterGather), + zap.Int("cands_after_filter", candsAfterFilter), + zap.Int("cands_final", len(nodes)), + ) + } + return s.respondJSONOrTOON(ctx, req, resp) } @@ -1446,7 +1648,20 @@ func roundTo(v float64, places int) float64 { return float64(int64(v*pow+0.5)) / pow } -func (s *Server) handleGetFileSummary(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { +func (s *Server) handleGetFileSummary(ctx context.Context, req mcp.CallToolRequest) (res *mcp.CallToolResult, retErr error) { + // Defensive panic recovery — get_file_summary has been observed + // to crash the MCP transport in multi-repo mode (file-content + // validation gap). Surface the panic as a tool error so the + // session survives. + defer func() { + if r := recover(); r != nil { + s.logger.Error("get_file_summary panic recovered", + zap.String("path", req.GetString("path", "")), + zap.Any("panic", r)) + res = mcp.NewToolResultError(fmt.Sprintf("get_file_summary internal error: %v", r)) + retErr = nil + } + }() fp, err := req.RequireString("path") if err != nil { return mcp.NewToolResultError("path is required"), nil @@ -1455,7 +1670,21 @@ func (s *Server) handleGetFileSummary(ctx context.Context, req mcp.CallToolReque // Auto re-index stale file before querying. s.ensureFresh([]string{fp}) - sg := s.engineFor(ctx).GetFileSymbols(fp) + // gcx is the high-volume agent format and only emits total_edges + // in its meta header — never per-edge rows. Route gcx-only calls + // through the count-only path so the disk backends skip + // materialising every adjacent edge across cgo (a 4 000-row + // round-trip on a 500-symbol file becomes two scalar aggregates). + // compact + json paths still take the full SubGraph because + // compact summarises edges per confidence label and json ships + // every edge in the body. + gcxOnly := s.isGCX(ctx, req) && !isCompact(req) + var sg *query.SubGraph + if gcxOnly { + sg = s.engineFor(ctx).GetFileSymbolsCounts(fp) + } else { + sg = s.engineFor(ctx).GetFileSymbols(fp) + } if len(sg.Nodes) == 0 { return mcp.NewToolResultError("no symbols found for file: " + fp), nil } @@ -1470,12 +1699,26 @@ func (s *Server) handleGetFileSummary(ctx context.Context, req mcp.CallToolReque return mcp.NewToolResultError("no symbols found for file in specified scope: " + fp), nil } + // get_file_summary's contract is "what symbols does this file + // define" — the file node itself and import nodes ride on + // GetFileSubGraph because they're useful for other walkers, but + // the encoder layer wants the symbols-only view. The compact + // path already filtered both kinds inline; the cleaner home is + // here so every output format (compact, gcx, json, toon) sees the + // same shape. + sg = stripNonDefinitionNodes(sg) + if len(sg.Nodes) == 0 { + return mcp.NewToolResultError("no symbols found for file: " + fp), nil + } + if isCompact(req) { return mcp.NewToolResultText(compactSubGraph(sg)), nil } - // ETag conditional fetch. - etag := computeETag(sg) + // ETag conditional fetch. Use the structural SubGraph hash — + // json.Marshal'ing the whole SubGraph + Meta on every call was the + // dominant cost on large files (~2 ms / call on a 500-symbol file). + etag := etagSubGraph(sg) if ifNoneMatch := req.GetString("if_none_match", ""); ifNoneMatch != "" && ifNoneMatch == etag { return notModifiedResult(etag), nil } diff --git a/internal/mcp/tools_coupling.go b/internal/mcp/tools_coupling.go index 4618fb5e..95f1f495 100644 --- a/internal/mcp/tools_coupling.go +++ b/internal/mcp/tools_coupling.go @@ -97,25 +97,45 @@ func (s *Server) handleGetCouplingMetrics(ctx context.Context, req mcp.CallToolR stats[u] = &units{ca: map[string]bool{}, ce: map[string]bool{}} } - for _, e := range s.graph.AllEdges() { - if !isCouplingEdge(e.Kind) { - continue - } - fromUnit, fromOK := nodeToUnit[e.From] - toUnit, toOK := nodeToUnit[e.To] - if !fromOK || !toOK { - continue - } - if fromUnit == toUnit { - stats[fromUnit].internal++ + // Iterate the coupling-edge buckets directly via EdgesByKind + // instead of AllEdges() + a Go-side filter — Ladybug's + // EdgesByKind runs one indexed Cypher per kind and ships only + // the matching rows. Structural edges (defines / member_of / + // contains-file-of-symbol) which dominate edge counts on large + // repos drop out before they cross cgo. Order is fixed so the + // loop body stays trivially identical to the legacy AllEdges + // branch. + for _, k := range []graph.EdgeKind{ + graph.EdgeCalls, + graph.EdgeImports, + graph.EdgeImplements, + graph.EdgeExtends, + graph.EdgeReferences, + graph.EdgeInstantiates, + graph.EdgeCrossRepoCalls, + graph.EdgeCrossRepoImplements, + graph.EdgeCrossRepoExtends, + } { + for e := range s.graph.EdgesByKind(k) { + if e == nil { + continue + } + fromUnit, fromOK := nodeToUnit[e.From] + toUnit, toOK := nodeToUnit[e.To] + if !fromOK || !toOK { + continue + } + if fromUnit == toUnit { + stats[fromUnit].internal++ + stats[fromUnit].total++ + continue + } + // Cross-unit: counts as ce for the source unit, ca for the target. + stats[fromUnit].ce[toUnit] = true stats[fromUnit].total++ - continue + stats[toUnit].ca[fromUnit] = true + stats[toUnit].total++ } - // Cross-unit: counts as ce for the source unit, ca for the target. - stats[fromUnit].ce[toUnit] = true - stats[fromUnit].total++ - stats[toUnit].ca[fromUnit] = true - stats[toUnit].total++ } rows := make([]couplingRow, 0, len(stats)) @@ -213,21 +233,3 @@ func packageOfPath(path string, depth int) string { return strings.Join(parts[:depth], "/") } -// isCouplingEdge identifies edges that signal real dependency -// — calls, imports, implements, extends, references, instantiates. -// Structural edges (defines, member_of) don't count. -func isCouplingEdge(k graph.EdgeKind) bool { - switch k { - case graph.EdgeCalls, - graph.EdgeImports, - graph.EdgeImplements, - graph.EdgeExtends, - graph.EdgeReferences, - graph.EdgeInstantiates, - graph.EdgeCrossRepoCalls, - graph.EdgeCrossRepoImplements, - graph.EdgeCrossRepoExtends: - return true - } - return false -} diff --git a/internal/mcp/tools_enhancements.go b/internal/mcp/tools_enhancements.go index 88557812..ded055f9 100644 --- a/internal/mcp/tools_enhancements.go +++ b/internal/mcp/tools_enhancements.go @@ -8,6 +8,7 @@ import ( "math" "os" "path/filepath" + "slices" "sort" "strings" "time" @@ -22,7 +23,6 @@ import ( "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/persistence" "github.com/zzet/gortex/internal/query" - "github.com/zzet/gortex/internal/releases" "github.com/zzet/gortex/internal/tokens" "go.uber.org/zap" ) @@ -35,6 +35,17 @@ func (s *Server) ensureFresh(filePaths []string) []string { if s.watcher != nil { return nil } + // In multi-repo mode the legacy single-Indexer's fileMtimes is + // always empty for cross-repo paths, so IsStale returns true for + // every file → IndexFile fires → race with the daemon's read + // surface, which has been observed to crash the MCP transport + // (CGo concurrency hazard on liblbug). The MultiIndexer's own + // per-repo watcher / Reconcile path owns freshness here; the + // single-Indexer auto-refresh is dead weight that does more harm + // than good. + if s.multiIndexer != nil { + return nil + } if s.indexer == nil { return nil } @@ -145,7 +156,7 @@ func (s *Server) registerEnhancementTools() { mcp.WithNumber("min_pct", mcp.Description("(coverage_gaps) Lower-inclusive coverage threshold — default 0")), mcp.WithNumber("max_pct", mcp.Description("(coverage_gaps) Upper-exclusive coverage threshold — default 100, i.e. anything not fully covered")), mcp.WithString("provider", mcp.Description("(stale_flags) Filter to a single provider — launchdarkly, growthbook, unleash, internal")), - mcp.WithString("tag", mcp.Description("(todos) Filter by tag — TODO / FIXME / HACK / XXX / NOTE — case-insensitive")), + mcp.WithString("tag", mcp.Description("(todos) Filter by tag — TODO / FIXME / HACK / XXX / NOTE — case-insensitive. (releases) Filter to one release tag — returns the file list whose meta.added_in matches; populate via enrich_releases first.")), mcp.WithString("assignee", mcp.Description("(todos) Filter by exact assignee — case-sensitive")), mcp.WithString("ticket", mcp.Description("(todos) Filter by exact ticket reference — e.g. PROJ-42")), mcp.WithBoolean("has_assignee", mcp.Description("(todos) Keep only TODOs that have an assignee set")), @@ -443,7 +454,7 @@ func (s *Server) handlePrefetchContext(ctx context.Context, req mcp.CallToolRequ // Gather recent symbols from parameter or session state. var recentIDs []string if recentStr != "" { - for _, id := range strings.Split(recentStr, ",") { + for id := range strings.SplitSeq(recentStr, ",") { recentIDs = append(recentIDs, strings.TrimSpace(id)) } } @@ -578,14 +589,7 @@ func (s *Server) handlePrefetchContext(ctx context.Context, req mcp.CallToolRequ var candidates []prefetchCandidate for id, sc := range scoreMap { // Exclude recently viewed symbols themselves - isRecent := false - for _, rid := range recentIDs { - if id == rid { - isRecent = true - break - } - } - if isRecent { + if slices.Contains(recentIDs, id) { continue } @@ -629,14 +633,8 @@ func (s *Server) handlePrefetchContext(ctx context.Context, req mcp.CallToolRequ if limit <= 0 { limit = 10 } - offset := decodeCursor(req.GetString("cursor", "")) - if offset > totalCount { - offset = totalCount - } - endIdx := offset + limit - if endIdx > totalCount { - endIdx = totalCount - } + offset := min(decodeCursor(req.GetString("cursor", "")), totalCount) + endIdx := min(offset+limit, totalCount) candidates = candidates[offset:endIdx] truncated := endIdx < totalCount nextCursor := "" @@ -697,7 +695,7 @@ func (s *Server) handlePrefetchContext(ctx context.Context, req mcp.CallToolRequ func (s *Server) handleAnalyze(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { kind, err := req.RequireString("kind") if err != nil { - return mcp.NewToolResultError("kind is required (one of: dead_code, hotspots, cycles, would_create_cycle, todos, blame, coverage, stale_code, ownership, coverage_gaps, stale_flags, releases, cgo_users, wasm_users, orphan_tables, unreferenced_tables, coverage_summary, channel_ops, goroutine_spawns, field_writers, race_writes, unclosed_channels, unsafe_patterns, health_score, annotation_users, config_readers, event_emitters, pubsub, string_emitters, error_surface, log_events, sql_rebuild, external_calls, routes, models, components, k8s_resources, images, kustomize, cross_repo, impact, named, tests_as_edges, connectivity_health)"), nil + return mcp.NewToolResultError("kind is required (one of: dead_code, hotspots, cycles, would_create_cycle, todos, blame, coverage, stale_code, ownership, coverage_gaps, stale_flags, releases, cgo_users, wasm_users, orphan_tables, unreferenced_tables, coverage_summary, channel_ops, goroutine_spawns, field_writers, race_writes, unclosed_channels, unsafe_patterns, health_score, annotation_users, config_readers, event_emitters, pubsub, string_emitters, error_surface, log_events, sql_rebuild, external_calls, routes, models, components, k8s_resources, images, kustomize, cross_repo, impact, named, tests_as_edges, connectivity_health, pagerank, louvain, wcc, scc, kcore)"), nil } switch kind { case "dead_code": @@ -810,8 +808,18 @@ func (s *Server) handleAnalyze(ctx context.Context, req mcp.CallToolRequest) (*m return s.handleAnalyzeTestsAsEdges(ctx, req) case "connectivity_health": return s.handleAnalyzeConnectivityHealth(ctx, req) + case "pagerank": + return s.handleAnalyzePageRank(ctx, req) + case "louvain": + return s.handleAnalyzeLouvain(ctx, req) + case "wcc": + return s.handleAnalyzeConnectedComponents(ctx, req, false) + case "scc": + return s.handleAnalyzeConnectedComponents(ctx, req, true) + case "kcore": + return s.handleAnalyzeKCore(ctx, req) default: - return mcp.NewToolResultError("unknown analyze kind: " + kind + " (expected: dead_code, hotspots, cycles, would_create_cycle, todos, blame, coverage, stale_code, ownership, coverage_gaps, stale_flags, releases, cgo_users, wasm_users, orphan_tables, unreferenced_tables, coverage_summary, channel_ops, goroutine_spawns, field_writers, race_writes, unclosed_channels, unsafe_patterns, sast, hygiene, health_score, annotation_users, config_readers, env_var_users, sql_call_sites, fixes_history, edge_audit, domain, event_emitters, pubsub, string_emitters, error_surface, log_events, sql_rebuild, external_calls, routes, models, components, k8s_resources, images, kustomize, cross_repo, dbt_models, impact, named, tests_as_edges, connectivity_health)"), nil + return mcp.NewToolResultError("unknown analyze kind: " + kind + " (expected: dead_code, hotspots, cycles, would_create_cycle, todos, blame, coverage, stale_code, ownership, coverage_gaps, stale_flags, releases, cgo_users, wasm_users, orphan_tables, unreferenced_tables, coverage_summary, channel_ops, goroutine_spawns, field_writers, race_writes, unclosed_channels, unsafe_patterns, sast, hygiene, health_score, annotation_users, config_readers, env_var_users, sql_call_sites, fixes_history, edge_audit, domain, event_emitters, pubsub, string_emitters, error_surface, log_events, sql_rebuild, external_calls, routes, models, components, k8s_resources, images, kustomize, cross_repo, dbt_models, impact, named, tests_as_edges, connectivity_health, pagerank, louvain, wcc, scc, kcore)"), nil } } @@ -847,10 +855,10 @@ func (s *Server) handleAnalyzeTodos(ctx context.Context, req mcp.CallToolRequest } var rows []todoRow - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindTodo { - continue - } + // Push the kind filter into the storage layer — todos are a + // tiny slice of the node table, so the AllNodes scan was the + // dominant cgo cost on Ladybug. + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindTodo}) { tag, _ := n.Meta["tag"].(string) assignee, _ := n.Meta["assignee"].(string) ticket, _ := n.Meta["ticket"].(string) @@ -1006,10 +1014,10 @@ func (s *Server) handleAnalyzeStaleCode(ctx context.Context, req mcp.CallToolReq AgeDays int `json:"age_days"` } var rows []staleRow - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { - continue - } + // Push the kind filter into the storage layer; the meta gate + // (last_authored.timestamp) stays in Go since the meta column is + // opaque to Cypher. + for _, n := range s.scopedNodesByKinds(ctx, allowedKindsSlice(allowedKinds)) { la, ok := n.Meta["last_authored"].(map[string]any) if !ok { continue @@ -1069,6 +1077,21 @@ func (s *Server) handleAnalyzeStaleCode(ctx context.Context, req mcp.CallToolReq }) } +// allowedKindsSlice returns the keys of an analyzer's allowedKinds +// set so the caller can hand them to scopedNodesByKinds. Kept as a +// helper rather than inlined at every call site so the order is +// deterministic — not load-bearing for correctness (the capability +// dedupes), but it keeps test expectations stable when the IN list +// is logged. +func allowedKindsSlice(allowed map[graph.NodeKind]struct{}) []graph.NodeKind { + out := make([]graph.NodeKind, 0, len(allowed)) + for k := range allowed { + out = append(out, k) + } + slices.Sort(out) + return out +} + // parseAnalyzeKindsFilter parses a comma-separated kinds argument // into the set used by handleAnalyzeStaleCode. The literal "all" // returns the broadest blame-eligible kind set so callers can drop @@ -1076,7 +1099,7 @@ func (s *Server) handleAnalyzeStaleCode(ctx context.Context, req mcp.CallToolReq // fields included too. func parseAnalyzeKindsFilter(arg string) map[graph.NodeKind]struct{} { out := map[graph.NodeKind]struct{}{} - for _, k := range strings.Split(arg, ",") { + for k := range strings.SplitSeq(arg, ",") { k = strings.TrimSpace(strings.ToLower(k)) if k == "" { continue @@ -1144,10 +1167,10 @@ func (s *Server) handleAnalyzeOwnership(ctx context.Context, req mcp.CallToolReq } byEmail := map[string]*ownerStats{} - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { - continue - } + // Kind pushdown — owners are derived from the blame meta on + // function/method (or wider) nodes; the analyzer scans tens of + // thousands of irrelevant nodes without it on Ladybug. + for _, n := range s.scopedNodesByKinds(ctx, allowedKindsSlice(allowedKinds)) { if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } @@ -1286,10 +1309,9 @@ func (s *Server) handleAnalyzeCoverageGaps(ctx context.Context, req mcp.CallTool Hit int `json:"hit"` } var rows []gapRow - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { - continue - } + // Kind pushdown — coverage_pct only ever lands on executable + // kinds, so the IN-list IS the candidate set. + for _, n := range s.scopedNodesByKinds(ctx, allowedKindsSlice(allowedKinds)) { if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } @@ -1401,10 +1423,12 @@ func (s *Server) handleAnalyzeStaleFlags(ctx context.Context, req mcp.CallToolRe var rows []staleFlag unscored := 0 - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindFlag { - continue - } + // Kind pushdown — KindFlag is a few hundred nodes max even on + // the biggest workspaces, so pulling AllNodes() to find them + // was pure cgo overhead. The caller batch below still does per- + // flag GetInEdges; pushing that into a single Cypher join is a + // separate follow-up since the join semantics differ per flag. + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindFlag}) { provider, _ := n.Meta["provider"].(string) if providerFilter != "" && provider != providerFilter { continue @@ -1536,10 +1560,9 @@ func (s *Server) handleAnalyzeOrphanTables(ctx context.Context, req mcp.CallTool QueryCount int `json:"query_count"` } var rows []orphanRow - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindTable { - continue - } + // Kind pushdown — only KindTable carries the providers/queries + // fan-in we care about; the rest of the node table is noise. + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindTable}) { // Walk incoming edges to detect both providers (migrations) // and consumers (query call sites). hasProvider := false @@ -1617,10 +1640,8 @@ func (s *Server) handleAnalyzeUnreferencedTables(ctx context.Context, req mcp.Ca ProviderCount int `json:"provider_count"` } var rows []unrefRow - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindTable { - continue - } + // Kind pushdown — same story as orphan_tables. + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindTable}) { providerCount := 0 queryCount := 0 for _, e := range s.graph.GetInEdges(n.ID) { @@ -1704,10 +1725,8 @@ func (s *Server) handleAnalyzeCoverageSummary(ctx context.Context, req mcp.CallT } byDir := map[string]*dirStats{} - for _, n := range s.scopedNodes(ctx) { - if _, ok := allowedKinds[n.Kind]; !ok { - continue - } + // Kind pushdown — coverage_pct only lives on executable kinds. + for _, n := range s.scopedNodesByKinds(ctx, allowedKindsSlice(allowedKinds)) { if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } @@ -1797,10 +1816,10 @@ func (s *Server) handleAnalyzeInteropUsers(ctx context.Context, req mcp.CallTool ID string `json:"id"` } var rows []interopFile - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindFile { - continue - } + // Kind pushdown — uses_cgo / uses_wasm_bindgen sentinels only + // live on file nodes; pulling AllNodes() to find them was pure + // cgo overhead on Ladybug. + for _, n := range s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindFile}) { if v, _ := n.Meta[metaKey].(bool); !v { continue } @@ -1830,34 +1849,159 @@ func (s *Server) handleAnalyzeInteropUsers(ctx context.Context, req mcp.CallTool }) } -// handleAnalyzeReleases walks git tags chronologically and stamps -// meta.added_in on every file node with the earliest tag whose -// tree contained that file. Symbols inherit indirectly via their -// owning file — answers "added in v1.4?" with one graph hop from -// any symbol to its file. Re-runnable: each call re-walks tags -// and overwrites existing meta. +// handleAnalyzeReleases reads the pre-computed release timeline from +// the graph. Inputs come from meta.added_in (stamped on KindFile +// nodes) and the KindRelease nodes the enricher materialises — one +// per tag, ordered, carrying file_count metadata. No git subprocess +// at read time. +// +// When nothing in scope carries release metadata the tool returns a +// structured error pointing the agent at `enrich_releases` (or the +// `gortex enrich releases` CLI) rather than silently returning an +// empty result; the latter would look like "this repo has no +// releases" even when the cause is "you haven't enriched yet". +// +// Optional filter `tag` returns only the named release with the list +// of files whose meta.added_in matches it — answers "what shipped in +// v1.4?" with a single graph scan. func (s *Server) handleAnalyzeReleases(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { - roots := s.collectRepoRoots(req.GetString("repo", "")) - if len(roots) == 0 { - return mcp.NewToolResultError("releases enrichment requires at least one indexed repo with a root path"), nil - } - total := 0 - perRepo := make(map[string]any, len(roots)) - for prefix, root := range roots { - count, err := releases.EnrichGraphWithRepoPrefix(s.graph, root, prefix) - if err != nil { - perRepo[prefix] = map[string]any{"root": root, "error": err.Error()} + if s.graph == nil { + return mcp.NewToolResultError("graph not initialized"), nil + } + repoFilter := strings.TrimSpace(req.GetString("repo", "")) + tagFilter := strings.TrimSpace(req.GetString("tag", "")) + + type releaseRow struct { + ID string `json:"id"` + Tag string `json:"tag"` + RepoPrefix string `json:"repo_prefix,omitempty"` + FileCount int `json:"file_count"` + Order int `json:"order"` + Files []string `json:"files,omitempty"` + } + releaseByTag := map[string]*releaseRow{} + for _, n := range s.graph.AllNodes() { + if n.Kind != graph.KindRelease { continue } - total += count - perRepo[prefix] = map[string]any{"root": root, "enriched": count} + if repoFilter != "" && n.RepoPrefix != repoFilter { + continue + } + row := &releaseRow{ + ID: n.ID, + Tag: n.Name, + RepoPrefix: n.RepoPrefix, + } + if n.Meta != nil { + row.FileCount = intFromAny(n.Meta["file_count"]) + row.Order = intFromAny(n.Meta["order"]) + } + key := releaseKey(n.RepoPrefix, n.Name) + releaseByTag[key] = row } + + if tagFilter != "" { + // Caller wants the file list for one release. We surface it + // from meta.added_in rather than a tree walk, so the answer + // is whatever the last enrich pass observed. + row, ok := releaseByTag[releaseKey(repoFilter, tagFilter)] + if !ok { + // Tolerate the no-prefix form: agents pass "v1.4" without + // realising the graph stores multi-repo tags as + // "/v1.4". Fall back to a tag-name-only match. + for k, r := range releaseByTag { + if r.Tag == tagFilter { + row = r + _ = k + break + } + } + } + if row == nil { + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "error": fmt.Sprintf("no KindRelease node for tag %q; run `enrich_releases` first", tagFilter), + "suggestion": "enrich_releases", + "releases": []releaseRow{}, + "total": 0, + }) + } + for _, n := range s.graph.AllNodes() { + if n.Kind != graph.KindFile || n.FilePath == "" { + continue + } + if repoFilter != "" && n.RepoPrefix != repoFilter { + continue + } + if n.Meta == nil { + continue + } + added, _ := n.Meta["added_in"].(string) + if added != row.Tag { + continue + } + row.Files = append(row.Files, n.FilePath) + } + sort.Strings(row.Files) + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "releases": []releaseRow{*row}, + "total": 1, + "tag": tagFilter, + "file_hits": len(row.Files), + }) + } + + // No tag filter: return the timeline. Use `order` (oldest=0) so + // callers can flip to newest-first via reverse. + if len(releaseByTag) == 0 { + // Distinguish "no enrichment yet" from "repo has no tags" by + // peeking at any file's meta.added_in. If even one file has + // the field set the enrichment ran and produced no releases + // (an unlikely combination; surface as an empty timeline); + // otherwise return the structured error. + hasAnyAddedIn := false + for _, n := range s.graph.AllNodes() { + if n.Kind == graph.KindFile && n.Meta != nil { + if _, ok := n.Meta["added_in"].(string); ok { + hasAnyAddedIn = true + break + } + } + } + if !hasAnyAddedIn { + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "error": "no release timeline in scope; run `enrich_releases` (or `gortex enrich releases`) to populate KindRelease nodes and meta.added_in", + "suggestion": "enrich_releases", + "releases": []releaseRow{}, + "total": 0, + }) + } + } + rows := make([]releaseRow, 0, len(releaseByTag)) + for _, r := range releaseByTag { + rows = append(rows, *r) + } + sort.Slice(rows, func(i, j int) bool { + if rows[i].Order != rows[j].Order { + return rows[i].Order < rows[j].Order + } + return rows[i].Tag < rows[j].Tag + }) return s.respondJSONOrTOON(ctx, req, map[string]any{ - "enriched": total, - "per_repo": perRepo, + "releases": rows, + "total": len(rows), }) } +// releaseKey builds the lookup key from a (repoPrefix, tag) pair so +// the tag-filtered path can compare scoped IDs against the bare +// agent input. +func releaseKey(repoPrefix, tag string) string { + if repoPrefix == "" { + return tag + } + return repoPrefix + "/" + tag +} + // handleAnalyzeBlame runs `git blame -p` against the indexed // repository and stamps meta.last_authored on each function / // method / type / interface / field / variable / constant / @@ -2039,7 +2183,12 @@ func (s *Server) handleFindHotspots(ctx context.Context, req mcp.CallToolRequest threshold = v } - entries := analysis.FindHotspots(s.graph, s.getCommunities(), threshold) + var entries []analysis.HotspotEntry + if threshold == 0 { + entries = s.getHotspots() + } else { + entries = analysis.FindHotspots(s.graph, s.getCommunities(), threshold) + } // K17: optional novelty / directional reranking modes. Default // "complexity" preserves the legacy ranking. @@ -2114,7 +2263,7 @@ func (s *Server) handleFindHotspots(ctx context.Context, req mcp.CallToolRequest // multi-repo mode. type scaffoldReader struct{ s *Server } -func (r scaffoldReader) Graph() *graph.Graph { return r.s.graph } +func (r scaffoldReader) Graph() graph.Store { return r.s.graph } func (r scaffoldReader) ResolveFilePath(graphPath string) string { abs, err := r.s.resolveGraphPath(graphPath) if err != nil { @@ -2162,13 +2311,8 @@ func (s *Server) handleScaffold(ctx context.Context, req mcp.CallToolRequest) (* return mcp.NewToolResultError(fmt.Sprintf("could not read %s: %v", edit.FilePath, readErr)), nil } lines := strings.Split(string(content), "\n") - insertIdx := edit.InsertionLine - 1 - if insertIdx < 0 { - insertIdx = 0 - } - if insertIdx > len(lines) { - insertIdx = len(lines) - } + insertIdx := max(edit.InsertionLine-1, 0) + insertIdx = min(insertIdx, len(lines)) newLines := make([]string, 0, len(lines)+strings.Count(edit.Code, "\n")+2) newLines = append(newLines, lines[:insertIdx]...) newLines = append(newLines, "") @@ -2520,10 +2664,7 @@ func (s *Server) buildIndexHealthPayload() map[string]any { } } - successfullyIndexed := totalDetected - len(parseErrors) - if successfullyIndexed < 0 { - successfullyIndexed = 0 - } + successfullyIndexed := max(totalDetected-len(parseErrors), 0) var healthScore float64 if totalDetected > 0 { @@ -2886,10 +3027,7 @@ func (s *Server) handleBatchEdit(ctx context.Context, req mcp.CallToolRequest) ( for i := 0; i < node.StartLine-1 && i < len(lines); i++ { symbolStart += len(lines[i]) + 1 } - symbolEnd := symbolStart + len(symbolSource) - if symbolEnd > len(fileStr) { - symbolEnd = len(fileStr) - } + symbolEnd := min(symbolStart+len(symbolSource), len(fileStr)) offset := strings.Index(fileStr[symbolStart:symbolEnd], o.edit.OldSource) if offset < 0 { @@ -3063,10 +3201,7 @@ func (s *Server) handleGetContracts(ctx context.Context, req mcp.CallToolRequest if contractsOffset > contractsTotal { contractsOffset = contractsTotal } - contractsEnd := contractsOffset + contractsLimit - if contractsEnd > contractsTotal { - contractsEnd = contractsTotal - } + contractsEnd := min(contractsOffset+contractsLimit, contractsTotal) filtered = filtered[contractsOffset:contractsEnd] contractsTruncated := contractsEnd < contractsTotal contractsNextCursor := "" diff --git a/internal/mcp/tools_enrich_churn.go b/internal/mcp/tools_enrich_churn.go new file mode 100644 index 00000000..4d28f206 --- /dev/null +++ b/internal/mcp/tools_enrich_churn.go @@ -0,0 +1,102 @@ +package mcp + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/churn" +) + +// registerEnrichChurnTool exposes the churn enricher as an MCP tool so +// agents (and the post-commit / post-merge git hook driving `gortex +// enrich churn`) can refresh per-symbol churn data without going +// through the daemon control socket. The handler runs the enricher +// in-process against s.graph, so it inherits whatever backend the +// daemon was launched with — LadyBug for persistence, in-memory for +// CI / one-off invocations. +// +// The accompanying `get_churn_rate` tool reads from the same +// meta.churn fields this tool writes; pre-computation here is what +// makes the read path a sub-second graph scan. +func (s *Server) registerEnrichChurnTool() { + s.addTool( + mcp.NewTool("enrich_churn", + mcp.WithDescription("Pre-compute per-file and per-symbol git churn data and stamp it on graph nodes so `get_churn_rate` can answer without a git subprocess. Walks `git log ` and `git blame ` once per file, then projects line-range commit counts onto every function/method node. The branch is the repository's default branch (origin/main, then origin/master, then local main/master/trunk) unless `branch` overrides. Idempotent: re-running updates the same Meta fields in place. Daemons backed by LadyBug persist the result across restarts; in-memory daemons recompute on next call."), + mcp.WithString("branch", mcp.Description("Branch / tag / SHA to compute churn against. Empty means resolve the repository's default branch.")), + mcp.WithString("path", mcp.Description("Optional path or repo prefix to scope the enrichment. Multi-repo daemons enrich every tracked repo when empty.")), + mcp.WithString("format", mcp.Description("Output format: json (default), gcx, or toon")), + ), + s.handleEnrichChurn, + ) +} + +func (s *Server) handleEnrichChurn(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + if s.graph == nil { + return mcp.NewToolResultError("graph not initialized"), nil + } + branch := strings.TrimSpace(req.GetString("branch", "")) + pathArg := strings.TrimSpace(req.GetString("path", "")) + + // Resolve targets: one repo root per tracked repo, optionally + // filtered by path (matched as either prefix or absolute root). + type target struct { + prefix string + root string + } + var targets []target + if s.multiIndexer != nil { + for prefix, meta := range s.multiIndexer.AllMetadata() { + if pathArg != "" && pathArg != prefix && pathArg != meta.RootPath { + continue + } + targets = append(targets, target{prefix: prefix, root: meta.RootPath}) + } + } + if len(targets) == 0 { + return mcp.NewToolResultError(fmt.Sprintf("no tracked repo matches %q", pathArg)), nil + } + + started := time.Now() + type perRepo struct { + Prefix string `json:"prefix"` + Branch string `json:"branch"` + HeadSHA string `json:"head_sha"` + Files int `json:"files"` + Symbols int `json:"symbols"` + Skipped string `json:"skipped,omitempty"` + } + var per []perRepo + totalFiles, totalSymbols := 0, 0 + for _, t := range targets { + b := branch + if b == "" { + b = churn.DefaultBranch(t.root) + } + if b == "" { + per = append(per, perRepo{Prefix: t.prefix, Skipped: "no default branch resolvable"}) + continue + } + res, err := churn.EnrichGraph(ctx, s.graph, t.root, churn.Options{Branch: b}) + if err != nil { + per = append(per, perRepo{Prefix: t.prefix, Branch: b, Skipped: err.Error()}) + continue + } + per = append(per, perRepo{ + Prefix: t.prefix, Branch: res.Branch, HeadSHA: res.HeadSHA, + Files: res.Files, Symbols: res.Symbols, + }) + totalFiles += res.Files + totalSymbols += res.Symbols + } + + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "repos": per, + "files": totalFiles, + "symbols": totalSymbols, + "duration_ms": time.Since(started).Milliseconds(), + }) +} diff --git a/internal/mcp/tools_enrich_releases.go b/internal/mcp/tools_enrich_releases.go new file mode 100644 index 00000000..18bb8f82 --- /dev/null +++ b/internal/mcp/tools_enrich_releases.go @@ -0,0 +1,92 @@ +package mcp + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/mark3labs/mcp-go/mcp" + + "github.com/zzet/gortex/internal/churn" + "github.com/zzet/gortex/internal/releases" +) + +// registerEnrichReleasesTool exposes the releases enricher as an MCP +// tool. `analyze kind=releases` is now a pure read — populating the +// per-file meta.added_in and the KindRelease timeline is this tool's +// job (counterpart to enrich_churn). +// +// Branch constrains the considered tags to those reachable from the +// branch — typically the repo's default branch — so topic-branch tags +// don't pollute the timeline. Empty branch means "every tag", matching +// the legacy behaviour. +func (s *Server) registerEnrichReleasesTool() { + s.addTool( + mcp.NewTool("enrich_releases", + mcp.WithDescription("Pre-compute the release timeline: list tags on the default branch (or `branch` override), stamp meta.added_in on every file present in each tag's tree, and materialise one KindRelease node per tag. The read tool `analyze kind=releases` then answers from this Meta without re-walking git. Idempotent; LadyBug-backed daemons persist the result across restarts."), + mcp.WithString("branch", mcp.Description("Branch / tag / SHA whose reachable tag set bounds the timeline. Empty resolves the repo's default branch; pass a value to override.")), + mcp.WithString("path", mcp.Description("Optional path or repo prefix to scope the enrichment. Multi-repo daemons enrich every tracked repo when empty.")), + mcp.WithString("format", mcp.Description("Output format: json (default), gcx, or toon")), + ), + s.handleEnrichReleases, + ) +} + +func (s *Server) handleEnrichReleases(ctx context.Context, req mcp.CallToolRequest) (*mcp.CallToolResult, error) { + if s.graph == nil { + return mcp.NewToolResultError("graph not initialized"), nil + } + branchArg := strings.TrimSpace(req.GetString("branch", "")) + pathArg := strings.TrimSpace(req.GetString("path", "")) + + type target struct { + prefix string + root string + } + var targets []target + if s.multiIndexer != nil { + for prefix, meta := range s.multiIndexer.AllMetadata() { + if pathArg != "" && pathArg != prefix && pathArg != meta.RootPath { + continue + } + targets = append(targets, target{prefix: prefix, root: meta.RootPath}) + } + } + if len(targets) == 0 { + return mcp.NewToolResultError(fmt.Sprintf("no tracked repo matches %q", pathArg)), nil + } + _ = ctx + + started := time.Now() + type perRepo struct { + Prefix string `json:"prefix"` + Branch string `json:"branch,omitempty"` + Files int `json:"files"` + Skipped string `json:"skipped,omitempty"` + } + var per []perRepo + totalFiles := 0 + for _, t := range targets { + b := branchArg + if b == "" { + b = churn.DefaultBranch(t.root) + // b can stay "" — releases.EnrichGraphForBranch treats + // that as "every tag", the right fallback when no default + // branch resolves. + } + count, err := releases.EnrichGraphForBranch(s.graph, t.root, t.prefix, b) + if err != nil { + per = append(per, perRepo{Prefix: t.prefix, Branch: b, Skipped: err.Error()}) + continue + } + per = append(per, perRepo{Prefix: t.prefix, Branch: b, Files: count}) + totalFiles += count + } + + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "repos": per, + "files": totalFiles, + "duration_ms": time.Since(started).Milliseconds(), + }) +} diff --git a/internal/mcp/tools_extract_candidates.go b/internal/mcp/tools_extract_candidates.go index aedb26a3..22d4d826 100644 --- a/internal/mcp/tools_extract_candidates.go +++ b/internal/mcp/tools_extract_candidates.go @@ -57,6 +57,93 @@ func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.Call pathPrefix := strings.TrimSpace(req.GetString("path_prefix", "")) limit := max(req.GetInt("limit", 25), 1) + rows := s.collectExtractionCandidates(ctx, minLines, minCallers, minFanOut, pathPrefix) + + sort.Slice(rows, func(i, j int) bool { + if rows[i].Score != rows[j].Score { + return rows[i].Score > rows[j].Score + } + return rows[i].ID < rows[j].ID + }) + truncated := false + if len(rows) > limit { + rows = rows[:limit] + truncated = true + } + + return s.respondJSONOrTOON(ctx, req, map[string]any{ + "candidates": rows, + "total": len(rows), + "truncated": truncated, + "thresholds": map[string]any{ + "min_lines": minLines, + "min_callers": minCallers, + "min_fan_out": minFanOut, + }, + }) +} + +// collectExtractionCandidates evaluates the three threshold gates +// (min lines, min callers, min fan-out) over every function/method +// in scope, returning the surviving rows. +// +// Picks ExtractCandidatesScanner when the backend implements it: that +// path runs the caller-count + fan-out aggregations server-side in +// one Cypher per direction instead of the AllNodes + per-node +// GetInEdges + GetOutEdges loop the fallback runs. On Ladybug the +// fallback fires 2N cgo round-trips per call and materialises every +// edge bucket just to count distinct endpoints. The pushdown drops +// the call to two aggregations the planner can index. +// +// The session's workspace scope is applied as a post-filter when +// the capability is used — kind / threshold pre-filtering is the +// dominant win, so workspace gating Go-side is cheap. +func (s *Server) collectExtractionCandidates( + ctx context.Context, + minLines, minCallers, minFanOut int, + pathPrefix string, +) []extractCandidateRow { + callKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeCrossRepoCalls} + if scanner, ok := s.graph.(graph.ExtractCandidatesScanner); ok { + raw := scanner.ExtractCandidates(callKinds, minLines, minCallers, minFanOut, pathPrefix) + // Session-scope post-filter: skip the lookup when the session + // is unbound (every node is in scope) so the bench-friendly + // path stays a pure stream of rows. + _, _, bound := s.sessionScope(ctx) + var scopeIDs map[string]*graph.Node + if bound { + ids := make([]string, 0, len(raw)) + for _, r := range raw { + ids = append(ids, r.NodeID) + } + scopeIDs = s.graph.GetNodesByIDs(ids) + } + out := make([]extractCandidateRow, 0, len(raw)) + for _, r := range raw { + if bound { + n := scopeIDs[r.NodeID] + if n == nil || !s.nodeInSessionScope(ctx, n) { + continue + } + } + score := math.Log1p(float64(r.LineCount)) * + math.Log1p(float64(r.CallerCount)) * + math.Log1p(float64(r.FanOut)) + out = append(out, extractCandidateRow{ + ID: r.NodeID, Name: r.Name, File: r.FilePath, + StartLine: r.StartLine, + EndLine: r.EndLine, + LineCount: r.LineCount, + CallerCount: r.CallerCount, + FanOut: r.FanOut, + Score: roundScore(score), + Rationale: buildExtractRationale(r.LineCount, r.CallerCount, r.FanOut), + }) + } + return out + } + // In-memory fallback — kept inline so the call site doesn't + // branch on the capability twice. scoped := s.scopedNodes(ctx) rows := make([]extractCandidateRow, 0, len(scoped)) for _, n := range scoped { @@ -73,7 +160,6 @@ func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.Call if lineCount < minLines { continue } - callers := callerCount(s.graph, n.ID) if callers < minCallers { continue @@ -82,13 +168,9 @@ func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.Call if fanOut < minFanOut { continue } - - // Log-scaled composite — long-tail values don't dominate the - // short-tail. Adding 1 inside each log keeps the score >= 0. score := math.Log1p(float64(lineCount)) * math.Log1p(float64(callers)) * math.Log1p(float64(fanOut)) - rows = append(rows, extractCandidateRow{ ID: n.ID, Name: n.Name, File: n.FilePath, StartLine: n.StartLine, EndLine: n.EndLine, @@ -99,34 +181,12 @@ func (s *Server) handleGetExtractionCandidates(ctx context.Context, req mcp.Call Rationale: buildExtractRationale(lineCount, callers, fanOut), }) } - - sort.Slice(rows, func(i, j int) bool { - if rows[i].Score != rows[j].Score { - return rows[i].Score > rows[j].Score - } - return rows[i].ID < rows[j].ID - }) - truncated := false - if len(rows) > limit { - rows = rows[:limit] - truncated = true - } - - return s.respondJSONOrTOON(ctx, req, map[string]any{ - "candidates": rows, - "total": len(rows), - "truncated": truncated, - "thresholds": map[string]any{ - "min_lines": minLines, - "min_callers": minCallers, - "min_fan_out": minFanOut, - }, - }) + return rows } // callerCount returns the number of distinct call-site origins for // the given node. Counts EdgeCalls and the cross-repo call variant. -func callerCount(g *graph.Graph, id string) int { +func callerCount(g graph.Store, id string) int { seen := map[string]bool{} for _, e := range g.GetInEdges(id) { if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeCrossRepoCalls { @@ -140,7 +200,7 @@ func callerCount(g *graph.Graph, id string) int { // distinctCalleeCount returns how many distinct functions/methods // the node calls. Proxy for internal complexity — a function that // orchestrates 20 different callees is probably doing too much. -func distinctCalleeCount(g *graph.Graph, id string) int { +func distinctCalleeCount(g graph.Store, id string) int { seen := map[string]bool{} for _, e := range g.GetOutEdges(id) { if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeCrossRepoCalls { diff --git a/internal/mcp/tools_find_declaration.go b/internal/mcp/tools_find_declaration.go index 23538970..db4b3ffb 100644 --- a/internal/mcp/tools_find_declaration.go +++ b/internal/mcp/tools_find_declaration.go @@ -88,7 +88,12 @@ func (s *Server) handleFindDeclaration(ctx context.Context, req mcp.CallToolRequ // Stage 2 — resolve each use site to a declaration. eng := s.engineFor(ctx) - fileIdx := buildDeclFileIndex(eng, matches) + // Pass the NodesInFilesByKindFinder capability when the backend + // implements it; buildDeclFileIndex falls back to AllNodes() when + // finder is nil (e.g. behind an overlay view that doesn't expose + // the capability). + finder, _ := s.graph.(graph.NodesInFilesByKindFinder) + fileIdx := buildDeclFileIndex(eng, finder, matches) groups := make(map[string]*declGroup) var declOrder []string @@ -173,24 +178,63 @@ func (s *Server) findUseSiteMatches(useSite string, isRegex bool, pathPrefix str // matches, so the enclosing symbol of any match line can be found // quickly. It mirrors buildFileSymbolIndex but is keyed off the match // set directly rather than astquery targets. -func buildDeclFileIndex(eng *query.Engine, matches []trigram.Match) map[string]*fileSymbolIndex { +// +// finder may be nil when no NodesInFilesByKindFinder-capable backend +// is available (e.g. when running through an editor-buffer overlay +// whose underlying view doesn't expose the capability); the function +// then falls back to walking eng.AllNodes() Go-side, identical to +// the pre-capability shape. Backends that ship the capability +// (Ladybug) collapse the per-call node fetch into one Cypher join +// scoped to the trigram-match file set — on the gortex workspace +// that was ~70k AllNodes() rows over cgo just to keep the few +// hundred whose FilePath sat in the small match-file set. +func buildDeclFileIndex(eng *query.Engine, finder graph.NodesInFilesByKindFinder, matches []trigram.Match) map[string]*fileSymbolIndex { wanted := make(map[string]struct{}, len(matches)) + files := make([]string, 0, len(matches)) for _, m := range matches { + if _, ok := wanted[m.Path]; ok { + continue + } wanted[m.Path] = struct{}{} + files = append(files, m.Path) } out := make(map[string]*fileSymbolIndex, len(wanted)) - for _, n := range eng.AllNodes() { - if _, ok := wanted[n.FilePath]; !ok { - continue + + add := func(n *graph.Node) { + if n == nil { + return } - switch n.Kind { - case graph.KindFunction, graph.KindMethod, graph.KindClosure, graph.KindType, graph.KindInterface: - idx := out[n.FilePath] - if idx == nil { - idx = &fileSymbolIndex{} - out[n.FilePath] = idx + idx := out[n.FilePath] + if idx == nil { + idx = &fileSymbolIndex{} + out[n.FilePath] = idx + } + idx.add(n) + } + + if finder != nil { + kinds := []graph.NodeKind{ + graph.KindFunction, + graph.KindMethod, + graph.KindClosure, + graph.KindType, + graph.KindInterface, + } + for _, n := range finder.NodesInFilesByKind(files, kinds) { + if _, ok := wanted[n.FilePath]; !ok { + continue + } + add(n) + } + } else { + for _, n := range eng.AllNodes() { + if _, ok := wanted[n.FilePath]; !ok { + continue + } + switch n.Kind { + case graph.KindFunction, graph.KindMethod, graph.KindClosure, graph.KindType, graph.KindInterface: + add(n) } - idx.add(n) } } for _, idx := range out { @@ -212,7 +256,7 @@ func resolveUseSiteDecl(eng *query.Engine, fileIdx map[string]*fileSymbolIndex, if e.Line != m.Line || !declResolveKinds[e.Kind] { continue } - if strings.HasPrefix(e.To, "unresolved::") || strings.HasPrefix(e.To, "external::") { + if graph.IsUnresolvedTarget(e.To) || strings.HasPrefix(e.To, "external::") { continue } // Prefer a call edge over a plain reference when the diff --git a/internal/mcp/tools_graph_completion.go b/internal/mcp/tools_graph_completion.go index ade6f675..ded90ea3 100644 --- a/internal/mcp/tools_graph_completion.go +++ b/internal/mcp/tools_graph_completion.go @@ -100,14 +100,22 @@ func (s *Server) handleGraphCompletionSearch(ctx context.Context, req mcp.CallTo // substring (case-insensitive). Replaceable by callers who plug in // vector search or another retrieval scheme via the public Retriever // interface. -func (s *Server) nameMatchSeeder(ctx context.Context, g *graph.Graph, query string, limit int) ([]*rerank.Candidate, error) { - q := strings.ToLower(query) - out := make([]*rerank.Candidate, 0, limit) - for _, n := range g.AllNodes() { - if ctx.Err() != nil { - return out, ctx.Err() - } - if !strings.Contains(strings.ToLower(n.Name), q) { +func (s *Server) nameMatchSeeder(ctx context.Context, g graph.Store, query string, limit int) ([]*rerank.Candidate, error) { + // FindNodesByNameContaining pushes the case-insensitive substring + // filter into the backend — on Ladybug that's a Cypher + // WHERE LOWER(n.name) CONTAINS $q against the indexed name column, + // so only matching rows cross cgo instead of the legacy AllNodes() + // materialisation + per-row Go string check. The in-memory backend + // already had a tight implementation behind the same surface, so + // this is a strict win on disk backends and matches today's cost + // in-memory. + matches := g.FindNodesByNameContaining(query, limit) + if ctx.Err() != nil { + return nil, ctx.Err() + } + out := make([]*rerank.Candidate, 0, len(matches)) + for _, n := range matches { + if n == nil { continue } out = append(out, &rerank.Candidate{Node: n, TextRank: len(out)}) diff --git a/internal/mcp/tools_graph_query.go b/internal/mcp/tools_graph_query.go index db62fd9c..f29bee12 100644 --- a/internal/mcp/tools_graph_query.go +++ b/internal/mcp/tools_graph_query.go @@ -3,6 +3,7 @@ package mcp import ( "context" "fmt" + "iter" "regexp" "strings" @@ -270,12 +271,47 @@ func evalGraphQuery(eng *query.Engine, stages []gqStage, limit int) (*query.SubG for _, st := range stages { switch st.kind { case gqStageNodes: - for _, n := range eng.AllNodes() { - if matchesAll(n, st.filters) { - add(n) - if len(working) >= limit { + // When the pipeline opens with a `kind=` predicate (the + // common case — e.g. `nodes kind=function ...`), iterate + // the backend's per-kind bucket instead of AllNodes(). On + // Ladybug NodesByKind hits a server-side filter and only + // the matching rows cross cgo; AllNodes() materialised the + // whole node table per request. Other filters + // (`name~`/`path=`/`lang=`) still post-filter in Go. + // + // Overlay views (NodesByKindReader-unaware) fall through + // to the AllNodes() walk — they're already in-memory, so + // the bucket optimisation has no win there. + seedKinds := seedKindsFromFilters(st.filters) + byKind, _ := eng.Reader().(nodesByKindReader) + if byKind != nil && len(seedKinds) > 0 { + done := false + for _, k := range seedKinds { + if done { break } + for n := range byKind.NodesByKind(k) { + if n == nil { + continue + } + if !matchesAll(n, st.filters) { + continue + } + add(n) + if len(working) >= limit { + done = true + break + } + } + } + } else { + for _, n := range eng.AllNodes() { + if matchesAll(n, st.filters) { + add(n) + if len(working) >= limit { + break + } + } } } @@ -340,7 +376,7 @@ func evalGraphQuery(eng *query.Engine, stages []gqStage, limit int) (*query.SubG } targetID = e.From } - if strings.HasPrefix(targetID, "unresolved::") || + if graph.IsUnresolvedTarget(targetID) || strings.HasPrefix(targetID, "external::") { continue } @@ -398,3 +434,35 @@ func evalGraphQuery(eng *query.Engine, stages []gqStage, limit int) (*query.SubG TotalEdges: len(edges), }, nil } + +// nodesByKindReader is the optional read-side capability the eng.Reader +// underlying type may implement. *graph.Graph satisfies it directly +// (Store has NodesByKind); OverlaidView does not, which is fine — +// overlays already work in-memory and don't benefit from the bucket +// fast path. +type nodesByKindReader interface { + NodesByKind(kind graph.NodeKind) iter.Seq[*graph.Node] +} + +// seedKindsFromFilters extracts every `kind=` predicate from a stage's +// filter list so the seed loop can iterate the corresponding NodesByKind +// buckets instead of AllNodes(). Returns nil when no `kind=` filter is +// present — the caller falls back to the AllNodes() walk in that case. +// Duplicates are deduped so a sloppy author writing `kind=function +// kind=function` doesn't double-iterate. +func seedKindsFromFilters(filters []gqFilter) []graph.NodeKind { + var out []graph.NodeKind + seen := make(map[graph.NodeKind]struct{}, len(filters)) + for _, f := range filters { + if f.op != "kind=" { + continue + } + k := graph.NodeKind(f.value) + if _, ok := seen[k]; ok { + continue + } + seen[k] = struct{}{} + out = append(out, k) + } + return out +} diff --git a/internal/mcp/tools_knowledge_gaps.go b/internal/mcp/tools_knowledge_gaps.go index 9d6c5e7d..2e052b0a 100644 --- a/internal/mcp/tools_knowledge_gaps.go +++ b/internal/mcp/tools_knowledge_gaps.go @@ -78,11 +78,17 @@ func (s *Server) handleGetKnowledgeGaps(ctx context.Context, req mcp.CallToolReq perCategoryLimit := max(req.GetInt("limit_per_category", 20), 1) pathPrefix := strings.TrimSpace(req.GetString("path_prefix", "")) - scoped := s.scopedNodes(ctx) + // degreeByID maps node id -> (in, out) edge counts for every + // function/method in scope, computed once via the backend's + // NodeDegreeByKinds path when available. The legacy + // NodeDegreeCounts route shipped a 30k-element IN-list per call + // on Ladybug; NodeDegreeByKinds runs the same aggregate over the + // kind-filtered node set so the planner never builds the list. + degreeByID, scoped := s.scopedFunctionDegrees(ctx, pathPrefix) - disconnected := s.collectDisconnected(scoped, pathPrefix, perCategoryLimit) + disconnected := s.collectDisconnected(scoped, pathPrefix, perCategoryLimit, degreeByID) thin, singleFile := s.collectCommunityGaps(thinSize, pathPrefix, perCategoryLimit) - untested := s.collectUntestedHotspots(scoped, pathPrefix, hotspotLimit, minCov, perCategoryLimit) + untested := s.collectUntestedHotspots(scoped, pathPrefix, hotspotLimit, minCov, perCategoryLimit, degreeByID) return s.respondJSONOrTOON(ctx, req, map[string]any{ "disconnected_nodes": disconnected, @@ -104,27 +110,102 @@ func (s *Server) handleGetKnowledgeGaps(ctx context.Context, req mcp.CallToolReq }) } +// scopedFunctionDegrees returns the per-node in/out degree map and +// the scoped function/method node list, in two pushdown calls. +// NodeDegreeByKinds runs server-side over the kind-filtered node +// table — the previous path fed NodeDegreeCounts a 30k-element +// IN-list, which the planner had to materialise before joining. The +// scoped node list is built from NodesByKinds (or AllNodes when the +// backend has no NodesByKindsScanner) and post-filtered for the +// session workspace, matching scopedNodesByKinds' contract. +func (s *Server) scopedFunctionDegrees(ctx context.Context, pathPrefix string) (map[string]graph.NodeDegreeRow, []*graph.Node) { + kinds := []graph.NodeKind{graph.KindFunction, graph.KindMethod} + scoped := s.scopedNodesByKinds(ctx, kinds) + var degByID map[string]graph.NodeDegreeRow + if dk, ok := s.graph.(graph.NodeDegreeByKinds); ok { + rows := dk.NodeDegreeByKinds(kinds, pathPrefix) + degByID = make(map[string]graph.NodeDegreeRow, len(rows)) + for _, r := range rows { + degByID[r.NodeID] = r + } + } + return degByID, scoped +} + // collectDisconnected returns function/method nodes with zero // incoming and zero outgoing edges in the scoped subgraph. The // kind filter mirrors handleAnalyzeCoverageGaps' default — variables // and constants always look disconnected, so including them would // flood the result. -func (s *Server) collectDisconnected(scoped []*graph.Node, pathPrefix string, limit int) []gapDisconnected { - out := make([]gapDisconnected, 0) +// +// Reads from the prebuilt degree map when present (the storage +// backend computed it once in scopedFunctionDegrees), falls back to +// per-node GetInEdges / GetOutEdges otherwise. The legacy +// NodeDegreeAggregator path is kept as a tertiary fallback for +// backends that publish NodeDegreeCounts but not NodeDegreeByKinds. +func (s *Server) collectDisconnected(scoped []*graph.Node, pathPrefix string, limit int, degreeByID map[string]graph.NodeDegreeRow) []gapDisconnected { + candidates := make([]*graph.Node, 0, len(scoped)) for _, n := range scoped { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } - if len(s.graph.GetInEdges(n.ID)) > 0 || len(s.graph.GetOutEdges(n.ID)) > 0 { - continue + candidates = append(candidates, n) + } + + out := make([]gapDisconnected, 0) + switch { + case degreeByID != nil: + for _, n := range candidates { + r, ok := degreeByID[n.ID] + if !ok { + // Absent from the aggregate => zero edges, by + // definition of the kind-filtered aggregate. + out = append(out, gapDisconnected{ + ID: n.ID, Name: n.Name, Kind: string(n.Kind), + File: n.FilePath, Line: n.StartLine, + }) + continue + } + if r.InCount > 0 || r.OutCount > 0 { + continue + } + out = append(out, gapDisconnected{ + ID: n.ID, Name: n.Name, Kind: string(n.Kind), + File: n.FilePath, Line: n.StartLine, + }) + } + default: + if agg, ok := s.graph.(graph.NodeDegreeAggregator); ok && len(candidates) > 0 { + ids := make([]string, 0, len(candidates)) + byID := make(map[string]*graph.Node, len(candidates)) + for _, n := range candidates { + ids = append(ids, n.ID) + byID[n.ID] = n + } + for _, r := range agg.NodeDegreeCounts(ids, nil) { + if r.InCount > 0 || r.OutCount > 0 { + continue + } + n := byID[r.NodeID] + if n == nil { + continue + } + out = append(out, gapDisconnected{ + ID: n.ID, Name: n.Name, Kind: string(n.Kind), + File: n.FilePath, Line: n.StartLine, + }) + } + } else { + for _, n := range candidates { + if len(s.graph.GetInEdges(n.ID)) > 0 || len(s.graph.GetOutEdges(n.ID)) > 0 { + continue + } + out = append(out, gapDisconnected{ + ID: n.ID, Name: n.Name, Kind: string(n.Kind), + File: n.FilePath, Line: n.StartLine, + }) + } } - out = append(out, gapDisconnected{ - ID: n.ID, Name: n.Name, Kind: string(n.Kind), - File: n.FilePath, Line: n.StartLine, - }) } sort.Slice(out, func(i, j int) bool { if out[i].File != out[j].File { @@ -193,20 +274,50 @@ func (s *Server) collectCommunityGaps(thinSize int, pathPrefix string, limit int // coverage_pct < minCov or no coverage data at all. Independent of // analyze hotspots (which gates on mean+2σ) so it still surfaces // load-bearing nodes in small repos. -func (s *Server) collectUntestedHotspots(scoped []*graph.Node, pathPrefix string, hotspotLimit int, minCov float64, limit int) []gapUntestedHotspot { +// +// Reads from the prebuilt NodeDegreeByKinds aggregate when present; +// falls back to NodeDegreeAggregator (the older IN-list shape) for +// backends that only publish that one, and finally to per-node +// GetInEdges for everyone else. +func (s *Server) collectUntestedHotspots(scoped []*graph.Node, pathPrefix string, hotspotLimit int, minCov float64, limit int, degreeByID map[string]graph.NodeDegreeRow) []gapUntestedHotspot { type ranked struct { node *graph.Node fanIn int } - candidates := make([]ranked, 0, len(scoped)) + pool := make([]*graph.Node, 0, len(scoped)) for _, n := range scoped { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } if pathPrefix != "" && !strings.HasPrefix(n.FilePath, pathPrefix) { continue } - candidates = append(candidates, ranked{node: n, fanIn: len(s.graph.GetInEdges(n.ID))}) + pool = append(pool, n) + } + candidates := make([]ranked, 0, len(pool)) + switch { + case degreeByID != nil: + for _, n := range pool { + r := degreeByID[n.ID] + candidates = append(candidates, ranked{node: n, fanIn: r.InCount}) + } + default: + if agg, ok := s.graph.(graph.NodeDegreeAggregator); ok && len(pool) > 0 { + ids := make([]string, 0, len(pool)) + byID := make(map[string]*graph.Node, len(pool)) + for _, n := range pool { + ids = append(ids, n.ID) + byID[n.ID] = n + } + for _, r := range agg.NodeDegreeCounts(ids, nil) { + n := byID[r.NodeID] + if n == nil { + continue + } + candidates = append(candidates, ranked{node: n, fanIn: r.InCount}) + } + } else { + for _, n := range pool { + candidates = append(candidates, ranked{node: n, fanIn: len(s.graph.GetInEdges(n.ID))}) + } + } } sort.Slice(candidates, func(i, j int) bool { return candidates[i].fanIn > candidates[j].fanIn diff --git a/internal/mcp/tools_nav.go b/internal/mcp/tools_nav.go index 88a6dc34..97e118b0 100644 --- a/internal/mcp/tools_nav.go +++ b/internal/mcp/tools_nav.go @@ -272,7 +272,7 @@ func navNeighbours(eng engineLike, edges []*graph.Edge, kind graph.EdgeKind, for } else { id = e.From } - if seen[id] || strings.HasPrefix(id, "unresolved::") || strings.HasPrefix(id, "external::") { + if seen[id] || graph.IsUnresolvedTarget(id) || strings.HasPrefix(id, "external::") { continue } n := eng.GetSymbol(id) diff --git a/internal/mcp/tools_nav_test.go b/internal/mcp/tools_nav_test.go index 363ce6ac..d539205c 100644 --- a/internal/mcp/tools_nav_test.go +++ b/internal/mcp/tools_nav_test.go @@ -22,7 +22,7 @@ import ( // setupNavServer indexes a Go source with a deeper call graph and a type // carrying several methods, so the nav tool's into / up / sibling moves // have real candidates to choose between. -func setupNavServer(t *testing.T) (*Server, *graph.Graph) { +func setupNavServer(t *testing.T) (*Server, graph.Store) { t.Helper() dir := t.TempDir() src := `package svc @@ -73,7 +73,7 @@ func navResult(t *testing.T, result *mcplib.CallToolResult) map[string]any { } // navFindMethod returns the graph ID of a method named `name`. -func navFindMethod(t *testing.T, g *graph.Graph, name string) string { +func navFindMethod(t *testing.T, g graph.Store, name string) string { t.Helper() for _, n := range g.AllNodes() { if n.Name == name && (n.Kind == graph.KindMethod || n.Kind == graph.KindFunction) { diff --git a/internal/mcp/tools_outline.go b/internal/mcp/tools_outline.go index dbf6b0b5..e0e1e5f6 100644 --- a/internal/mcp/tools_outline.go +++ b/internal/mcp/tools_outline.go @@ -2,6 +2,7 @@ package mcp import ( "context" + "maps" "sort" "github.com/mark3labs/mcp-go/mcp" @@ -35,10 +36,18 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque // outline is byte-identical to the legacy global view. inScope is // the node-ID set used to bound the edge-driven and analyzer-driven // sections; nil for an unbound session means "no filter". - scoped := s.scopedNodes(ctx) _, _, bound := s.sessionScope(ctx) + + // Pull the full scoped node slice only when the session is bound + // — the lang count, total-node count, and edge filter need it then. + // Unbound sessions get the same numbers from the backend's cached + // Stats() (one indexed groupby on disk backends) and the + // callable-only entry-point pass, neither of which materialises + // the whole node table over cgo. + var scoped []*graph.Node var inScope map[string]bool if bound { + scoped = s.scopedNodes(ctx) inScope = make(map[string]bool, len(scoped)) for _, n := range scoped { inScope[n.ID] = true @@ -52,10 +61,20 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque Nodes int `json:"nodes"` } langCounts := make(map[string]int) - for _, n := range scoped { - if n.Language != "" { - langCounts[n.Language]++ + totalScopedNodes := 0 + if bound { + for _, n := range scoped { + if n.Language != "" { + langCounts[n.Language]++ + } } + totalScopedNodes = len(scoped) + } else { + // Unbound: Stats().ByLanguage already aggregates this server- + // side; the cgo cost is one GROUP BY instead of one row per node. + stats := s.graph.Stats() + maps.Copy(langCounts, stats.ByLanguage) + totalScopedNodes = stats.TotalNodes } var languages []langEntry for name, n := range langCounts { @@ -76,16 +95,23 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque } // Edge count, bounded to edges whose endpoints are both in scope. + // Unbound sessions never set inScope, so the count is exactly + // the backend's EdgeCount() — an O(1) lookup that skips + // materialising every edge over cgo. totalEdges := 0 - for _, e := range s.graph.AllEdges() { - if inScope != nil && (!inScope[e.From] || !inScope[e.To]) { - continue + if inScope == nil { + totalEdges = s.graph.EdgeCount() + } else { + for _, e := range s.graph.AllEdges() { + if !inScope[e.From] || !inScope[e.To] { + continue + } + totalEdges++ } - totalEdges++ } summary := map[string]any{ - "total_nodes": len(scoped), + "total_nodes": totalScopedNodes, "total_edges": totalEdges, "primary_language": primaryLang, "languages": languages, @@ -126,7 +152,7 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque // threshold to ensure we get the top N regardless of repo size. // Post-filtered to the session's workspace. hotspotsSection := []map[string]any{} - hs := analysis.FindHotspots(s.graph, s.getCommunities(), 0) + hs := s.getHotspots() for _, h := range hs { if len(hotspotsSection) >= topHotspotsN { break @@ -150,7 +176,7 @@ func (s *Server) handleGetRepoOutline(ctx context.Context, req mcp.CallToolReque "communities": communitiesSection, "hotspots": hotspotsSection, "most_imported_files": mostImportedFiles(s.graph, inScope, topMostImportedN), - "entry_points": entryPoints(scoped, topEntryPointsN), + "entry_points": entryPoints(s.graph, inScope, topEntryPointsN), }) } @@ -176,31 +202,55 @@ func topCommunitiesSummary(comms []analysis.Community) []map[string]any { // "here's where the gravity lives" signal for newcomers. // inScope, when non-nil, bounds the ranking to imports whose target // node is inside the session's workspace. -func mostImportedFiles(g *graph.Graph, inScope map[string]bool, topN int) []map[string]any { +// +// Picks the FileImportAggregator capability when the backend +// implements it (one Cypher GROUP BY ships back the per-file count +// instead of materialising every edge over cgo just to bucket). +// Falls back to the AllEdges-driven loop on backends that don't. +func mostImportedFiles(g graph.Store, inScope map[string]bool, topN int) []map[string]any { type fileCount struct { path string count int } counts := make(map[string]int) - for _, e := range g.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } - target := g.GetNode(e.To) - if target == nil { - continue + if ag, ok := g.(graph.FileImportAggregator); ok { + var scope []string + if inScope != nil { + scope = make([]string, 0, len(inScope)) + for id := range inScope { + scope = append(scope, id) + } + // An empty inScope means "nothing matches" — the + // aggregator contract maps that to nil so we never + // fire a whole-graph Cypher scan on a bound session. + if len(scope) == 0 { + scope = []string{} + } } - if inScope != nil && !inScope[target.ID] { - continue + for _, r := range ag.FileImportCounts(scope) { + counts[r.FilePath] = r.Count } - // Aggregate at the file level. For Import-kind nodes the node's - // FilePath is the file being imported; for File-kind nodes the - // ID is already the path. - path := target.FilePath - if path == "" { - path = target.ID + } else { + for _, e := range g.AllEdges() { + if e.Kind != graph.EdgeImports { + continue + } + target := g.GetNode(e.To) + if target == nil { + continue + } + if inScope != nil && !inScope[target.ID] { + continue + } + // Aggregate at the file level. For Import-kind nodes the node's + // FilePath is the file being imported; for File-kind nodes the + // ID is already the path. + path := target.FilePath + if path == "" { + path = target.ID + } + counts[path]++ } - counts[path]++ } var ranked []fileCount @@ -231,18 +281,28 @@ func mostImportedFiles(g *graph.Graph, inScope map[string]bool, topN int) []map[ // (the Go / Rust / C convention) and top-level functions with no callers // in files named `main.*` or `cmd/**`. Good enough for the outline; a // fuller process-based walk is what `get_processes` does separately. -func entryPoints(nodes []*graph.Node, topN int) []map[string]any { +// +// Lookup goes through FindNodesByName so the name index runs server- +// side on disk backends — the legacy nodes-slice walk pulled the whole +// node table just to keep the ~10 nodes literally named "main". When +// an inScope filter is supplied (bound session), it's applied after +// the name lookup so a bound session never sees mains from other +// workspaces. +func entryPoints(g graph.Store, inScope map[string]bool, topN int) []map[string]any { type ep struct { id string name string filePath string } var out []ep - for _, n := range nodes { + for _, n := range g.FindNodesByName("main") { + if n == nil { + continue + } if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue } - if n.Name != "main" { + if inScope != nil && !inScope[n.ID] { continue } out = append(out, ep{id: n.ID, name: n.Name, filePath: n.FilePath}) diff --git a/internal/mcp/tools_releases_test.go b/internal/mcp/tools_releases_test.go new file mode 100644 index 00000000..61ca593c --- /dev/null +++ b/internal/mcp/tools_releases_test.go @@ -0,0 +1,119 @@ +package mcp + +import ( + "context" + "encoding/json" + "testing" + + "github.com/mark3labs/mcp-go/mcp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// seedReleasesGraph populates the graph with a KindRelease timeline +// and a couple of file nodes whose meta.added_in maps onto the +// releases. Mirrors what releases.EnrichGraphForBranch would have +// written; lets the read-side handler be tested without a real git +// repo. +func seedReleasesGraph(t *testing.T) *Server { + t.Helper() + g := graph.New() + g.AddNode(&graph.Node{ + ID: "release::v0.1", + Kind: graph.KindRelease, + Name: "v0.1", + Meta: map[string]any{ + "tag": "v0.1", + "file_count": 1, + "order": 0, + }, + }) + g.AddNode(&graph.Node{ + ID: "release::v0.2", + Kind: graph.KindRelease, + Name: "v0.2", + Meta: map[string]any{ + "tag": "v0.2", + "file_count": 2, + "order": 1, + }, + }) + g.AddNode(&graph.Node{ + ID: "a.go", Kind: graph.KindFile, FilePath: "a.go", + Meta: map[string]any{"added_in": "v0.1"}, + }) + g.AddNode(&graph.Node{ + ID: "b.go", Kind: graph.KindFile, FilePath: "b.go", + Meta: map[string]any{"added_in": "v0.2"}, + }) + return &Server{ + graph: g, + session: newSessionState(), + tokenStats: &tokenStats{}, + symHistory: &symbolHistory{entries: make(map[string][]SymbolModification)}, + sessions: newSessionMap(), + toolScopes: newScopeRegistry(), + } +} + +func callAnalyzeReleases(t *testing.T, s *Server, args map[string]any) map[string]any { + t.Helper() + req := mcp.CallToolRequest{} + req.Params.Arguments = args + res, err := s.handleAnalyzeReleases(context.Background(), req) + require.NoError(t, err) + require.NotNil(t, res) + tc, ok := res.Content[0].(mcp.TextContent) + require.True(t, ok) + var m map[string]any + require.NoError(t, json.Unmarshal([]byte(tc.Text), &m)) + return m +} + +func TestAnalyzeReleases_Timeline(t *testing.T) { + s := seedReleasesGraph(t) + out := callAnalyzeReleases(t, s, map[string]any{}) + releases, _ := out["releases"].([]any) + require.Len(t, releases, 2) + first := releases[0].(map[string]any) + assert.Equal(t, "v0.1", first["tag"], "ordered by Meta.order asc — oldest first") + assert.EqualValues(t, 0, first["order"]) + assert.EqualValues(t, 1, first["file_count"]) +} + +func TestAnalyzeReleases_TagFilterReturnsFiles(t *testing.T) { + s := seedReleasesGraph(t) + out := callAnalyzeReleases(t, s, map[string]any{"tag": "v0.2"}) + releases, _ := out["releases"].([]any) + require.Len(t, releases, 1) + first := releases[0].(map[string]any) + files, _ := first["files"].([]any) + require.Len(t, files, 1) + assert.Equal(t, "b.go", files[0]) + assert.EqualValues(t, 1, out["file_hits"]) +} + +func TestAnalyzeReleases_TagFilterUnknownTag(t *testing.T) { + s := seedReleasesGraph(t) + out := callAnalyzeReleases(t, s, map[string]any{"tag": "v99"}) + require.NotEmpty(t, out["error"]) + assert.Equal(t, "enrich_releases", out["suggestion"]) +} + +func TestAnalyzeReleases_ErrorsWhenNoMeta(t *testing.T) { + g := graph.New() + g.AddNode(&graph.Node{ID: "x.go", Kind: graph.KindFile, FilePath: "x.go"}) + s := &Server{ + graph: g, + session: newSessionState(), + tokenStats: &tokenStats{}, + symHistory: &symbolHistory{entries: make(map[string][]SymbolModification)}, + sessions: newSessionMap(), + toolScopes: newScopeRegistry(), + } + out := callAnalyzeReleases(t, s, map[string]any{}) + require.NotEmpty(t, out["error"]) + assert.Equal(t, "enrich_releases", out["suggestion"]) +} diff --git a/internal/mcp/tools_replay_episode.go b/internal/mcp/tools_replay_episode.go index 4eed9358..1213b78f 100644 --- a/internal/mcp/tools_replay_episode.go +++ b/internal/mcp/tools_replay_episode.go @@ -137,9 +137,17 @@ func (s *Server) replayTimeline(radius map[string]int, windowDays, limit int) [] if windowDays > 0 { cutoff = time.Now().Add(-time.Duration(windowDays) * 24 * time.Hour) } + // Batch-fetch every node in the radius; the radius is the BFS + // frontier (often hundreds of IDs), and per-id GetNode on Ladybug + // would issue that many cgo round-trips per replay call. + ids := make([]string, 0, len(radius)) + for id := range radius { + ids = append(ids, id) + } + nodeByID := s.graph.GetNodesByIDs(ids) rows := make([]replayTimelineRow, 0, len(radius)) for id := range radius { - n := s.graph.GetNode(id) + n := nodeByID[id] if n == nil { continue } @@ -197,12 +205,23 @@ func (s *Server) replayTimeline(radius map[string]int, windowDays, limit int) [] } func (s *Server) replayCallers(radius map[string]int, anchor string, limit int) []replayCallerRow { + // Batch-fetch the radius minus the anchor; same rationale as + // replayTimeline — per-id GetNode on Ladybug cost one cgo call + // per BFS node. + ids := make([]string, 0, len(radius)) + for id := range radius { + if id == anchor { + continue + } + ids = append(ids, id) + } + nodeByID := s.graph.GetNodesByIDs(ids) rows := make([]replayCallerRow, 0, len(radius)) for id, d := range radius { if id == anchor { continue } - n := s.graph.GetNode(id) + n := nodeByID[id] if n == nil { continue } @@ -226,9 +245,15 @@ func (s *Server) replayCallers(radius map[string]int, anchor string, limit int) } func (s *Server) replayCoverageGaps(radius map[string]int, limit int) []replayCoverageRow { + // Batch-fetch the radius — same rationale as replayTimeline. + ids := make([]string, 0, len(radius)) + for id := range radius { + ids = append(ids, id) + } + nodeByID := s.graph.GetNodesByIDs(ids) rows := make([]replayCoverageRow, 0) for id := range radius { - n := s.graph.GetNode(id) + n := nodeByID[id] if n == nil { continue } diff --git a/internal/mcp/tools_safe_delete.go b/internal/mcp/tools_safe_delete.go index 3f9b73a7..fb848c5d 100644 --- a/internal/mcp/tools_safe_delete.go +++ b/internal/mcp/tools_safe_delete.go @@ -363,7 +363,7 @@ func expandDeleteRange(node *graph.Node, lines []string) (int, int) { // target. Iteration is bounded by cascadeIterationCap; if hit, the // caller surfaces cascade_truncated so the agent knows the closure // may be incomplete. -func computeCascadeClosure(g *graph.Graph, target *graph.Node, cascadeIntoTests bool) ([]cascadeClosureEntry, bool) { +func computeCascadeClosure(g graph.Store, target *graph.Node, cascadeIntoTests bool) ([]cascadeClosureEntry, bool) { closure := []cascadeClosureEntry{} inClosure := map[string]bool{target.ID: true} reasons := map[string]string{} @@ -423,7 +423,7 @@ func computeCascadeClosure(g *graph.Graph, target *graph.Node, cascadeIntoTests // collectCascadeCandidates returns every distinct node ID that an // in-closure node points at via a referencing edge — the only // possible new entrants to the closure on this iteration. -func collectCascadeCandidates(g *graph.Graph, inClosure map[string]bool) []string { +func collectCascadeCandidates(g graph.Store, inClosure map[string]bool) []string { seen := map[string]bool{} out := []string{} for from := range inClosure { @@ -448,7 +448,7 @@ func collectCascadeCandidates(g *graph.Graph, inClosure map[string]bool) []strin // reports whether the node has no caller outside the current // closure. Returns a human-readable reason string when the node // qualifies (used for the response payload). -func candidateQualifies(g *graph.Graph, cn *graph.Node, inClosure map[string]bool, cascadeIntoTests bool) (string, bool) { +func candidateQualifies(g graph.Store, cn *graph.Node, inClosure map[string]bool, cascadeIntoTests bool) (string, bool) { targetWS := "" // Build an "in-closure caller" list so the reason string can // name the symbol(s) that are the only ones still calling this @@ -540,7 +540,7 @@ func workspaceKey(n *graph.Node) string { // represents real use (someone calls, implements, extends, or // references this symbol). Structural edges (defines, member_of) // are excluded because they don't block a delete. -func collectReferencingEdges(g *graph.Graph, id string) []safeDeleteReference { +func collectReferencingEdges(g graph.Store, id string) []safeDeleteReference { out := make([]safeDeleteReference, 0) seen := map[string]bool{} for _, e := range g.GetInEdges(id) { diff --git a/internal/mcp/tools_search_assist.go b/internal/mcp/tools_search_assist.go index 42c56067..0ded7fbc 100644 --- a/internal/mcp/tools_search_assist.go +++ b/internal/mcp/tools_search_assist.go @@ -3,6 +3,7 @@ package mcp import ( "context" "strings" + "time" mcpgo "github.com/mark3labs/mcp-go/mcp" @@ -149,25 +150,72 @@ func expandSearchTerms(ctx context.Context, s *Server, query string) []string { return res.Terms } -// fetchAndMergeBM25 runs BM25 once per term (original + expansions), -// then folds the results into a single deduplicated slice. The -// original query's hits win position; expansion hits append in their -// own BM25 order with duplicates skipped. +// fetchAndMergeBM25 fires (at most) two BM25 calls — one for the +// primary query alone (so we can attribute primaryCount honestly for +// the debug surface) and one for the combined OR-merge of every +// expansion term — then folds the results into a single deduplicated +// slice. The original query's hits win position; the combined- +// expansion hits append in their own BM25 order with duplicates +// skipped. // -// fetchLimit is the per-term over-fetch budget. Bounded by the caller -// so a wide expansion can't blow up the candidate pool. +// Both BM25 backends (BM25Backend and Ladybug's FTS via +// QUERY_FTS_INDEX) treat a multi-token query as an OR-style union +// with a single global BM25 score, so one combined call replaces +// the prior N per-term fan-out (the N+1 round-trip pattern dominated +// the search hot path on disk backends). +// +// A per-fragment exact-name rescue runs after the combined call — +// one batched FindNodesByNames on the engine's reader. This +// preserves the per-term behaviour where a fragment like +// "BillingInvoice" finds its exact-name node even when BM25 +// tokenisation drops the PascalCase concatenation. +// +// fetchLimit caps each call so a wide expansion can't blow up the +// candidate pool. // // primaryCount is the size of the original-query BM25 result before -// merging; useful for diagnostic / debug surfaces that want to show -// how many candidates expansion contributed. +// merging — surfaced on the assist debug field so callers can see how +// much expansion contributed. func fetchAndMergeBM25(eng *query.Engine, original string, expanded []string, fetchLimit int, scope query.QueryOptions) (merged []*graph.Node, primaryCount int) { + return fetchAndMergeBM25Timed(eng, original, expanded, fetchLimit, scope, nil) +} + +// fetchAndMergeBM25Timed is fetchAndMergeBM25 with per-phase wall-clock +// breakdowns. The MCP handler hands a fresh SearchTimings struct so +// the resulting Debug log line attributes BM25 time honestly across +// the primary call and the combined-expansion call. Pass nil to skip +// instrumentation (e.g. unit tests that don't care). +func fetchAndMergeBM25Timed(eng *query.Engine, original string, expanded []string, fetchLimit int, scope query.QueryOptions, timings *query.SearchTimings) (merged []*graph.Node, primaryCount int) { + // The merged candidate set is reranked by the handler with the + // full session-aware context; the per-call inner rerank inside + // SearchSymbolsRanked would be wasted work whose output the + // merge discards. SkipInnerRerank collapses the N+1 engine + // rerank invocations to zero — drops ~150-300ms per call on + // Ladybug (each inner rerank's Context.prepare costs at minimum + // two batched edge fetches when the bundle cache misses). + scope.SkipInnerRerank = true + primaryStart := time.Now() primary := eng.SearchSymbolsScoped(original, fetchLimit, scope) primaryCount = len(primary) - if len(expanded) == 0 { + if timings != nil { + timings.BM25PrimaryMS += time.Since(primaryStart).Milliseconds() + } + + // Trim and de-empty the expansion list. When nothing useful + // survives we skip the combined call entirely. + cleanedExpansion := make([]string, 0, len(expanded)) + for _, t := range expanded { + t = strings.TrimSpace(t) + if t != "" { + cleanedExpansion = append(cleanedExpansion, t) + } + } + if len(cleanedExpansion) == 0 { return primary, primaryCount } - seen := make(map[string]bool, len(primary)) - merged = make([]*graph.Node, 0, len(primary)) + + seen := make(map[string]bool, len(primary)+fetchLimit) + merged = make([]*graph.Node, 0, len(primary)+fetchLimit) for _, n := range primary { if seen[n.ID] { continue @@ -175,23 +223,83 @@ func fetchAndMergeBM25(eng *query.Engine, original string, expanded []string, fe seen[n.ID] = true merged = append(merged, n) } - for _, term := range expanded { - term = strings.TrimSpace(term) - if term == "" { + + // Combined OR-merge: pass every expansion term — concatenated by + // whitespace — as ONE BM25 call. Tokenisation + IDF scoring run + // once across the whole bag of terms instead of N times. + // + // The concatenated bag of terms is never going to match any + // node's literal Name, so the engine's exact-name splice would + // pay a guaranteed-empty FindNodesByName Cypher round-trip every + // fan-out. SkipExactNameSplice tells gatherBackendCandidates to + // skip it — the per-fragment exact-name rescue below covers the + // load-bearing PascalCase-fragment case the splice was insuring + // against, so dropping the round-trip is safe. + combined := strings.Join(cleanedExpansion, " ") + expansionScope := scope + expansionScope.SkipExactNameSplice = true + expansionStart := time.Now() + extra := eng.SearchSymbolsScoped(combined, fetchLimit, expansionScope) + if timings != nil { + timings.BM25ExpansionMS += time.Since(expansionStart).Milliseconds() + } + for _, n := range extra { + if seen[n.ID] { continue } - extra := eng.SearchSymbolsScoped(term, fetchLimit, scope) - for _, n := range extra { - if seen[n.ID] { - continue + seen[n.ID] = true + merged = append(merged, n) + } + + // Per-fragment exact-name union — cheap (one name-bucket lookup + // per term on in-memory, a single `WHERE name IN $names` Cypher + // round-trip on Ladybug via FindNodesByNames). Preserves the + // per-term behaviour where a fragment like "BillingInvoice" + // finds its exact-name node even when BM25 tokenisation misses + // the PascalCase concatenated token. Without this rescue, + // soup-split mode silently dropped exact matches that the + // per-term loop used to surface via the engine's FindNodesByName + // fallback. + if rdr, ok := graphReaderFromEngine(eng); ok { + nameMap := rdr.FindNodesByNames(cleanedExpansion) + for _, term := range cleanedExpansion { + for _, n := range nameMap[term] { + if n == nil || seen[n.ID] { + continue + } + if n.Kind == graph.KindFile || n.Kind == graph.KindImport { + continue + } + if scope.WorkspaceID != "" && !scope.ScopeAllows(n) { + continue + } + seen[n.ID] = true + merged = append(merged, n) } - seen[n.ID] = true - merged = append(merged, n) } } return merged, primaryCount } +// graphReaderFromEngine returns the engine's underlying graph reader +// if it also exposes the batched FindNodesByNames method (every +// production backend does — in-memory, Ladybug, and OverlaidView via +// the layered base). Falls back to (nil, false) when an embedded +// test engine wires a stripped-down reader — the rescue step is then +// skipped, matching the contract that callers without a names-batch +// reader simply get the BM25-only result. +type namesReader interface { + FindNodesByNames(names []string) map[string][]*graph.Node +} + +func graphReaderFromEngine(eng *query.Engine) (namesReader, bool) { + if eng == nil { + return nil, false + } + r, ok := eng.Reader().(namesReader) + return r, ok +} + // rerankCap bounds how many candidates the rerank pass sees. The // model has limited working memory; past ~25 items its judgement // degrades and the prompt blows the assist context. Trailing diff --git a/internal/mcp/tools_search_assist_test.go b/internal/mcp/tools_search_assist_test.go index 69968ce9..e4e87e77 100644 --- a/internal/mcp/tools_search_assist_test.go +++ b/internal/mcp/tools_search_assist_test.go @@ -176,6 +176,84 @@ func TestFetchAndMergeBM25_DedupesAcrossTerms(t *testing.T) { assert.Equal(t, idsOf(primary), idsOf(merged)) } +// TestFetchAndMergeBM25_CombinedQueryUnionIsSuperset is the load-bearing +// guard for the "combine expansion terms into one BM25 query" +// optimisation. The merged result MUST contain at least every node +// that a per-term fan-out would have returned — otherwise switching +// from N BM25 calls to (primary + combined) drops candidates the +// rerank pipeline used to see. Exact-name rescue (the per-fragment +// FindNodesByNames step) is what makes this hold for tokenisation +// edge cases like PascalCase concatenated names that BM25 misses. +func TestFetchAndMergeBM25_CombinedQueryUnionIsSuperset(t *testing.T) { + srv, _ := setupTestServer(t) + scope := query.QueryOptions{} + + // Per-term fan-out (the OLD behaviour). For each fragment, run + // the engine search separately and collect every distinct node ID + // it surfaces — this is the worst-case "no candidate may be + // dropped by collapsing into one query" set. + terms := []string{"helper", "main"} + unionExpected := map[string]bool{} + for _, t := range terms { + for _, n := range srv.engine.SearchSymbolsScoped(t, 20, scope) { + unionExpected[n.ID] = true + } + } + require.NotEmpty(t, unionExpected, "per-term fan-out produced nothing — test corpus drifted") + + // New behaviour: primary + combined-OR + per-fragment exact-name + // rescue, all driven by fetchAndMergeBM25. + merged, _ := fetchAndMergeBM25(srv.engine, terms[0], terms[1:], 20, scope) + mergedSet := map[string]bool{} + for _, n := range merged { + mergedSet[n.ID] = true + } + + for id := range unionExpected { + require.True(t, mergedSet[id], "merged result missing per-term hit %q", id) + } +} + +// TestFetchAndMergeBM25_ExactNameRescuePreserved is the regression +// guard for the soup-mode + PascalCase fragment case that per-term +// fan-out used to handle implicitly. When BM25 tokenisation misses +// a fragment ("BillingInvoice" tokenises to one term `billinginvoice` +// which the camelCase-split index doesn't carry), the per-fragment +// FindNodesByNames rescue MUST still surface its exact-name node. +// This mirrors the failure mode TestSearchSymbols_PathScoping caught +// when soup-split fragments first went through the combined query +// path. +func TestFetchAndMergeBM25_ExactNameRescuePreserved(t *testing.T) { + srv, _ := setupTestServer(t) + + // The test corpus carries no PascalCase-concatenated names by + // default, so add three synthetic ones — these never reach BM25 + // (we don't re-index it for the test) but they are what the + // rescue step has to surface. + for path, name := range map[string]string{ + "svc/billing/Invoice.go": "BillingInvoice", + "svc/auth/Login.go": "AuthLogin", + "libs/money/Amount.go": "MoneyAmount", + } { + id := path + "::" + name + srv.graph.AddNode(&graph.Node{ + ID: id, Kind: graph.KindFunction, Name: name, + FilePath: path, StartLine: 1, EndLine: 5, Language: "go", + }) + } + + terms := []string{"BillingInvoice", "AuthLogin", "MoneyAmount"} + merged, _ := fetchAndMergeBM25(srv.engine, terms[0], terms[1:], 20, query.QueryOptions{}) + + mergedNames := map[string]bool{} + for _, n := range merged { + mergedNames[n.Name] = true + } + for _, want := range terms { + require.True(t, mergedNames[want], "exact-name rescue dropped %q from merged result", want) + } +} + // TestFetchAndMergeBM25_AppendsNewMatches verifies that expansion // terms bring in additional candidates the primary term missed. func TestFetchAndMergeBM25_AppendsNewMatches(t *testing.T) { diff --git a/internal/mcp/tools_search_fast_path_test.go b/internal/mcp/tools_search_fast_path_test.go new file mode 100644 index 00000000..6ff98ca1 --- /dev/null +++ b/internal/mcp/tools_search_fast_path_test.go @@ -0,0 +1,207 @@ +package mcp + +import ( + "context" + "encoding/json" + "sync/atomic" + "testing" + + mcplib "github.com/mark3labs/mcp-go/mcp" + "github.com/stretchr/testify/require" + "go.uber.org/zap" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/query" + "github.com/zzet/gortex/internal/search" +) + +// recordingBackend is a search.Backend that counts how many times the +// engine called into Search, VectorChannelOnly, and +// SearchSymbolBundles. The identifier-shape fast path test reads these +// counters to assert the handler skipped the vector channel and skipped +// the combined-OR fan-out. +// +// Implements search.Backend, search.ChannelSearcher, +// search.SymbolBundleSearcherBackend, and the VectorChannelOnly +// duck-typed interface the engine queries on the bundle-bypass path. +type recordingBackend struct { + hits []search.SearchResult + nodes map[string]*graph.Node + searchCalls atomic.Int32 + bundleCalls atomic.Int32 + vectorOnlyCalls atomic.Int32 + channelCalls atomic.Int32 + queriesMu atomic.Pointer[[]string] +} + +func newRecordingBackend(nodes map[string]*graph.Node, hits []search.SearchResult) *recordingBackend { + rb := &recordingBackend{hits: hits, nodes: nodes} + empty := []string{} + rb.queriesMu.Store(&empty) + return rb +} + +func (rb *recordingBackend) recordQuery(q string) { + for { + oldPtr := rb.queriesMu.Load() + newList := append([]string(nil), *oldPtr...) + newList = append(newList, q) + if rb.queriesMu.CompareAndSwap(oldPtr, &newList) { + return + } + } +} + +func (rb *recordingBackend) queries() []string { + return *rb.queriesMu.Load() +} + +func (rb *recordingBackend) Add(id string, fields ...string) {} +func (rb *recordingBackend) Remove(id string) {} +func (rb *recordingBackend) Count() int { return len(rb.hits) } +func (rb *recordingBackend) Close() {} + +func (rb *recordingBackend) Search(query string, limit int) []search.SearchResult { + rb.searchCalls.Add(1) + rb.recordQuery(query) + return rb.hits +} + +func (rb *recordingBackend) SearchChannels(query string, limit int) ([]search.SearchResult, []string) { + rb.channelCalls.Add(1) + rb.recordQuery(query) + return rb.hits, nil +} + +func (rb *recordingBackend) VectorChannelOnly(query string, limit int) ([]string, search.ChannelTimings) { + rb.vectorOnlyCalls.Add(1) + return nil, search.ChannelTimings{} +} + +// SearchSymbolBundles satisfies the bundle interface so the engine +// takes the bundle fast path on this backend. Edges are nil — the +// rerank tolerates an empty edge cache (it'll fall back to per-node +// fetches via Graph, but for the test we just care that the call +// signature flows through). +func (rb *recordingBackend) SearchSymbolBundles(query string, limit int) []search.SymbolBundle { + rb.bundleCalls.Add(1) + rb.recordQuery(query) + if len(rb.hits) == 0 { + return nil + } + out := make([]search.SymbolBundle, 0, len(rb.hits)) + for _, h := range rb.hits { + n := rb.nodes[h.ID] + if n == nil { + continue + } + out = append(out, search.SymbolBundle{Node: n, Score: h.Score}) + } + return out +} + +// identifierFastPathTestServer wires a Server around the recording backend so a +// search_symbols call can be inspected for vector / expansion fan-out +// activity. +func identifierFastPathTestServer(t *testing.T, names []string) (*Server, *recordingBackend) { + t.Helper() + g := graph.New() + nodes := make(map[string]*graph.Node, len(names)) + hits := make([]search.SearchResult, 0, len(names)) + for i, n := range names { + id := "pkg/" + n + ".go::" + n + node := &graph.Node{ + ID: id, Kind: graph.KindFunction, Name: n, + FilePath: "pkg/" + n + ".go", StartLine: i + 1, EndLine: i + 5, Language: "go", + } + g.AddNode(node) + nodes[id] = node + hits = append(hits, search.SearchResult{ID: id, Score: 1.0 / float64(i+1)}) + } + rb := newRecordingBackend(nodes, hits) + eng := query.NewEngine(g) + eng.SetSearch(rb) + srv := NewServer(eng, g, nil, nil, zap.NewNop(), nil) + srv.RunAnalysis() + return srv, rb +} + +// TestSearchSymbols_IdentifierFastPath_SkipsVectorAndExpansion is the +// behavioural guard for the QueryClassSymbol / Path / Signature fast +// path. Three contracts must hold: +// +// 1. The vector channel (VectorChannelOnly on the bundle path, +// SearchChannels on the legacy path) is NEVER called. +// 2. Only the primary query reaches the backend — no combined-OR +// fan-out gets emitted (no second Search / Bundle call carrying +// a concatenated expansion-term string). +// 3. The query_class echoed back in the response matches what the +// handler actually treated the query as. +// +// "NewServer" is the canonical identifier-shape probe (PascalCase, no +// whitespace, no separator) — classifies as QueryClassSymbol. +func TestSearchSymbols_IdentifierFastPath_SkipsVectorAndExpansion(t *testing.T) { + srv, rb := identifierFastPathTestServer(t, []string{"NewServer", "NewClient", "StartServer", "Server"}) + + req := mcplib.CallToolRequest{} + req.Params.Name = "search_symbols" + req.Params.Arguments = map[string]any{"query": "NewServer", "limit": 10} + res, err := srv.handleSearchSymbols(context.Background(), req) + require.NoError(t, err) + require.False(t, res.IsError, "search errored: %v", res.Content) + + // Contract 1: no vector channel call. The bundle path's + // VectorChannelOnly is the production-shape probe; SearchChannels + // is the legacy fallback. Neither may fire for an identifier query. + require.Equal(t, int32(0), rb.vectorOnlyCalls.Load(), + "identifier fast path must not call VectorChannelOnly; queries=%v", rb.queries()) + require.Equal(t, int32(0), rb.channelCalls.Load(), + "identifier fast path must not call SearchChannels; queries=%v", rb.queries()) + + // Contract 2: only the primary query reaches the backend. Bundle + // path: one call to SearchSymbolBundles with the bare query. + // Fallback Search may also fire (zero candidates → fallback tier), + // but the combined-OR expansion call is the regression to guard + // against — no Search/Bundle query carries a multi-token expansion + // payload like "NewServer StartServer Server …". + require.Equal(t, int32(1), rb.bundleCalls.Load(), + "primary bundle call should fire exactly once; queries=%v", rb.queries()) + for _, q := range rb.queries() { + require.Equal(t, "NewServer", q, + "only the original query is allowed to reach the backend on the identifier fast path; saw %q in %v", q, rb.queries()) + } + + // Contract 3: response echoes the class. + var resp map[string]any + require.NoError(t, json.Unmarshal([]byte(res.Content[0].(mcplib.TextContent).Text), &resp)) + require.Equal(t, "symbol", resp["query_class"], + "response must echo the classified query_class") +} + +// TestSearchSymbols_ConceptQuery_DoesNotEngageFastPath is the negative +// guard: a natural-language query (concept class) keeps the legacy +// pipeline — vector channel allowed, expansion allowed. Without this +// the fast-path optimisation could silently swallow concept queries. +func TestSearchSymbols_ConceptQuery_DoesNotEngageFastPath(t *testing.T) { + srv, rb := identifierFastPathTestServer(t, []string{"AuthMiddleware", "ValidateToken", "ParseConfig", "Helper"}) + + req := mcplib.CallToolRequest{} + req.Params.Name = "search_symbols" + // Multi-word natural-language query → QueryClassConcept. + req.Params.Arguments = map[string]any{"query": "where do we validate the user token auth", "limit": 10} + res, err := srv.handleSearchSymbols(context.Background(), req) + require.NoError(t, err) + require.False(t, res.IsError, "search errored: %v", res.Content) + + // Concept queries MUST still let the engine fan out to the vector + // channel — the bundle's VectorChannelOnly call fires on the + // bundle hot path. Anything that prevented this would silently + // downgrade the natural-language search experience. + require.GreaterOrEqual(t, rb.vectorOnlyCalls.Load(), int32(1), + "concept query must still pull the vector channel; queries=%v", rb.queries()) + + var resp map[string]any + require.NoError(t, json.Unmarshal([]byte(res.Content[0].(mcplib.TextContent).Text), &resp)) + require.Equal(t, "concept", resp["query_class"], + "NL query must classify as concept") +} diff --git a/internal/mcp/tools_suggest_queries.go b/internal/mcp/tools_suggest_queries.go index f3f59509..deb16e91 100644 --- a/internal/mcp/tools_suggest_queries.go +++ b/internal/mcp/tools_suggest_queries.go @@ -78,7 +78,7 @@ func (s *Server) buildSuggestedQueries(scoped []*graph.Node, inScope map[string] } // 1. Entry points — where the program starts executing. - for i, ep := range entryPoints(scoped, 3) { + for i, ep := range entryPoints(s.graph, inScope, 3) { if i >= 2 { break } @@ -90,27 +90,42 @@ func (s *Server) buildSuggestedQueries(scoped []*graph.Node, inScope map[string] // and by how many of those edges cross a community boundary. Done // directly off the graph rather than via FindHotspots, whose // mean+2σ threshold returns nothing on small repositories. + // + // EdgesByKind streams from the storage layer (one Cypher per kind + // on Ladybug, an indexed bucket scan in-memory) so the cost is + // O(call+reference edges) once — replacing the per-node + // GetInEdges loop that was N cgo round-trips materialising the + // full in-edge bucket per candidate. nodeToComm := map[string]string{} if comms := s.getCommunities(); comms != nil { nodeToComm = comms.NodeToComm } - var stats []symbolStat + statByID := make(map[string]*symbolStat, len(scoped)) + stats := make([]symbolStat, 0, len(scoped)) for _, n := range scoped { if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod && n.Kind != graph.KindType { continue } - st := symbolStat{node: n} - myComm := nodeToComm[n.ID] - for _, e := range s.graph.GetInEdges(n.ID) { - if e.Kind != graph.EdgeCalls && e.Kind != graph.EdgeReferences { + stats = append(stats, symbolStat{node: n}) + } + for i := range stats { + statByID[stats[i].node.ID] = &stats[i] + } + for _, k := range []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} { + for e := range s.graph.EdgesByKind(k) { + if e == nil { + continue + } + st, ok := statByID[e.To] + if !ok { continue } st.fanIn++ + myComm := nodeToComm[e.To] if c := nodeToComm[e.From]; myComm != "" && c != "" && c != myComm { st.crossings++ } } - stats = append(stats, st) } // 2. Bridges — symbols pulled at from the most other subsystems. diff --git a/internal/mcp/tools_surprising.go b/internal/mcp/tools_surprising.go index 9a65c196..883d5b2c 100644 --- a/internal/mcp/tools_surprising.go +++ b/internal/mcp/tools_surprising.go @@ -61,27 +61,55 @@ func (s *Server) handleGetSurprisingConnections(ctx context.Context, req mcp.Cal nodeToComm = cr.NodeToComm } - // Build a fast scoped-node index and an in-edge counter for - // the hub check. Counting once is cheaper than calling - // GetInEdges per edge. + // Build a fast scoped-node index. We still need ALL kinds here — + // edges in the surprise tally can land on any node, not just + // function/method. Use scopedNodes' single bulk pull rather than + // the per-edge GetNode lookups the legacy path fell back to. scopedSet := make(map[string]*graph.Node, 1024) for _, n := range s.scopedNodes(ctx) { scopedSet[n.ID] = n } - allEdges := s.graph.AllEdges() - inDegree := make(map[string]int, len(scopedSet)) + // Kind tally — short-circuit the AllEdges scan when the backend + // implements EdgeKindCounter (returns one row per distinct kind, + // not one per edge — a few-dozen-row response replaces a ~286k + // edge round-trip on Ladybug). The total edge count then comes + // from the per-kind sum so we don't need a second backend call. kindCounts := make(map[graph.EdgeKind]int, 16) + totalEdges := 0 + var allEdges []*graph.Edge + if counter, ok := s.graph.(graph.EdgeKindCounter); ok { + for k, c := range counter.EdgeKindCounts() { + kindCounts[k] = c + totalEdges += c + } + } else { + allEdges = s.graph.AllEdges() + for _, e := range allEdges { + kindCounts[e.Kind]++ + } + totalEdges = len(allEdges) + } + + // In-degree still walks edges Go-side — the per-edge anomaly walk + // further down already pulls the full edge stream, so bucketing + // fan-in during that traversal is free. The InDegreeForNodes + // capability runs one COUNT { … } per id; on the gortex workspace + // the scoped set is ~30k function/method nodes, and tens of + // thousands of indexed subqueries are noticeably slower than the + // single AllEdges materialisation the anomaly walk already pays. + if allEdges == nil { + allEdges = s.graph.AllEdges() + } + inDegree := make(map[string]int, len(scopedSet)) for _, e := range allEdges { if _, ok := scopedSet[e.To]; ok { inDegree[e.To]++ } - kindCounts[e.Kind]++ } // Determine which edge kinds are "unusual" — share of total // edges is at or below rare_kind_pct. Recomputed once per call. - totalEdges := len(allEdges) rareKinds := make(map[graph.EdgeKind]bool, len(kindCounts)) if totalEdges > 0 { thresholdFrac := rareKindPct / 100.0 diff --git a/internal/mcp/tools_untested.go b/internal/mcp/tools_untested.go index 53096f26..560c9a18 100644 --- a/internal/mcp/tools_untested.go +++ b/internal/mcp/tools_untested.go @@ -33,12 +33,11 @@ func (s *Server) handleGetUntestedSymbols(ctx context.Context, req mcp.CallToolR // Fan-in map for ranking — incoming calls/references only; imports and // defines would flood every exported symbol with meaningless coverage. - fanIn := make(map[string]int) - for _, e := range s.graph.AllEdges() { - if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { - fanIn[e.To]++ - } - } + // Backends that implement graph.InEdgeCounter serve this from one + // Cypher count(*) join — on Ladybug the legacy AllEdges() loop + // materialised every edge over cgo just to bucket two kinds. The + // fallback walks AllEdges() as before. + fanIn := collectFanInByKind(s.graph, []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences}) type untestedEntry struct { ID string `json:"id"` @@ -51,10 +50,8 @@ func (s *Server) handleGetUntestedSymbols(ctx context.Context, req mcp.CallToolR var entries []untestedEntry totalCandidates := 0 - for _, n := range s.scopedNodes(ctx) { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } + scoped := s.scopedNodesByKinds(ctx, []graph.NodeKind{graph.KindFunction, graph.KindMethod}) + for _, n := range scoped { // Skip symbols defined inside test files — those ARE test code. if isTestFile(n.FilePath) { continue @@ -117,26 +114,49 @@ func (s *Server) handleGetUntestedSymbols(ctx context.Context, req mcp.CallToolR // Test files are detected via isTestFile so this works across languages // (Go _test.go, Python test_*.py, JS .spec.ts, etc.) without per-language // special-casing here. -func reachableFromTests(g *graph.Graph) map[string]bool { - covered := make(map[string]bool) - - // Seed: every function/method defined in a test file. - var frontier []string - for _, n := range g.AllNodes() { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue - } - if !isTestFile(n.FilePath) { - continue +// +// Seeds the frontier via NodesByKind(function|method) so disk backends +// only materialise the two kinds rather than the whole node table. +// The test-file predicate is a Go string heuristic — the backend has +// no equivalent — so it stays in the post-filter. +// +// The BFS itself runs through graph.ReachableForwardByKinds when the +// backend implements it (one Cypher query per layer over the frontier +// IN-list instead of N+1 GetOutEdges cgo round-trips). Falls back to +// the per-id GetOutEdges loop on backends that don't. +func reachableFromTests(g graph.Store) map[string]bool { + // Seed: every function/method defined in a test file. NodesByKind + // pushes the kind filter into the backend; isTestFile stays Go. + seeds := make([]string, 0) + for _, kind := range []graph.NodeKind{graph.KindFunction, graph.KindMethod} { + for n := range g.NodesByKind(kind) { + if n == nil || !isTestFile(n.FilePath) { + continue + } + seeds = append(seeds, n.ID) } - if !covered[n.ID] { - covered[n.ID] = true - frontier = append(frontier, n.ID) + } + if len(seeds) == 0 { + return map[string]bool{} + } + + kinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + if rf, ok := g.(graph.ReachableForwardByKinds); ok { + if got := rf.ReachableForwardByKinds(seeds, kinds); got != nil { + return got } + return map[string]bool{} } - // Forward BFS along calls + references. A test function that calls X - // covers X; X transitively covers whatever X calls, etc. + // Fallback: layer-by-layer BFS using per-id GetOutEdges. + covered := make(map[string]bool, len(seeds)) + frontier := make([]string, 0, len(seeds)) + for _, id := range seeds { + if !covered[id] { + covered[id] = true + frontier = append(frontier, id) + } + } for len(frontier) > 0 { next := frontier[:0:0] for _, id := range frontier { @@ -154,3 +174,32 @@ func reachableFromTests(g *graph.Graph) map[string]bool { } return covered } + +// collectFanInByKind returns the per-target incoming-edge count for +// every edge whose kind is in the allowlist. Prefers the +// graph.InEdgeCounter capability — backends that ship it run one +// Cypher count(*) per request instead of an AllEdges() materialisation +// + Go-side bucketing. +func collectFanInByKind(g graph.Store, kinds []graph.EdgeKind) map[string]int { + if len(kinds) == 0 { + return map[string]int{} + } + if ic, ok := g.(graph.InEdgeCounter); ok { + if got := ic.InEdgeCountsByKind(kinds); got != nil { + return got + } + return map[string]int{} + } + allowed := make(map[graph.EdgeKind]struct{}, len(kinds)) + for _, k := range kinds { + allowed[k] = struct{}{} + } + out := make(map[string]int) + for _, e := range g.AllEdges() { + if _, ok := allowed[e.Kind]; !ok { + continue + } + out[e.To]++ + } + return out +} diff --git a/internal/mcp/tools_wakeup.go b/internal/mcp/tools_wakeup.go index da04d12d..1ca2dd30 100644 --- a/internal/mcp/tools_wakeup.go +++ b/internal/mcp/tools_wakeup.go @@ -41,6 +41,13 @@ type WakeupOptions struct { TopCommunities int TopHotspots int TopEntryPoints int + // PrecomputedHotspots, when non-nil, is the default-threshold + // hotspot ranking the caller has already paid for. Threaded by + // the MCP handler from the server-wide cache so the wakeup turn + // skips a redundant FindHotspots (and its ComputeBetweenness + // pass). nil means BuildWakeup computes it fresh — the CLI + // `gortex wakeup` path. + PrecomputedHotspots []analysis.HotspotEntry } // DefaultWakeupOptions returns the defaults the MCP handler uses. @@ -58,7 +65,7 @@ func DefaultWakeupOptions() WakeupOptions { // communities. Returns the markdown body and an approximate token // count (bytes / 4). Exposed so CLI and MCP paths share one // implementation. -func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts WakeupOptions) (markdown string, tokensEst int) { +func BuildWakeup(g graph.Store, communities *analysis.CommunityResult, opts WakeupOptions) (markdown string, tokensEst int) { if opts.MaxTokens <= 0 { opts.MaxTokens = 500 } @@ -72,16 +79,23 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak opts.TopEntryPoints = 5 } - nodes := g.AllNodes() + // Wakeup is a whole-repo digest — language tally + hotspot list + + // entry-point list, with no session scoping. The lang count can + // come from Stats() (one indexed groupby on disk backends); + // hotspots and entry points already iterate the function/method + // subset via the analyzers / NodesByKindsScanner path, so the + // AllNodes() pull the legacy build used to feed the lang summary + // just adds a redundant 107k-row cgo trip on Ladybug. + stats := g.Stats() var b strings.Builder b.WriteString("# Codebase wakeup\n\n") - // Summary line: total nodes, top 3 languages. langCounts := map[string]int{} - for _, n := range nodes { - if n.Language != "" { - langCounts[n.Language]++ + for lang, c := range stats.ByLanguage { + if lang == "" { + continue } + langCounts[lang] = c } type langRow struct { name string @@ -105,8 +119,9 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak for _, l := range topLangs { langSummary = append(langSummary, fmt.Sprintf("%s (%d)", l.name, l.count)) } + fileCount := stats.ByKind[string(graph.KindFile)] fmt.Fprintf(&b, "**Scale.** %d indexed symbols across %d files. Primary: %s.\n\n", - len(nodes), countFileNodes(nodes), strings.Join(langSummary, ", ")) + stats.TotalNodes, fileCount, strings.Join(langSummary, ", ")) // Communities. if communities != nil && len(communities.Communities) > 0 { @@ -131,7 +146,12 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak } // Hotspots. - hotspots := analysis.FindHotspots(g, communities, 0) + var hotspots []analysis.HotspotEntry + if opts.PrecomputedHotspots != nil { + hotspots = opts.PrecomputedHotspots + } else { + hotspots = analysis.FindHotspots(g, communities, 0) + } if len(hotspots) > opts.TopHotspots { hotspots = hotspots[:opts.TopHotspots] } @@ -144,7 +164,7 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak } // Entry points. - entries := wakeupEntryPoints(nodes, g, opts.TopEntryPoints) + entries := wakeupEntryPoints(g, opts.TopEntryPoints) if len(entries) > 0 { b.WriteString("**Entry points.**\n") for _, e := range entries { @@ -158,42 +178,80 @@ func BuildWakeup(g *graph.Graph, communities *analysis.CommunityResult, opts Wak return out, len(out) / 4 } -func countFileNodes(nodes []*graph.Node) int { - n := 0 - for _, x := range nodes { - if x.Kind == graph.KindFile { - n++ + +// wakeupEntryPoints returns functions/methods with zero incoming +// edges and at least one outgoing edge, ranked by out-degree. +// +// Uses NodeDegreeAggregator when the backend implements it (one +// batched in/out count instead of up to 3N GetInEdges/GetOutEdges +// cgo round-trips on Ladybug — the sort path called GetOutEdges +// twice per candidate, the worst single hot spot in this file). We +// stash the fan-out alongside each node so the sort never has to +// re-query. +func wakeupEntryPoints(g graph.Store, top int) []*graph.Node { + type entry struct { + node *graph.Node + fanOut int + } + // Pull only the callable subset via NodesByKindsScanner so disk + // backends never materialise the whole node table for an entry- + // point candidate set that only ranges across function + method. + var pool []*graph.Node + if scan, ok := g.(graph.NodesByKindsScanner); ok { + pool = scan.NodesByKinds([]graph.NodeKind{graph.KindFunction, graph.KindMethod}) + } else { + all := g.AllNodes() + pool = make([]*graph.Node, 0, len(all)) + for _, n := range all { + if n.Kind == graph.KindFunction || n.Kind == graph.KindMethod { + pool = append(pool, n) + } } } - return n -} - -func wakeupEntryPoints(nodes []*graph.Node, g *graph.Graph, top int) []*graph.Node { - candidates := make([]*graph.Node, 0) - for _, n := range nodes { - if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { - continue + entries := make([]entry, 0, len(pool)) + if agg, ok := g.(graph.NodeDegreeAggregator); ok && len(pool) > 0 { + ids := make([]string, 0, len(pool)) + byID := make(map[string]*graph.Node, len(pool)) + for _, n := range pool { + ids = append(ids, n.ID) + byID[n.ID] = n } - if len(g.GetInEdges(n.ID)) > 0 { - continue + for _, r := range agg.NodeDegreeCounts(ids, nil) { + if r.InCount > 0 || r.OutCount == 0 { + continue + } + n := byID[r.NodeID] + if n == nil { + continue + } + entries = append(entries, entry{node: n, fanOut: r.OutCount}) } - if len(g.GetOutEdges(n.ID)) == 0 { - continue + } else { + for _, n := range pool { + if len(g.GetInEdges(n.ID)) > 0 { + continue + } + out := len(g.GetOutEdges(n.ID)) + if out == 0 { + continue + } + entries = append(entries, entry{node: n, fanOut: out}) } - candidates = append(candidates, n) } - sort.Slice(candidates, func(i, j int) bool { - oi := len(g.GetOutEdges(candidates[i].ID)) - oj := len(g.GetOutEdges(candidates[j].ID)) - if oi != oj { - return oi > oj + sort.Slice(entries, func(i, j int) bool { + if entries[i].fanOut != entries[j].fanOut { + return entries[i].fanOut > entries[j].fanOut } - return candidates[i].ID < candidates[j].ID + return entries[i].node.ID < entries[j].node.ID }) - if len(candidates) > top { - candidates = candidates[:top] + if len(entries) > top { + entries = entries[:top] + } + out := make([]*graph.Node, 0, len(entries)) + for _, e := range entries { + out = append(out, e.node) } - return candidates + return out } // trimToTokens caps the markdown to the requested approximate token @@ -226,6 +284,7 @@ func (s *Server) handleGortexWakeup(ctx context.Context, req mcp.CallToolRequest opts.TopEntryPoints = v } + opts.PrecomputedHotspots = s.getHotspots() md, est := BuildWakeup(s.graph, s.getCommunities(), opts) format := strings.ToLower(strings.TrimSpace(req.GetString("format", "markdown"))) diff --git a/internal/modules/scanner.go b/internal/modules/scanner.go index 2630aa20..3357fbd5 100644 --- a/internal/modules/scanner.go +++ b/internal/modules/scanner.go @@ -948,7 +948,7 @@ func BuildGraphArtifacts(filePath string, specs []Spec) ([]*graph.Node, []*graph // dependencies. Multi-version imports (Go's `module/v2` shape) // match the longest spec; a manifest declaring both `bar` and // `bar/v2` will resolve `import bar/v2/sub` to the v2 spec. -func LinkImports(g *graph.Graph, specs []Spec, ownModulePath string) int { +func LinkImports(g graph.Store, specs []Spec, ownModulePath string) int { if g == nil { return 0 } @@ -961,7 +961,7 @@ func LinkImports(g *graph.Graph, specs []Spec, ownModulePath string) int { // in multi-repo mode should pass the repo's own KindImport nodes (e.g. // from g.GetRepoNodes(repoPrefix) filtered by Kind) so each pass stays // O(repo size). -func LinkImportsIn(g *graph.Graph, importNodes []*graph.Node, specs []Spec, ownModulePath string) int { +func LinkImportsIn(g graph.Store, importNodes []*graph.Node, specs []Spec, ownModulePath string) int { if g == nil || len(specs) == 0 || len(importNodes) == 0 { return 0 } diff --git a/internal/parser/languages/go_dataflow.go b/internal/parser/languages/go_dataflow.go index 1b6c6d5c..d32a0213 100644 --- a/internal/parser/languages/go_dataflow.go +++ b/internal/parser/languages/go_dataflow.go @@ -23,13 +23,24 @@ import ( // `x := …` / `var x = …` / a range clause / a type-switch / a for- // statement init clause maps to a synthetic ID: // -// #local:@ +// #local:@+ // -// where ownerID is the enclosing function/method node and line is -// the 1-based decl line. These IDs are valid edge endpoints — the -// BFS in `flow_between` traverses them — but no graph node is -// materialised, keeping symbol search free of every transient -// binding in every function body. +// where ownerID is the enclosing function/method node and the +// offset is the local's 1-based line minus the function-decl's +// 1-based line. The leading `+` flags the value as a relative +// offset rather than an absolute line — important for the +// incremental indexer: adding a line *above* the enclosing +// function leaves every local-binding ID inside it stable, so the +// per-save edge churn collapses from O(locals-in-file) to +// O(locals-below-the-edit). +// +// Each binding is materialised as a KindLocal graph node anchored +// to the enclosing function via EdgeMemberOf, so dataflow edges +// targeting locals are not orphan endpoints — they navigate to a +// first-class node like every other edge. KindLocal nodes are +// excluded from the BM25 search index (see +// internal/indexer.shouldIndexForSearch) so identifiers like +// `err` / `data` / `n` / `i` don't flood search results. // // v1 limitations: // @@ -46,7 +57,7 @@ import ( // mirrors the call edge for the same call site. Indexer post- // resolution rewrites them once the callee is known — see // `materializeDataflowParams` in internal/indexer. -func emitGoDataflow(ownerID string, body *sitter.Node, paramsByName map[string]string, src []byte, filePath string, result *parser.ExtractionResult) { +func emitGoDataflow(ownerID string, ownerStartLine int, body *sitter.Node, paramsByName map[string]string, imports map[string]string, src []byte, filePath string, result *parser.ExtractionResult) { if body == nil { return } @@ -59,15 +70,51 @@ func emitGoDataflow(ownerID string, body *sitter.Node, paramsByName map[string]s scope.bindings[name] = []string{paramID} } walker := &goFlowWalker{ - ownerID: ownerID, - filePath: filePath, - src: src, - scope: scope, - result: result, + ownerID: ownerID, + ownerStartLine: ownerStartLine, + filePath: filePath, + src: src, + scope: scope, + result: result, + emittedLocals: map[string]struct{}{}, + imports: imports, } walker.walk(body) } +// bindLocal computes the canonical local-binding ID, registers it in +// scope, and on first sight emits the corresponding KindLocal node + +// EdgeMemberOf edge so the binding is a first-class graph element +// rather than a phantom edge endpoint. Returns the ID. Dedupe key is +// the ID itself: a binding visited through multiple walk paths still +// produces one node row. +func (w *goFlowWalker) bindLocal(name string, line int) string { + id := w.localID(name, line) + w.scope.bindings[name] = []string{id} + if _, ok := w.emittedLocals[id]; ok { + return id + } + w.emittedLocals[id] = struct{}{} + w.result.Nodes = append(w.result.Nodes, &graph.Node{ + ID: id, + Kind: graph.KindLocal, + Name: name, + FilePath: w.filePath, + StartLine: line, + EndLine: line, + Language: "go", + }) + w.result.Edges = append(w.result.Edges, &graph.Edge{ + From: id, + To: w.ownerID, + Kind: graph.EdgeMemberOf, + FilePath: w.filePath, + Line: line, + Origin: graph.OriginASTResolved, + }) + return id +} + // goFlowScope tracks the most recent source IDs for each named // binding inside a function body. Reassignment replaces the slice @@ -83,13 +130,29 @@ func newGoFlowScope() *goFlowScope { // goFlowWalker carries the per-function state needed to emit // dataflow edges. ownerID is the enclosing function node ID; -// scope tracks live bindings; result accumulates emitted edges. +// ownerStartLine is the 1-based source line of the function's +// declaration — local-binding IDs are anchored to it so edits +// above the function don't churn every binding inside; +// scope tracks live bindings; result accumulates emitted edges; +// emittedLocals dedupes KindLocal node emissions so a binding +// visited through more than one walk path doesn't produce +// duplicate node rows. type goFlowWalker struct { - ownerID string - filePath string - src []byte - scope *goFlowScope - result *parser.ExtractionResult + ownerID string + ownerStartLine int + filePath string + src []byte + scope *goFlowScope + result *parser.ExtractionResult + emittedLocals map[string]struct{} + // imports maps the file's package aliases to their import paths + // (`fmt → "fmt"`, `assert → "github.com/stretchr/testify/assert"`). + // Threaded through so the selector-expression cases in calleeRef / + // exprSources can emit `unresolved::extern::::` + // when the LHS identifier is an imported package — matching the + // shape the call extractor uses — instead of collapsing the + // qualifier to `*.` and losing the resolution evidence. + imports map[string]string } func (w *goFlowWalker) walk(n *sitter.Node) { @@ -126,10 +189,20 @@ func (w *goFlowWalker) walk(n *sitter.Node) { } // localID returns the synthetic local-binding ID for `name` at the -// given line. Always anchored to ownerID so two functions can have -// identically-named locals without colliding. +// given absolute line. Always anchored to ownerID so two functions +// can have identically-named locals without colliding. The line is +// encoded as an offset from the owner's declaration line (prefixed +// `+` so it's unambiguous): a same-function shift caused by an edit +// above the function leaves the ID stable. A defensive zero-anchor +// fallback handles cases where the caller didn't supply an owner +// start line (the walker is constructed with one in production; the +// fallback keeps misuse from producing IDs missing the @ separator). func (w *goFlowWalker) localID(name string, line int) string { - return w.ownerID + "#local:" + name + "@" + strconv.Itoa(line) + offset := line + if w.ownerStartLine > 0 { + offset = line - w.ownerStartLine + 1 + } + return w.ownerID + "#local:" + name + "@+" + strconv.Itoa(offset) } func (w *goFlowWalker) handleShortVarDecl(n *sitter.Node) { @@ -224,9 +297,7 @@ func (w *goFlowWalker) declareTarget(lhs *sitter.Node, decl bool, line int) (str if name == "" || name == "_" { return "", false } - id := w.localID(name, line) - w.scope.bindings[name] = []string{id} - return id, true + return w.bindLocal(name, line), true case "selector_expression": // `x.field = …` — write goes to the field node when known. field := lhs.ChildByFieldName("field") @@ -343,8 +414,7 @@ func (w *goFlowWalker) handleRangeClause(n *sitter.Node) { if name == "" || name == "_" { continue } - id := w.localID(name, line) - w.scope.bindings[name] = []string{id} + id := w.bindLocal(name, line) for _, src := range rhsSources { if src == "" || src == id { continue @@ -477,11 +547,22 @@ func (w *goFlowWalker) calleeRef(call *sitter.Node) string { if method == "" { return "" } - // Receiver-typed targets (e.g. an import alias dispatch) - // can't be reconstructed without the file's import map. - // Fall through to the generic "*." form — same shape the - // call extractor uses when receiver is a local. - _ = recv + // Package-qualified call: when the receiver is a bare + // identifier matching one of the file's import aliases, + // emit the same `unresolved::extern::::` + // shape the call extractor uses for explicit calls (see + // golang.go::Extract `imports[c.receiver]` branch). The + // resolver's resolveExtern pass then lands these on + // stdlib::/dep::/external:: targets or the real cross-repo + // symbol when the import path resolves to an indexed file. + // Without this branch the qualifier is dropped and we leak + // `unresolved::*.` for every package call inside a + // dataflow context. + if recv != nil && recv.Type() == "identifier" { + if importPath := w.importPathFor(recv.Content(w.src)); importPath != "" { + return "unresolved::extern::" + importPath + "::" + method + } + } return "unresolved::*." + method case "generic_function": // `f[T](args)` — strip the type instantiation wrapper. @@ -551,6 +632,17 @@ func (w *goFlowWalker) exprSources(n *sitter.Node) []string { if fieldName == "" { return nil } + // Package-qualified value: when the receiver is a bare + // identifier matching one of the file's import aliases, + // emit `unresolved::extern::::` so the + // resolver can land it on stdlib::/dep::/external::. See + // the matching comment in calleeRef. + operand := n.ChildByFieldName("operand") + if operand != nil && operand.Type() == "identifier" { + if importPath := w.importPathFor(operand.Content(w.src)); importPath != "" { + return []string{"unresolved::extern::" + importPath + "::" + fieldName} + } + } return []string{"unresolved::*." + fieldName} case "call_expression": ref := w.calleeRef(n) @@ -666,3 +758,17 @@ func (w *goFlowWalker) emitValueFlow(src, dst string, line int) { Origin: graph.OriginASTResolved, }) } + +// importPathFor returns the import path the given identifier names +// as a package alias in the current file, or "" when the identifier +// doesn't match any import. The walker's imports map is the same +// map populated by the Go extractor's emitImport handler, so an +// `assert` alias for `github.com/stretchr/testify/assert` resolves +// here exactly as it does in the call extractor's +// `imports[c.receiver]` branch. +func (w *goFlowWalker) importPathFor(name string) string { + if name == "" || w.imports == nil { + return "" + } + return w.imports[name] +} diff --git a/internal/parser/languages/go_dataflow_local_nodes_test.go b/internal/parser/languages/go_dataflow_local_nodes_test.go new file mode 100644 index 00000000..b287bd79 --- /dev/null +++ b/internal/parser/languages/go_dataflow_local_nodes_test.go @@ -0,0 +1,118 @@ +package languages + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// TestGoDataflow_LocalsMaterialiseAsKindLocal is the regression for +// the design change that lifted intra-function bindings from +// edge-endpoint-only IDs to first-class KindLocal nodes. Storage +// backends that enforce rel-table FK (Ladybug) had to +// auto-stub empty Node rows for every local-binding edge endpoint — +// 51k+ stubs on the gortex codebase. Materialising as KindLocal +// converges every backend's node count and gives locals a proper +// home in the graph via EdgeMemberOf to the enclosing function. +func TestGoDataflow_LocalsMaterialiseAsKindLocal(t *testing.T) { + src := `package foo + +func Handler(x int) int { + y := x + z := y + return z +} +` + fix := runGoExtract(t, src) + owner := "pkg/foo.go::Handler" + + locals := fix.nodesByKind[graph.KindLocal] + require.NotEmpty(t, locals, "extractor should emit KindLocal nodes for short_var_decl bindings") + + names := map[string]*graph.Node{} + for _, n := range locals { + names[n.Name] = n + } + for _, want := range []string{"y", "z"} { + n, ok := names[want] + require.Truef(t, ok, "missing KindLocal for %q; got: %v", want, names) + assert.Equal(t, graph.KindLocal, n.Kind) + assert.Equal(t, "pkg/foo.go", n.FilePath, "local %q should carry the file it lives in", want) + assert.Equal(t, "go", n.Language, "local %q should carry language", want) + assert.Greater(t, n.StartLine, 0, "local %q should carry a source line", want) + // The node ID must be exactly the same string the dataflow + // edges target — they're keyed by edge endpoint, so a + // mismatch silently breaks flow_between BFS. + assert.True(t, strings.HasPrefix(n.ID, owner+"#local:"+want+"@+"), + "local node ID must follow the function-relative offset convention, got %q", n.ID) + } + + // Every materialised local must have an EdgeMemberOf edge to the + // enclosing function — that's what makes the local discoverable + // as a member of its owner via get_callers / class_hierarchy. + memberEdges := fix.edgesByKind[graph.EdgeMemberOf] + memberOwners := map[string]string{} + for _, e := range memberEdges { + memberOwners[e.From] = e.To + } + for _, n := range locals { + owner, ok := memberOwners[n.ID] + assert.Truef(t, ok, "local %q must have an EdgeMemberOf edge", n.Name) + assert.Equalf(t, "pkg/foo.go::Handler", owner, + "local %q's EdgeMemberOf target must be the enclosing function", n.Name) + } +} + +// TestGoDataflow_LocalsDedupedAcrossWalks guards against duplicate +// KindLocal node emissions if the same binding is visited through +// more than one walk path (e.g., short_var + a subsequent reference +// in the same scope). The walker's emittedLocals set must collapse +// repeat visits to one node row. +func TestGoDataflow_LocalsDedupedAcrossWalks(t *testing.T) { + src := `package foo + +func Multi() { + y := 1 + _ = y + _ = y + _ = y +} +` + fix := runGoExtract(t, src) + ys := []string{} + for _, n := range fix.nodesByKind[graph.KindLocal] { + if n.Name == "y" { + ys = append(ys, n.ID) + } + } + assert.Lenf(t, ys, 1, "exactly one KindLocal row per (function, binding) — got: %v", ys) +} + +// TestGoDataflow_RangeClauseEmitsKindLocal covers the second binding +// site (the range-clause path) — confirms the materialisation isn't +// limited to short_var_decl / var_spec. +func TestGoDataflow_RangeClauseEmitsKindLocal(t *testing.T) { + src := `package foo + +func Iter(xs []int) int { + total := 0 + for i, v := range xs { + _ = i + total += v + } + return total +} +` + fix := runGoExtract(t, src) + names := map[string]bool{} + for _, n := range fix.nodesByKind[graph.KindLocal] { + names[n.Name] = true + } + for _, want := range []string{"total", "i", "v"} { + assert.Truef(t, names[want], "missing KindLocal for range binding %q; got %v", want, names) + } +} diff --git a/internal/parser/languages/go_dataflow_offset_test.go b/internal/parser/languages/go_dataflow_offset_test.go new file mode 100644 index 00000000..ab63f4e2 --- /dev/null +++ b/internal/parser/languages/go_dataflow_offset_test.go @@ -0,0 +1,177 @@ +package languages + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/zzet/gortex/internal/graph" +) + +// TestGoDataflow_LocalIDsAreFunctionRelative is the regression for +// the absolute-line local-ID encoding that produced O(locals-in-file) +// edge churn on every save: adding an unrelated line above a function +// shifted every local-binding ID inside it, so the per-file +// incremental update had to delete + re-insert every dataflow edge +// even when nothing inside the function changed. +// +// The function-relative encoding (#local:@+) +// anchors each binding's ID to the owner's declaration line, so the +// IDs are invariant under shifts of the function as a whole — only +// edits *inside* the function above a binding shift that binding's +// ID. The test indexes the same source twice — once verbatim, once +// with a comment inserted above the function — and asserts the local +// IDs match exactly. +func TestGoDataflow_LocalIDsAreFunctionRelative(t *testing.T) { + original := `package foo + +func Handler(x int) int { + y := x + z := y + return z +} +` + // Same Handler, but with 5 unrelated lines of comments above it. + // If local IDs used absolute lines, every #local: target in the + // extracted edges would shift by 5 and would NOT match the + // originals. + shifted := `package foo + +// shimmer +// shimmer +// shimmer +// shimmer +// shimmer +func Handler(x int) int { + y := x + z := y + return z +} +` + + collectLocalIDs := func(t *testing.T, src string) map[string]struct{} { + t.Helper() + fix := runGoExtract(t, src) + ids := map[string]struct{}{} + for _, edges := range fix.edgesByKind { + for _, e := range edges { + for _, ep := range []string{e.From, e.To} { + if strings.Contains(ep, "#local:") { + ids[ep] = struct{}{} + } + } + } + } + return ids + } + + origIDs := collectLocalIDs(t, original) + shiftedIDs := collectLocalIDs(t, shifted) + + // Sanity: the function actually has locals to compare. + assert.NotEmpty(t, origIDs, "extractor should emit #local: edge endpoints") + + // The two sets must match. Any divergence means a local-ID shifted + // because of the lines added *above* the function — the exact + // churn case the offset encoding is meant to prevent. + assert.Equal(t, origIDs, shiftedIDs, + "local IDs must stay stable when only lines ABOVE the function move") + + // Belt + suspenders: every #local: ID must carry the offset + // marker (`@+`) rather than the legacy `@`. + for id := range origIDs { + at := strings.LastIndex(id, "@") + assert.Greater(t, at, 0, "id has no @ separator: %q", id) + assert.Equal(t, byte('+'), id[at+1], "id must encode offset (`@+`), got %q", id) + } +} + +// TestGoDataflow_LocalIDsShiftOnIntraFunctionEdit confirms the +// converse: edits *inside* the function above a binding still shift +// that binding's ID. (The offset encoding only neutralises edits +// outside the function, not inside it — local-line motion within the +// function is the load-bearing disambiguator for the same name +// shadowed at different lines.) +func TestGoDataflow_LocalIDsShiftOnIntraFunctionEdit(t *testing.T) { + base := `package foo + +func Handler(x int) int { + y := x + return y +} +` + withInternalShift := `package foo + +func Handler(x int) int { + _ = 1 // <-- inserted INSIDE the function, above y + y := x + return y +} +` + collect := func(t *testing.T, src string) map[string]struct{} { + t.Helper() + ids := map[string]struct{}{} + for _, edges := range runGoExtract(t, src).edgesByKind { + for _, e := range edges { + for _, ep := range []string{e.From, e.To} { + if strings.Contains(ep, "#local:y@") { + ids[ep] = struct{}{} + } + } + } + } + return ids + } + + a := collect(t, base) + b := collect(t, withInternalShift) + assert.NotEmpty(t, a) + assert.NotEmpty(t, b) + assert.NotEqual(t, a, b, + "adding a line INSIDE the function above the binding MUST shift the local ID — this is the disambiguator for re-bound names") +} + +// TestGoClosureIDsAreFunctionRelative is the closure analogue of the +// local-binding test. The closure's anchor used to be the absolute +// `#closure@`; switching it to `#closure@+` gives the +// same churn-reduction benefit. The Name field still carries the +// absolute line for human readability in outlines. +func TestGoClosureIDsAreFunctionRelative(t *testing.T) { + original := `package foo + +func Outer() func() int { + return func() int { return 42 } +} +` + shifted := `package foo + +// a +// b +// c +func Outer() func() int { + return func() int { return 42 } +} +` + closureNodes := func(t *testing.T, src string) map[string]*graph.Node { + t.Helper() + fix := runGoExtract(t, src) + out := map[string]*graph.Node{} + for _, n := range fix.nodesByKind[graph.KindClosure] { + out[n.ID] = n + } + return out + } + + a := closureNodes(t, original) + b := closureNodes(t, shifted) + assert.NotEmpty(t, a, "extractor should emit at least one closure node") + + // IDs must match across the shift. + for id := range a { + assert.Contains(t, b, id, + "closure ID must stay stable when only lines ABOVE the enclosing function move") + assert.True(t, strings.Contains(id, "#closure@+"), + "closure ID must use the `@+` form, got %q", id) + } +} diff --git a/internal/parser/languages/go_dataflow_qualifier_test.go b/internal/parser/languages/go_dataflow_qualifier_test.go new file mode 100644 index 00000000..561ac1d3 --- /dev/null +++ b/internal/parser/languages/go_dataflow_qualifier_test.go @@ -0,0 +1,161 @@ +package languages + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/zzet/gortex/internal/graph" +) + +// TestGoDataflow_SelectorCallPreservesPackageQualifier is the +// regression for the dataflow walker dropping the package qualifier +// on selector calls (`fmt.Sprintf`, `strings.Join`, `assert.True`) +// and leaking `unresolved::*.` instead of the proper +// `unresolved::extern::::` shape the call +// extractor uses. The resolver's resolveExtern pass then lands +// these on stdlib::/dep::/external::, so without preserving the +// qualifier here every package-qualified call inside a dataflow +// context (argument source, return target, value flow) stays as +// an unresolved phantom. +func TestGoDataflow_SelectorCallPreservesPackageQualifier(t *testing.T) { + src := `package foo + +import ( + "fmt" + "strings" +) + +func Handler(input string) string { + cleaned := strings.TrimSpace(input) + return fmt.Sprintf("got: %s", cleaned) +} +` + fix := runGoExtract(t, src) + + // Every `unresolved::extern::::` target the + // dataflow walker emits must use the canonical import path, + // not the `*.method` collapsed form. + var hasStringsTrimSpace, hasFmtSprintf bool + for _, edges := range fix.edgesByKind { + for _, e := range edges { + switch e.To { + case "unresolved::extern::strings::TrimSpace": + hasStringsTrimSpace = true + case "unresolved::extern::fmt::Sprintf": + hasFmtSprintf = true + } + } + } + + assert.True(t, hasStringsTrimSpace, + "dataflow walker must preserve the `strings` qualifier on TrimSpace(...) calls — got: %s", + dumpDataflowSelectorTargets(fix)) + assert.True(t, hasFmtSprintf, + "dataflow walker must preserve the `fmt` qualifier on Sprintf(...) calls — got: %s", + dumpDataflowSelectorTargets(fix)) + + // And the collapsed `*.TrimSpace`/`*.Sprintf` shape must NOT + // appear for these calls. + for _, edges := range fix.edgesByKind { + for _, e := range edges { + assert.NotEqual(t, "unresolved::*.TrimSpace", e.To, + "package-qualified Trim should never land as `unresolved::*.TrimSpace`") + assert.NotEqual(t, "unresolved::*.Sprintf", e.To, + "package-qualified Sprintf should never land as `unresolved::*.Sprintf`") + } + } +} + +// TestGoDataflow_NonImportedReceiverFallsBack ensures the pass +// doesn't false-positive: when the receiver is NOT a package alias +// (a local variable, a struct field), it must keep emitting the +// `unresolved::*.` form so other passes can apply their +// own heuristics. +func TestGoDataflow_NonImportedReceiverFallsBack(t *testing.T) { + src := `package foo + +type Buffer struct{} + +func (b *Buffer) Write(p []byte) {} + +func Run(buf *Buffer, data []byte) { + buf.Write(data) +} +` + fix := runGoExtract(t, src) + + // `buf.Write(data)` — buf is a parameter, NOT an import; the + // walker's fallback path must keep `*.` (the call extractor's + // own path already records receiver_type on the call edge). + var seen bool + for _, edges := range fix.edgesByKind { + for _, e := range edges { + if e.To == "unresolved::*.Write" { + seen = true + } + assert.NotEqual(t, "unresolved::extern::buf::Write", e.To, + "`buf` is a parameter — must not be classified as a package alias") + } + } + assert.True(t, seen, "the walker must still emit `unresolved::*.Write` for non-import receivers; "+ + "got: %s", dumpDataflowSelectorTargets(fix)) +} + +func dumpDataflowSelectorTargets(fix *extractedFixture) string { + var b strings.Builder + for _, edges := range fix.edgesByKind { + for _, e := range edges { + if strings.Contains(e.To, "Sprintf") || strings.Contains(e.To, "TrimSpace") || strings.Contains(e.To, "Write") { + b.WriteString("\n [" + string(e.Kind) + "] " + e.From + " -> " + e.To) + } + } + } + return b.String() +} + +// guard: also verifies the same fix applies in exprSources (not just +// calleeRef) — a selector accessed as a value (not invoked) should +// also preserve its qualifier. Uses a real stdlib import so the +// extractor's emitImport handler matches its production code path. +func TestGoDataflow_SelectorValuePreservesQualifier(t *testing.T) { + src := `package foo + +import "os" + +func DefaultPerm() any { + return os.ModePerm +} +` + fix := runGoExtract(t, src) + _ = graph.KindFunction + + var foundProperShape bool + for _, edges := range fix.edgesByKind { + for _, e := range edges { + // handleReturn emits `From: src, To: owner` — flow goes + // FROM the value source TO the function's owner. So the + // qualified target lives on e.From, not e.To. + if strings.HasPrefix(e.From, "unresolved::extern::os::") || + strings.HasPrefix(e.To, "unresolved::extern::os::") { + foundProperShape = true + } + } + } + assert.True(t, foundProperShape, + "selector-value access (os.ModePerm) must emit the extern:: shape; got:\n%s", + dumpAllSelectorish(fix)) +} + +func dumpAllSelectorish(fix *extractedFixture) string { + var b strings.Builder + for _, edges := range fix.edgesByKind { + for _, e := range edges { + if strings.Contains(e.To, "ModePerm") || strings.Contains(e.To, "::os::") || strings.HasPrefix(e.To, "unresolved::*.") { + b.WriteString(" [" + string(e.Kind) + "] " + e.From + " -> " + e.To + "\n") + } + } + } + return b.String() +} diff --git a/internal/parser/languages/go_function_shape.go b/internal/parser/languages/go_function_shape.go index 27cebdc5..7b6211ce 100644 --- a/internal/parser/languages/go_function_shape.go +++ b/internal/parser/languages/go_function_shape.go @@ -24,7 +24,7 @@ import ( // declLine is the 1-based line of the declaration, used as the // anchor for nodes/edges that don't have a finer-grained AST // position to reference. -func emitGoFunctionShape(ownerID string, defNode *sitter.Node, paramsCap, resultCap *parser.CapturedNode, src []byte, filePath string, declLine int, result *parser.ExtractionResult) { +func emitGoFunctionShape(ownerID string, defNode *sitter.Node, paramsCap, resultCap *parser.CapturedNode, src []byte, filePath string, declLine int, imports map[string]string, result *parser.ExtractionResult) { if defNode == nil { return } @@ -32,15 +32,21 @@ func emitGoFunctionShape(ownerID string, defNode *sitter.Node, paramsCap, result emitGoReturnEdges(ownerID, resultCap, src, filePath, declLine, result) emitGoGenericParamNodes(ownerID, defNode, src, filePath, declLine, result) if body := goFuncBody(defNode); body != nil { - emitGoClosureNodes(ownerID, body, src, filePath, result) + emitGoClosureNodes(ownerID, declLine, body, src, filePath, result) emitGoChannelOps(ownerID, body, src, filePath, result) // CPG-lite intra-procedural dataflow: emits EdgeValueFlow, // EdgeArgOf, and EdgeReturnsTo placeholders. Inter-procedural // targets are lifted by the indexer's // MaterializeDataflowParams pass once the call resolver - // has landed every callee. + // has landed every callee. declLine anchors local-binding + // IDs as offsets so edits above the function don't churn + // every binding inside. imports are the file's package + // aliases so selector-expression cases inside the walker + // can rewrite `pkg.Method` calls to the proper + // `unresolved::extern::::` shape + // instead of dropping the qualifier. paramsByName := goParamNamesFromCapture(paramsCap, src) - emitGoDataflow(ownerID, body, paramsByName, src, filePath, result) + emitGoDataflow(ownerID, declLine, body, paramsByName, imports, src, filePath, result) } } @@ -388,7 +394,7 @@ func emitGoGenericParamNodes(ownerID string, defNode *sitter.Node, src []byte, f // enclosing function. Re-attributing them would require teaching // the call-emit walker to recognise closure boundaries — tracked as // a Phase 1.5 follow-up. -func emitGoClosureNodes(ownerID string, body *sitter.Node, src []byte, filePath string, result *parser.ExtractionResult) { +func emitGoClosureNodes(ownerID string, ownerStartLine int, body *sitter.Node, src []byte, filePath string, result *parser.ExtractionResult) { if body == nil { return } @@ -398,7 +404,15 @@ func emitGoClosureNodes(ownerID string, body *sitter.Node, src []byte, filePath return true } startLine := int(n.StartPoint().Row) + 1 - closureID := ownerID + "#closure@" + strconv.Itoa(startLine) + // ID anchors on the owner-relative offset (+ prefix) so edits + // above the enclosing function don't churn the closure's ID. + // Name keeps the absolute line for human readability in search + // results / outlines. + offset := startLine + if ownerStartLine > 0 { + offset = startLine - ownerStartLine + 1 + } + closureID := ownerID + "#closure@+" + strconv.Itoa(offset) // If two anonymous functions start on the same line, append a // stable suffix so IDs stay unique. Rare in practice but // defensive. diff --git a/internal/parser/languages/golang.go b/internal/parser/languages/golang.go index 50a3a8b3..9df7e1de 100644 --- a/internal/parser/languages/golang.go +++ b/internal/parser/languages/golang.go @@ -279,10 +279,10 @@ func (e *GoExtractor) Extract(filePath string, src []byte) (*parser.ExtractionRe // No-op (the package name is not currently surfaced as a node). case m.Captures["func.def"] != nil: - e.emitFunction(m, filePath, fileID, src, result, paramsByFunc) + e.emitFunction(m, filePath, fileID, src, result, paramsByFunc, imports) case m.Captures["method.def"] != nil: - e.emitMethod(m, filePath, fileID, src, result, paramsByFunc) + e.emitMethod(m, filePath, fileID, src, result, paramsByFunc, imports) case m.Captures["typedef.def"] != nil: e.emitTypeDecl(m, filePath, fileID, src, result, seenTypeName) @@ -831,7 +831,7 @@ func (e *GoExtractor) Extract(filePath string, src []byte) (*parser.ExtractionRe // --- Per-match emit helpers ----------------------------------------- -func (e *GoExtractor) emitFunction(m parser.QueryResult, filePath, fileID string, src []byte, result *parser.ExtractionResult, paramsByFunc map[string]typeEnv) { +func (e *GoExtractor) emitFunction(m parser.QueryResult, filePath, fileID string, src []byte, result *parser.ExtractionResult, paramsByFunc map[string]typeEnv, imports map[string]string) { name := m.Captures["func.name"].Text def := m.Captures["func.def"] id := filePath + "::" + name @@ -875,7 +875,7 @@ func (e *GoExtractor) emitFunction(m parser.QueryResult, filePath, fileID string }) emitGoThrowsEdges(node, m.Captures["func.result"], filePath, result) emitGoFunctionShape(id, def.Node, m.Captures["func.params"], m.Captures["func.result"], - src, filePath, def.StartLine+1, result) + src, filePath, def.StartLine+1, imports, result) } // goFuncBody returns the `block` body child of a function/method @@ -897,7 +897,7 @@ func goFuncBody(decl *sitter.Node) *sitter.Node { return nil } -func (e *GoExtractor) emitMethod(m parser.QueryResult, filePath, fileID string, src []byte, result *parser.ExtractionResult, paramsByFunc map[string]typeEnv) { +func (e *GoExtractor) emitMethod(m parser.QueryResult, filePath, fileID string, src []byte, result *parser.ExtractionResult, paramsByFunc map[string]typeEnv, imports map[string]string) { name := m.Captures["method.name"].Text def := m.Captures["method.def"] receiverText := m.Captures["method.receiver"].Text @@ -958,7 +958,7 @@ func (e *GoExtractor) emitMethod(m parser.QueryResult, filePath, fileID string, }) emitGoThrowsEdges(node, m.Captures["method.result"], filePath, result) emitGoFunctionShape(id, def.Node, m.Captures["method.params"], m.Captures["method.result"], - src, filePath, def.StartLine+1, result) + src, filePath, def.StartLine+1, imports, result) } // goTypeParams reads the `type_parameters` child of a Go declaration @@ -1459,12 +1459,15 @@ func (e *GoExtractor) emitImport(m parser.QueryResult, filePath, fileID string, Language: "go", Meta: importMeta, }) - // File → import-node edge (Defines), so get_file_summary picks - // it up under the file's children. + // File → import-node edge. EdgeContains is the semantic fit (the + // file *contains* an import statement; it doesn't *define* the + // imported package). The Ladybug-backed GetFileSubGraph walks + // EdgeDefines ∪ EdgeContains from the file node to enumerate the + // full neighbourhood in one rel-index pass. result.Edges = append(result.Edges, &graph.Edge{ From: fileID, To: importNodeID, - Kind: graph.EdgeDefines, + Kind: graph.EdgeContains, FilePath: filePath, Line: line, }) diff --git a/internal/parser/languages/python.go b/internal/parser/languages/python.go index b689cae9..9cee7cb4 100644 --- a/internal/parser/languages/python.go +++ b/internal/parser/languages/python.go @@ -876,9 +876,13 @@ func pyEmitImportNode(filePath, fileID, importPath, alias string, line int, resu Language: "python", Meta: meta, }) + // File → import-node uses EdgeContains (the file contains an + // import statement; it doesn't define the imported module). + // GetFileSubGraph walks EdgeDefines ∪ EdgeContains to recover the + // full file neighbourhood. result.Edges = append(result.Edges, &graph.Edge{ From: fileID, To: importNodeID, - Kind: graph.EdgeDefines, FilePath: filePath, Line: line, + Kind: graph.EdgeContains, FilePath: filePath, Line: line, }) } diff --git a/internal/parser/languages/ts_dataflow.go b/internal/parser/languages/ts_dataflow.go new file mode 100644 index 00000000..6c5e405d --- /dev/null +++ b/internal/parser/languages/ts_dataflow.go @@ -0,0 +1,244 @@ +package languages + +import ( + "strconv" + + sitter "github.com/zzet/gortex/internal/parser/tsitter" + + "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/parser" +) + +// emitTSLocalBindings walks a TypeScript / JavaScript function body +// and materialises a KindLocal node for every introduced binding +// (`let x = …`, `const x = …`, `var x = …`, destructured shorthand, +// for-in/for-of induction vars, catch clause bindings, ...). Each +// binding gets: +// +// - ID `#local:@+` +// (function-relative offset like the Go walker, so an edit +// above the function leaves the IDs stable), +// - Name = the identifier, +// - FilePath / StartLine = the binding's source position, +// - EdgeMemberOf back to the enclosing function so the resolver's +// scope-aware bare-name binding (#81) can find it by walking +// the function's inbound EdgeMemberOf of KindLocal. +// +// TS doesn't (yet) have a dataflow walker analogous to +// emitGoDataflow, so no value_flow / arg_of / returns_to edges +// target these locals today. Their value is semantic parity with +// Go: every introduced binding is a first-class graph node with +// stable identity, ready for the dataflow / scope-resolution +// passes downstream. KindLocal is excluded from BM25 search via +// shouldIndexForSearch so the materialisation doesn't pollute name +// lookups with per-function `err` / `data` / `i` rows. +// +// Mirrors emitGoDataflow's bindLocal helper for the +// node-emission side; the walk shape is TypeScript-specific +// (different AST node types). +func emitTSLocalBindings(ownerID string, ownerStartLine int, body *sitter.Node, src []byte, filePath string, result *parser.ExtractionResult) { + if body == nil || ownerID == "" { + return + } + w := &tsBindingWalker{ + ownerID: ownerID, + ownerStartLine: ownerStartLine, + filePath: filePath, + src: src, + result: result, + emitted: map[string]struct{}{}, + } + w.walk(body) +} + +type tsBindingWalker struct { + ownerID string + ownerStartLine int + filePath string + src []byte + result *parser.ExtractionResult + emitted map[string]struct{} +} + +func (w *tsBindingWalker) walk(n *sitter.Node) { + if n == nil { + return + } + switch n.Type() { + case "function_declaration", "method_definition", "function", "arrow_function", "generator_function", "generator_function_declaration", "function_expression": + // Don't descend into nested functions — their bindings + // belong to the inner function's scope. The TS extractor's + // own pass handles each inner function separately. + return + case "lexical_declaration", "variable_declaration": + w.handleVarDecl(n) + // Fall through to children for any nested expressions + // (e.g. an initializer that contains a destructure pattern + // is already captured by handleVarDecl; no extra walk). + return + case "for_in_statement", "for_of_statement": + w.handleForInOf(n) + // Continue into the body to pick up nested declarations. + if body := n.ChildByFieldName("body"); body != nil { + w.walk(body) + } + return + case "catch_clause": + w.handleCatchClause(n) + if body := n.ChildByFieldName("body"); body != nil { + w.walk(body) + } + return + } + for i := 0; i < int(n.NamedChildCount()); i++ { + w.walk(n.NamedChild(i)) + } +} + +// handleVarDecl visits `let`, `const`, `var` declarations and emits +// a KindLocal node per declarator. Each declarator's `name` field +// is either an identifier (simplest case) or a destructure pattern +// (object_pattern / array_pattern) — for patterns we descend and +// emit one node per shorthand identifier. +func (w *tsBindingWalker) handleVarDecl(decl *sitter.Node) { + for i := 0; i < int(decl.NamedChildCount()); i++ { + c := decl.NamedChild(i) + if c == nil || c.Type() != "variable_declarator" { + continue + } + name := c.ChildByFieldName("name") + if name == nil { + continue + } + w.emitFromPattern(name, int(decl.StartPoint().Row)+1) + } +} + +// handleForInOf visits `for (const x of items)` / `for (let k in obj)` +// and materialises the induction var(s) declared on the LHS. +func (w *tsBindingWalker) handleForInOf(n *sitter.Node) { + left := n.ChildByFieldName("left") + if left == nil { + return + } + line := int(n.StartPoint().Row) + 1 + switch left.Type() { + case "lexical_declaration", "variable_declaration": + w.handleVarDecl(left) + case "identifier": + w.bindLocal(left.Content(w.src), line) + default: + w.emitFromPattern(left, line) + } +} + +// handleCatchClause materialises the catch parameter (`catch (err) +// { ... }`). TS supports both an identifier and a destructure +// pattern as the catch binding. +func (w *tsBindingWalker) handleCatchClause(n *sitter.Node) { + param := n.ChildByFieldName("parameter") + if param == nil { + return + } + w.emitFromPattern(param, int(n.StartPoint().Row)+1) +} + +// emitFromPattern recursively visits a binding pattern (identifier +// at the leaf; object_pattern / array_pattern in the middle) and +// emits a KindLocal node for every leaf identifier. Shorthand +// (`{ a, b }`) and renamed (`{ a: aliased }`) both produce +// identifier leaves the walker handles uniformly. +func (w *tsBindingWalker) emitFromPattern(node *sitter.Node, line int) { + if node == nil { + return + } + switch node.Type() { + case "identifier", "shorthand_property_identifier_pattern": + w.bindLocal(node.Content(w.src), line) + case "object_pattern", "array_pattern": + for i := 0; i < int(node.NamedChildCount()); i++ { + c := node.NamedChild(i) + if c == nil { + continue + } + switch c.Type() { + case "pair_pattern": + // `{ a: aliased }` — the bound name lives on the + // `value` field. + if v := c.ChildByFieldName("value"); v != nil { + w.emitFromPattern(v, line) + } + case "rest_pattern": + for j := 0; j < int(c.NamedChildCount()); j++ { + w.emitFromPattern(c.NamedChild(j), line) + } + default: + w.emitFromPattern(c, line) + } + } + case "assignment_pattern": + // `let x = 1` inside a destructure — the bound name is on + // the `left` field; the right is the default. + if l := node.ChildByFieldName("left"); l != nil { + w.emitFromPattern(l, line) + } + case "rest_pattern": + for i := 0; i < int(node.NamedChildCount()); i++ { + w.emitFromPattern(node.NamedChild(i), line) + } + } +} + +// bindLocal emits the KindLocal node + owner edge. Idempotent on +// the binding ID so a name visited through more than one walk path +// produces exactly one node row. +func (w *tsBindingWalker) bindLocal(name string, line int) { + if name == "" || name == "_" { + return + } + offset := line + if w.ownerStartLine > 0 { + offset = line - w.ownerStartLine + 1 + } + id := w.ownerID + "#local:" + name + "@+" + strconv.Itoa(offset) + if _, ok := w.emitted[id]; ok { + return + } + w.emitted[id] = struct{}{} + // Language tag mirrors the file's source language; the + // extractor's caller passes the file path so we recover it + // from the suffix. Defaults to typescript when ambiguous. + lang := "typescript" + switch { + case hasSuffix(w.filePath, ".tsx"): + lang = "tsx" + case hasSuffix(w.filePath, ".jsx"): + lang = "javascript" + case hasSuffix(w.filePath, ".js"), hasSuffix(w.filePath, ".mjs"), hasSuffix(w.filePath, ".cjs"): + lang = "javascript" + } + w.result.Nodes = append(w.result.Nodes, &graph.Node{ + ID: id, + Kind: graph.KindLocal, + Name: name, + FilePath: w.filePath, + StartLine: line, + EndLine: line, + Language: lang, + }) + w.result.Edges = append(w.result.Edges, &graph.Edge{ + From: id, + To: w.ownerID, + Kind: graph.EdgeMemberOf, + FilePath: w.filePath, + Line: line, + Origin: graph.OriginASTResolved, + }) +} + +func hasSuffix(s, suf string) bool { + if len(s) < len(suf) { + return false + } + return s[len(s)-len(suf):] == suf +} diff --git a/internal/parser/languages/ts_dataflow_test.go b/internal/parser/languages/ts_dataflow_test.go new file mode 100644 index 00000000..d0731363 --- /dev/null +++ b/internal/parser/languages/ts_dataflow_test.go @@ -0,0 +1,188 @@ +package languages + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// runTSLocalExtract is a thin adapter over the package's runTSExtract +// (declared in ts_function_shape_test.go) that returns the nodes and +// edges as a single struct convenient for the binding assertions +// below. +type tsLocalFixture struct { + nodes []*graph.Node + edges []*graph.Edge +} + +func runTSLocalExtract(t *testing.T, fileName, src string) tsLocalFixture { + t.Helper() + nodes, edges := runTSExtract(t, "pkg/"+fileName, src) + return tsLocalFixture{nodes: nodes, edges: edges} +} + +// TestEmitTSLocalBindings_LetConstVar covers the headline case: +// `let`, `const`, `var` declarations each produce a KindLocal node +// anchored to the enclosing function via EdgeMemberOf, with a +// function-relative offset ID so the binding stays stable across +// edits above the function. +func TestEmitTSLocalBindings_LetConstVar(t *testing.T) { + src := `function handler(req: any): string { + const raw = req.headers.authorization; + let token = raw.replace("Bearer ", ""); + var fallback = "anon"; + return token || fallback; +} +` + result := runTSLocalExtract(t, "auth.ts", src) + owner := "pkg/auth.ts::handler" + + locals := map[string]*graph.Node{} + for _, n := range result.nodes { + if n.Kind == graph.KindLocal { + locals[n.Name] = n + } + } + for _, want := range []string{"raw", "token", "fallback"} { + n, ok := locals[want] + require.Truef(t, ok, "missing KindLocal %q; got %v", want, mapKeys(locals)) + assert.Equal(t, graph.KindLocal, n.Kind) + assert.Equal(t, "pkg/auth.ts", n.FilePath) + assert.Truef(t, strings.HasPrefix(n.ID, owner+"#local:"+want+"@+"), + "local %q ID must be function-relative; got %q", want, n.ID) + } + + // Every local must have an EdgeMemberOf back to the owner. + memberFor := map[string]string{} + for _, e := range result.edges { + if e.Kind == graph.EdgeMemberOf { + memberFor[e.From] = e.To + } + } + for _, n := range locals { + assert.Equal(t, owner, memberFor[n.ID], + "local %q must own-link to enclosing function", n.Name) + } +} + +// TestEmitTSLocalBindings_DestructurePatterns ensures the walker +// handles object and array destructure patterns — common in JS/TS +// codebases (`const { foo, bar: aliased } = obj`). +func TestEmitTSLocalBindings_DestructurePatterns(t *testing.T) { + src := `function unpack(obj: any) { + const { foo, bar: aliased } = obj; + const [first, second] = obj.list; +} +` + result := runTSLocalExtract(t, "unpack.ts", src) + names := map[string]bool{} + for _, n := range result.nodes { + if n.Kind == graph.KindLocal { + names[n.Name] = true + } + } + for _, want := range []string{"foo", "aliased", "first", "second"} { + assert.Truef(t, names[want], "missing KindLocal for destructure %q; got %v", want, names) + } +} + +// TestEmitTSLocalBindings_ForOfBinding covers for-of induction vars +// — the parser's other binding-introduction site beyond plain +// declarations. +func TestEmitTSLocalBindings_ForOfBinding(t *testing.T) { + src := `function each(items: any[]) { + for (const item of items) { + const inner = item.value; + } +} +` + result := runTSLocalExtract(t, "each.ts", src) + names := map[string]bool{} + for _, n := range result.nodes { + if n.Kind == graph.KindLocal { + names[n.Name] = true + } + } + assert.True(t, names["item"], "for-of induction var must be materialised") + assert.True(t, names["inner"], "binding inside the loop body must be materialised") +} + +// TestEmitTSLocalBindings_NestedFunctionsScopeIsolated guards the +// walker against descending into nested functions (their bindings +// belong to their own scope, not the outer function's). +func TestEmitTSLocalBindings_NestedFunctionsScopeIsolated(t *testing.T) { + src := `function outer() { + const x = 1; + function inner() { + const y = 2; + } +} +` + result := runTSLocalExtract(t, "nested.ts", src) + outerOwner := "pkg/nested.ts::outer" + memberOwners := map[string]string{} + for _, e := range result.edges { + if e.Kind == graph.EdgeMemberOf { + memberOwners[e.From] = e.To + } + } + for _, n := range result.nodes { + if n.Kind != graph.KindLocal { + continue + } + switch n.Name { + case "x": + assert.Equal(t, outerOwner, memberOwners[n.ID], + "outer's local must own-link to outer") + case "y": + assert.NotEqual(t, outerOwner, memberOwners[n.ID], + "inner's local must NOT own-link to outer — different scope") + } + } +} + +// TestEmitTSLocalBindings_FunctionRelativeOffsetIsStable mirrors the +// Go regression at #76: adding a line above the function must NOT +// shift any local-binding ID inside it. +func TestEmitTSLocalBindings_FunctionRelativeOffsetIsStable(t *testing.T) { + orig := `function f() { + const x = 1; + const y = 2; +} +` + shifted := `// header +// header +// header +function f() { + const x = 1; + const y = 2; +} +` + collect := func(t *testing.T, src string) map[string]struct{} { + t.Helper() + ids := map[string]struct{}{} + for _, n := range runTSLocalExtract(t, "stable.ts", src).nodes { + if n.Kind == graph.KindLocal { + ids[n.ID] = struct{}{} + } + } + return ids + } + a := collect(t, orig) + b := collect(t, shifted) + assert.NotEmpty(t, a) + assert.Equal(t, a, b, + "local IDs must stay stable when only lines ABOVE the function move") +} + +func mapKeys(m map[string]*graph.Node) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} diff --git a/internal/parser/languages/ts_function_shape.go b/internal/parser/languages/ts_function_shape.go index d58062cf..26018cc4 100644 --- a/internal/parser/languages/ts_function_shape.go +++ b/internal/parser/languages/ts_function_shape.go @@ -34,6 +34,13 @@ func emitTSFunctionShape(ownerID string, declNode *sitter.Node, src []byte, file if body := tsFunctionBody(declNode); body != nil { emitTSAsyncSpawns(ownerID, body, src, filePath, result) emitTSFieldAccess(ownerID, body, src, filePath, result) + // Materialise let / const / var / range / catch bindings as + // KindLocal nodes — semantic parity with the Go extractor's + // #77 work. Idempotent on the binding ID (function-relative + // offset), excluded from BM25 search by shouldIndexForSearch, + // and consumed by the resolver's scope-aware bare-name bind + // (#81) for future dataflow / scope-resolution work. + emitTSLocalBindings(ownerID, declLine, body, src, filePath, result) } } diff --git a/internal/parser/languages/typescript.go b/internal/parser/languages/typescript.go index 8af445a3..528b5659 100644 --- a/internal/parser/languages/typescript.go +++ b/internal/parser/languages/typescript.go @@ -803,9 +803,14 @@ func (e *TypeScriptExtractor) emitImport(m parser.QueryResult, filePath, fileID Language: "typescript", Meta: importMeta, }) + // File → import-node uses EdgeContains (the file contains the + // import statement; it doesn't define the imported module). The + // resolver-facing file → unresolved::import path stays on + // EdgeImports unchanged — that's a file-to-file dependency, a + // different relationship. result.Edges = append(result.Edges, &graph.Edge{ From: fileID, To: importNodeID, - Kind: graph.EdgeDefines, FilePath: filePath, Line: line, + Kind: graph.EdgeContains, FilePath: filePath, Line: line, }) result.Edges = append(result.Edges, &graph.Edge{ From: fileID, To: "unresolved::import::" + importPath, diff --git a/internal/progress/zaplog.go b/internal/progress/zaplog.go new file mode 100644 index 00000000..8e98424c --- /dev/null +++ b/internal/progress/zaplog.go @@ -0,0 +1,114 @@ +package progress + +import ( + "context" + "sync" + "time" + + "go.uber.org/zap" +) + +// ZapReporter logs every Report call as a zap INFO line. Used in +// non-TTY environments (the daemon, CI) where the Spinner is +// silent so progress is invisible. Stage transitions get logged +// immediately; intra-stage progress (current/total) gets logged on +// transition AND every progressInterval seconds so a slow stage +// emits a heartbeat instead of going quiet. +type ZapReporter struct { + logger *zap.Logger + prefix string + interval time.Duration + + mu sync.Mutex + lastStage string + stageStart time.Time + lastEmitted time.Time + lastCur int + lastTotal int +} + +// NewZapReporter creates a reporter that logs to the given logger. +// prefix is added to every log line ("indexer", "multi-repo", …). +// interval is the heartbeat cadence for intra-stage progress +// (0 disables heartbeats — only stage transitions log). +func NewZapReporter(logger *zap.Logger, prefix string, interval time.Duration) *ZapReporter { + if logger == nil { + logger = zap.NewNop() + } + return &ZapReporter{ + logger: logger, + prefix: prefix, + interval: interval, + } +} + +// Report records a stage advancement. Always logs on a stage +// transition; logs intra-stage updates at most once per interval. +func (r *ZapReporter) Report(stage string, cur, total int) { + r.mu.Lock() + defer r.mu.Unlock() + now := time.Now() + if stage != r.lastStage { + if r.lastStage != "" { + r.logger.Info(r.prefix+": stage end", + zap.String("stage", r.lastStage), + zap.Duration("elapsed", now.Sub(r.stageStart)), + ) + } + r.lastStage = stage + r.stageStart = now + r.lastEmitted = now + r.lastCur = cur + r.lastTotal = total + r.logger.Info(r.prefix+": stage start", + zap.String("stage", stage), + zap.Int("current", cur), + zap.Int("total", total), + ) + return + } + // Same stage — heartbeat at most once per interval. + if r.interval > 0 && now.Sub(r.lastEmitted) < r.interval { + return + } + r.lastEmitted = now + r.lastCur = cur + r.lastTotal = total + r.logger.Info(r.prefix+": stage progress", + zap.String("stage", stage), + zap.Int("current", cur), + zap.Int("total", total), + zap.Duration("elapsed", now.Sub(r.stageStart)), + ) +} + +// StartHeartbeat runs a goroutine that logs an "alive" line every +// interval until the context is done. Useful when the indexer is +// inside a long-running phase that doesn't call Report itself +// (e.g. ladybug's per-row Cypher writes during a slow drain). +func StartHeartbeat(ctx context.Context, logger *zap.Logger, prefix string, interval time.Duration, snapshot func() map[string]any) { + if logger == nil || interval <= 0 { + return + } + go func() { + t := time.NewTicker(interval) + defer t.Stop() + start := time.Now() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + fields := []zap.Field{ + zap.Duration("elapsed", time.Since(start)), + } + if snapshot != nil { + for k, v := range snapshot() { + fields = append(fields, zap.Any(k, v)) + } + } + logger.Info(prefix+": heartbeat", fields...) + } + } + }() +} diff --git a/internal/query/class_hierarchy.go b/internal/query/class_hierarchy.go index 0feccdef..b27a8f40 100644 --- a/internal/query/class_hierarchy.go +++ b/internal/query/class_hierarchy.go @@ -50,6 +50,13 @@ var methodHierarchyEdgeKinds = map[graph.EdgeKind]bool{ // Workspace / project scope is enforced via opts.ScopeAllows on every // neighbour. opts.MinTier is applied as a post-pass over the collected // edges (consistent with the rest of the engine surface). +// +// Picks ClassHierarchyTraverser when the backend implements it: that +// path runs the BFS as one variable-length traversal per direction +// inside the engine, replacing the per-node GetNode + GetIn/OutEdges +// loop the fallback runs. On Ladybug a deep walk over a wide +// implementer set previously fired hundreds of cgo round-trips per +// call — the pushdown drops to one or two queries. func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, depth int, includeMethods bool, opts QueryOptions) *SubGraph { if direction == "" { direction = HierarchyBoth @@ -61,6 +68,272 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep depth = 64 } + seed := e.g.GetNode(seedID) + if seed == nil { + return &SubGraph{} + } + + if _, ok := e.g.(graph.ClassHierarchyTraverser); ok { + return e.classHierarchyPushdown(seed, direction, depth, includeMethods, opts) + } + return e.classHierarchyWalk(seed, direction, depth, includeMethods, opts) +} + +// classHierarchyPushdown runs the BFS through the +// ClassHierarchyTraverser capability. Each direction issues one or +// two backend round-trips (the type-edge kinds, optionally chasing +// methods through EdgeMemberOf) instead of the per-frontier per-hop +// loop the fallback runs. +func (e *Engine) classHierarchyPushdown( + seed *graph.Node, + direction HierarchyDirection, + depth int, + includeMethods bool, + opts QueryOptions, +) *SubGraph { + tr := e.g.(graph.ClassHierarchyTraverser) + walkUp := direction == HierarchyUp || direction == HierarchyBoth + walkDown := direction == HierarchyDown || direction == HierarchyBoth + + typeKinds := []graph.EdgeKind{graph.EdgeExtends, graph.EdgeImplements, graph.EdgeComposes} + methodKinds := []graph.EdgeKind{graph.EdgeOverrides} + + // Per-direction walks: type-hierarchy kinds rooted at seed if seed + // is a type/interface; method-hierarchy kinds rooted at seed if + // seed is a method/function. Methods reached via includeMethods + // are added as separate roots in a follow-up pass. + var rows []graph.ClassHierarchyRow + seedIsType := seed.Kind == graph.KindType || seed.Kind == graph.KindInterface + seedIsMethod := seed.Kind == graph.KindMethod || seed.Kind == graph.KindFunction + if seedIsType { + if walkUp { + rows = append(rows, tr.ClassHierarchyTraverse(seed.ID, "up", typeKinds, depth)...) + } + if walkDown { + rows = append(rows, tr.ClassHierarchyTraverse(seed.ID, "down", typeKinds, depth)...) + } + } else if seedIsMethod { + if walkUp { + rows = append(rows, tr.ClassHierarchyTraverse(seed.ID, "up", methodKinds, depth)...) + } + if walkDown { + rows = append(rows, tr.ClassHierarchyTraverse(seed.ID, "down", methodKinds, depth)...) + } + } + + // Collect the node IDs visited so we can resolve them in one + // batched fetch, instead of one GetNode per row. + visited := map[string]bool{seed.ID: true} + for _, r := range rows { + for _, id := range r.Path { + visited[id] = true + } + } + + // includeMethods folds in EdgeMemberOf hops from every visited + // type node. The override walk on each method then runs as a + // further pushdown call. + memberLinks := []struct { + from, to string + kind graph.EdgeKind + }{} + if includeMethods { + typeIDs := make([]string, 0, len(visited)) + for id := range visited { + n := e.g.GetNode(id) + if n == nil { + continue + } + if n.Kind == graph.KindType || n.Kind == graph.KindInterface { + typeIDs = append(typeIDs, id) + } + } + if len(typeIDs) > 0 { + memberIns := e.g.GetInEdgesByNodeIDs(typeIDs) + methodRoots := []string{} + for _, id := range typeIDs { + for _, ed := range memberIns[id] { + if ed == nil || ed.Kind != graph.EdgeMemberOf { + continue + } + member := e.g.GetNode(ed.From) + if member == nil { + continue + } + if member.Kind != graph.KindMethod && member.Kind != graph.KindFunction { + continue + } + memberLinks = append(memberLinks, struct { + from, to string + kind graph.EdgeKind + }{from: member.ID, to: id, kind: graph.EdgeMemberOf}) + if !visited[member.ID] { + visited[member.ID] = true + methodRoots = append(methodRoots, member.ID) + } + } + } + for _, mid := range methodRoots { + if walkUp { + subRows := tr.ClassHierarchyTraverse(mid, "up", methodKinds, depth) + for _, sr := range subRows { + for _, id := range sr.Path { + visited[id] = true + } + } + rows = append(rows, methodPathsWithRoot(mid, subRows)...) + } + if walkDown { + subRows := tr.ClassHierarchyTraverse(mid, "down", methodKinds, depth) + for _, sr := range subRows { + for _, id := range sr.Path { + visited[id] = true + } + } + rows = append(rows, methodPathsWithRoot(mid, subRows)...) + } + } + } + } + + // Resolve every visited node + collect the edge pointers in one + // place. The capability doesn't carry edge pointers (Ladybug edges + // aren't first-class objects), so we re-resolve them via + // GetOutEdgesByNodeIDs / GetInEdgesByNodeIDs once per direction. + allIDs := make([]string, 0, len(visited)) + for id := range visited { + allIDs = append(allIDs, id) + } + nodeMap := e.g.GetNodesByIDs(allIDs) + if nodeMap[seed.ID] == nil { + nodeMap[seed.ID] = seed + } + + resultNodes := make([]*graph.Node, 0, len(allIDs)) + for _, id := range allIDs { + n := nodeMap[id] + if n == nil { + continue + } + if opts.WorkspaceID != "" && id != seed.ID && !opts.ScopeAllows(n) { + continue + } + resultNodes = append(resultNodes, n) + } + + // Reconstruct edges: each row's Path[i] → Path[i+1] (for i>=0) + // carries an edge of EdgeKinds[i]. The seed's first hop is from + // seed → Path[0]. The direction the walk came from determines + // whether the edge points seed→neighbour or neighbour→seed. + resultEdges := make([]*graph.Edge, 0) + seenEdge := make(map[string]bool) + addEdge := func(from, to string, kind graph.EdgeKind) { + // Find the actual *Edge so the downstream FilterByMinTier + // still has the origin / tier columns to read. + var found *graph.Edge + for _, ed := range e.g.GetOutEdges(from) { + if ed == nil { + continue + } + if ed.To == to && ed.Kind == kind { + found = ed + break + } + } + if found == nil { + // Direction-flipped lookup — happens when "down" walks + // hand back paths whose hops are in-edges of the seed. + for _, ed := range e.g.GetInEdges(from) { + if ed == nil { + continue + } + if ed.From == to && ed.Kind == kind { + found = ed + break + } + } + } + if found == nil { + return + } + k := found.From + "→" + found.To + "::" + string(found.Kind) + ":" + edgeMetaTag(found) + if seenEdge[k] { + return + } + seenEdge[k] = true + resultEdges = append(resultEdges, found) + } + for _, r := range rows { + prev := seed.ID + for i, nb := range r.Path { + if i >= len(r.EdgeKinds) { + break + } + addEdge(prev, nb, r.EdgeKinds[i]) + prev = nb + } + } + for _, link := range memberLinks { + addEdge(link.from, link.to, link.kind) + } + + // Workspace-scope post-filter for edges (any edge whose endpoints + // were dropped from resultNodes is also dropped). + if opts.WorkspaceID != "" { + nodeSet := make(map[string]bool, len(resultNodes)) + for _, n := range resultNodes { + nodeSet[n.ID] = true + } + filtered := resultEdges[:0] + for _, ed := range resultEdges { + if !nodeSet[ed.From] || !nodeSet[ed.To] { + continue + } + filtered = append(filtered, ed) + } + resultEdges = filtered + } + + sg := &SubGraph{ + Nodes: resultNodes, + Edges: resultEdges, + TotalNodes: len(resultNodes), + TotalEdges: len(resultEdges), + } + if opts.MinTier != "" { + sg.FilterByMinTier(opts.MinTier) + } + return sg +} + +// methodPathsWithRoot rebases the traversal rows so the seed prefix +// in their paths reflects the method root they came from rather than +// the outer ClassHierarchy seed. Returned rows are otherwise +// unchanged. +func methodPathsWithRoot(root string, rows []graph.ClassHierarchyRow) []graph.ClassHierarchyRow { + out := make([]graph.ClassHierarchyRow, len(rows)) + for i, r := range rows { + newPath := append([]string{root}, r.Path...) + newKinds := append([]graph.EdgeKind{}, r.EdgeKinds...) + // The seed→Path[0] hop is encoded by EdgeMemberOf in the outer + // addEdge pass, so we keep the EdgeKinds slice aligned with + // the slice the caller iterates ([0]=Path[0]→Path[1]). + out[i] = graph.ClassHierarchyRow{Path: newPath[1:], EdgeKinds: newKinds} + _ = newPath + } + return out +} + +// classHierarchyWalk is the in-memory BFS path. Kept verbatim so the +// in-memory backend has the same shape it had before the pushdown +// landed. +func (e *Engine) classHierarchyWalk( + seed *graph.Node, + direction HierarchyDirection, + depth int, + includeMethods bool, + opts QueryOptions, +) *SubGraph { walkUp := direction == HierarchyUp || direction == HierarchyBoth walkDown := direction == HierarchyDown || direction == HierarchyBoth @@ -77,9 +350,6 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep resultNodes = append(resultNodes, n) } - // Edges are deduped by their source pointer identity — the graph - // store hands out stable pointers per edge, so a pointer key is - // sufficient and avoids constructing a synthetic key per edge. edgeKey := func(ed *graph.Edge) string { return ed.From + "→" + ed.To + "::" + string(ed.Kind) + ":" + edgeMetaTag(ed) } @@ -95,17 +365,13 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep resultEdges = append(resultEdges, ed) } - seed := e.g.GetNode(seedID) - if seed == nil { - return &SubGraph{} - } addNode(seed) type queued struct { id string depth int } - queue := []queued{{id: seedID, depth: 0}} + queue := []queued{{id: seed.ID, depth: 0}} for len(queue) > 0 { cur := queue[0] @@ -122,10 +388,6 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep isType := curNode.Kind == graph.KindType || curNode.Kind == graph.KindInterface isMethod := curNode.Kind == graph.KindMethod || curNode.Kind == graph.KindFunction - // Pull in member methods of type/interface nodes when requested. - // This happens at the visit step (not as a hop), so methods land - // in the result without consuming a depth budget — they're a - // projection of the type, not a separate hierarchy hop. if includeMethods && isType { for _, mEdge := range e.g.GetInEdges(cur.id) { if mEdge.Kind != graph.EdgeMemberOf { @@ -143,15 +405,10 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep } addNode(member) addEdge(mEdge) - // Surface the method itself for the override walk in - // the next iteration. Same depth budget as the parent - // type so a method's overrides cost the same as walking - // to a method-seed at this depth. queue = append(queue, queued{id: member.ID, depth: cur.depth}) } } - // Pick edge kinds based on what kind of node we're standing on. var kindSet map[graph.EdgeKind]bool switch { case isType: @@ -159,7 +416,6 @@ func (e *Engine) ClassHierarchy(seedID string, direction HierarchyDirection, dep case isMethod: kindSet = methodHierarchyEdgeKinds default: - // Fields, params, files, etc. — nothing to walk. continue } diff --git a/internal/query/engine.go b/internal/query/engine.go index cb89b4a4..9767e905 100644 --- a/internal/query/engine.go +++ b/internal/query/engine.go @@ -3,6 +3,7 @@ package query import ( "sort" "strings" + "time" "github.com/zzet/gortex/internal/graph" "github.com/zzet/gortex/internal/search" @@ -51,7 +52,7 @@ func (e *Engine) Reader() graph.Reader { return e.g } // NewEngine creates a query engine wrapping the given graph. The // default 11-signal rerank.Pipeline is wired in; callers wanting a // custom signal set / weights override via SetRerank. -func NewEngine(g *graph.Graph) *Engine { +func NewEngine(g graph.Store) *Engine { return &Engine{g: g, rerank: rerank.NewDefault()} } @@ -128,13 +129,74 @@ func (e *Engine) FindSymbols(name string, kinds ...graph.NodeKind) []*graph.Node return filtered } -// GetFileSymbols returns all symbols defined in a file. +// GetFileSymbolsCounts returns the file's symbols and the count of +// edges adjacent to them, without materialising the edges themselves. +// Use it instead of GetFileSymbols when the caller only needs an +// edge total (gcx + compact output paths in get_file_summary), since +// the disk backends can collapse the edge round-trip into a server- +// side aggregate that's orders of magnitude cheaper than shipping +// every row back over cgo. +// +// Backends that implement graph.FileSubGraphCountReader handle the +// count server-side; others fall through to a full GetFileSymbols call +// and report len(sg.Edges) (correct, just not cheap). +func (e *Engine) GetFileSymbolsCounts(filePath string) *SubGraph { + if pd, ok := e.g.(graph.FileSubGraphCountReader); ok { + nodes, edgeCount := pd.GetFileSubGraphCounts(filePath) + if len(nodes) == 0 { + return &SubGraph{} + } + return &SubGraph{ + Nodes: nodes, + TotalNodes: len(nodes), + TotalEdges: edgeCount, + } + } + sg := e.GetFileSymbols(filePath) + if sg == nil { + return &SubGraph{} + } + // Strip edges — the caller asked for counts only and we don't + // want stale edge buffers riding back on the SubGraph. + sg.Edges = nil + return sg +} + +// GetFileSymbols returns the file node, every symbol the file +// defines or contains, and every edge adjacent to any of them. +// +// Backends that implement graph.FileSubGraphReader (the Ladybug +// store, for instance) handle the whole walk in one method call so +// they can express the symbol enumeration as a primary-key probe + +// rel-table FROM walk instead of a property-filter scan over Node. +// Backends without the capability fall through to the +// GetFileNodes + GetOut/InEdgesByNodeIDs trio — equivalent on the +// in-memory graph (the per-id lookups are already O(1)). func (e *Engine) GetFileSymbols(filePath string) *SubGraph { + if pd, ok := e.g.(graph.FileSubGraphReader); ok { + nodes, edges := pd.GetFileSubGraph(filePath) + if len(nodes) == 0 { + return &SubGraph{} + } + return &SubGraph{ + Nodes: nodes, Edges: edges, + TotalNodes: len(nodes), TotalEdges: len(edges), + } + } nodes := e.g.GetFileNodes(filePath) - var edges []*graph.Edge + if len(nodes) == 0 { + return &SubGraph{} + } + ids := make([]string, 0, len(nodes)) for _, n := range nodes { - edges = append(edges, e.g.GetOutEdges(n.ID)...) - edges = append(edges, e.g.GetInEdges(n.ID)...) + ids = append(ids, n.ID) + } + outByID := e.g.GetOutEdgesByNodeIDs(ids) + inByID := e.g.GetInEdgesByNodeIDs(ids) + var edges []*graph.Edge + for _, id := range ids { + edges = append(edges, outByID[id]...) + edges = append(edges, inByID[id]...) } return &SubGraph{ Nodes: nodes, Edges: dedup(edges), @@ -304,6 +366,32 @@ func (e *Engine) FindUsagesScoped(nodeID string, opts QueryOptions) *SubGraph { edges := e.g.GetInEdges(nodeID) nodeMap := make(map[string]*graph.Node) var filtered []*graph.Edge + + // First pass: collect every From id whose edge kind qualifies as + // a usage. We need the From *Node for the workspace / test + // filters below, but the legacy loop fetched it with one GetNode + // per edge — on Ladybug that's one cgo Cypher round-trip per + // inbound edge, which for hot symbols (hundreds of callers) was + // the dominant cost of find_usages. Pre-filter the kinds, then + // batch the lookup so the disk backend issues one query instead + // of N. The target nodeID rides on the same batch so the + // "include the target node itself" step at the end of this + // function does not need its own per-id call. + fromIDs := make([]string, 0, len(edges)+1) + seenFrom := make(map[string]struct{}, len(edges)) + for _, edge := range edges { + if !isUsageEdgeKind(edge.Kind) { + continue + } + if _, dup := seenFrom[edge.From]; dup { + continue + } + seenFrom[edge.From] = struct{}{} + fromIDs = append(fromIDs, edge.From) + } + fromIDs = append(fromIDs, nodeID) + fromByID := e.g.GetNodesByIDs(fromIDs) + for _, edge := range edges { // EdgeProvides + EdgeConsumes carry DI token relationships — // `@Inject(TOKEN)` and `{ provide: TOKEN, useValue: ... }` @@ -319,17 +407,8 @@ func (e *Engine) FindUsagesScoped(nodeID string, opts QueryOptions) *SubGraph { // callers via the legacy reads_config path; find_usages on a // Service returns Ingresses routing to it (EdgeDependsOn); // find_usages on an Image returns workloads pulling it. - if edge.Kind == graph.EdgeCalls || edge.Kind == graph.EdgeReferences || - edge.Kind == graph.EdgeInstantiates || - edge.Kind == graph.EdgeReturns || edge.Kind == graph.EdgeTypedAs || - edge.Kind == graph.EdgeImplements || edge.Kind == graph.EdgeExtends || - edge.Kind == graph.EdgeComposes || - edge.Kind == graph.EdgeProvides || edge.Kind == graph.EdgeConsumes || - edge.Kind == graph.EdgeReadsConfig || edge.Kind == graph.EdgeWritesConfig || - edge.Kind == graph.EdgeUsesEnv || edge.Kind == graph.EdgeConfigures || - edge.Kind == graph.EdgeMounts || edge.Kind == graph.EdgeExposes || - edge.Kind == graph.EdgeDependsOn { - from := e.g.GetNode(edge.From) + if isUsageEdgeKind(edge.Kind) { + from := fromByID[edge.From] if opts.WorkspaceID != "" && !opts.ScopeAllows(from) { continue } @@ -342,8 +421,8 @@ func (e *Engine) FindUsagesScoped(nodeID string, opts QueryOptions) *SubGraph { } } } - // Include the target node itself. - if n := e.g.GetNode(nodeID); n != nil { + // Include the target node itself (already in the batch above). + if n := fromByID[nodeID]; n != nil { nodeMap[n.ID] = n } nodes := make([]*graph.Node, 0, len(nodeMap)) @@ -389,11 +468,25 @@ func (e *Engine) SearchSymbolsRanked(query string, limit int, opts QueryOptions, } } + // Engine-side rctx wins over the opts-piggybacked one (the explicit + // arg is the load-bearing path for callers that build the context + // inline). Callers (the MCP search_symbols handler) that build the + // rctx upstream and want both BM25 calls to share the same edge- + // cache seeding pass it through opts.RerankContext instead. + gatherCtx := rctx + if gatherCtx == nil { + gatherCtx = opts.RerankContext + } + var cands []*rerank.Candidate if s := e.getSearch(); s != nil && s.Count() > 0 { - cands = e.gatherBackendCandidates(query, fetchLimit) + cands = e.gatherBackendCandidates(query, fetchLimit, opts, gatherCtx) } else { + start := time.Now() nodes := e.searchSubstring(query, fetchLimit) + if opts.SearchTimings != nil { + opts.SearchTimings.FallbackMS += time.Since(start).Milliseconds() + } cands = make([]*rerank.Candidate, 0, len(nodes)) for i, n := range nodes { cands = append(cands, &rerank.Candidate{Node: n, TextRank: i, VectorRank: -1}) @@ -418,13 +511,27 @@ func (e *Engine) SearchSymbolsRanked(query string, limit int, opts QueryOptions, // ranking within one merged corpus. No-op for a single-repo set. crossRepoRerank(cands) - if e.rerank != nil { + if e.rerank != nil && !opts.SkipInnerRerank { ctx := rctx if ctx == nil { ctx = &rerank.Context{} } ctx.Graph = e.g + // When the caller supplied opts.RerankContext (the bundle- + // seeding handler), inherit its cached edges so this per-call + // rerank's prepare can read them — saves the 2 batched edge + // fetches per BM25 fan-out on the bundle hot path. Session + // signals stay scoped to the OUTER rerank (the one the handler + // runs against the merged candidate set); the inner rerank + // gets a structural-only context plus the bundle-cached edges. + if rctx == nil && opts.RerankContext != nil { + ctx.InheritEdgeCacheFrom(opts.RerankContext) + } + rerankStart := time.Now() e.rerank.Rerank(query, cands, ctx) + if opts.SearchTimings != nil { + opts.SearchTimings.EngineRerankMS += time.Since(rerankStart).Milliseconds() + } } if len(cands) > limit { @@ -452,20 +559,186 @@ func (e *Engine) SearchSymbolsScoped(query string, limit int, opts QueryOptions) // substring / bigram-rescue matches. Each candidate carries its // 0-based TextRank and VectorRank (or -1 when the channel didn't // return it) so the rerank pipeline can score per channel. -func (e *Engine) gatherBackendCandidates(query string, limit int) []*rerank.Candidate { +// +// Bundle fast path: when the backend implements +// SymbolBundleSearcherBackend, BM25 hits + their Node payload + their +// in/out edges all arrive in one engine round-trip. The bundle's +// edges seed rctx (when non-nil) so the rerank pipeline's prepare +// pass can skip its own batched fetch entirely. Vector channel IDs +// (which don't carry edges in the bundle) still route through the +// per-call GetNodesByIDs + GetIn/OutEdgesByNodeIDs path; bundle and +// vector candidates merge into one rerank slice. +// +// Fallback (no bundle support): the legacy path — Search() / channel +// for IDs, GetNodesByIDs to materialise. On disk backends (Ladybug) +// the bundle fast path collapses 3 cgo round-trips (FTS + nodes + +// the rerank's 2 edge fetches) into 4 server-side queries with no +// engine→rerank boundary crossings; the GetNodesByIDs cost goes +// away entirely for the BM25 hits. +func (e *Engine) gatherBackendCandidates(query string, limit int, opts QueryOptions, rctx *rerank.Context) []*rerank.Candidate { backend := e.getSearch() + timings := opts.SearchTimings - // Pull text + vector channels separately when the backend exposes - // them (HybridBackend). Otherwise treat plain Search() output as - // text-only. + // Bundle fast path. The SymbolBundleSearcherBackend assertion + // chains through Swappable → HybridBackend → SymbolSearcherBackend + // in production; both Swappable and HybridBackend forward when + // the inner backend supports it. Vector IDs still need the + // per-call materialise — bundles don't carry vector hits. var ( - textResults []search.SearchResult - vectorIDs []string + textResults []search.SearchResult + vectorIDs []string + bundleHandled bool + bundleNodeByID = make(map[string]*graph.Node) ) - if cs, ok := backend.(search.ChannelSearcher); ok { - textResults, vectorIDs = cs.SearchChannels(query, limit*2) - } else { - textResults = backend.Search(query, limit*2) + if bsb, ok := backend.(search.SymbolBundleSearcherBackend); ok { + // Pull the vector channel separately when present. Bundles + // cover BM25 only; the engine merges vector hits below. + // VectorChannelOnly avoids re-running the text BM25 path — + // the bundle already returned the BM25 hits and their full + // node + edge payload. Falling back to SearchChannels here + // would double-pay the FTS Cypher cost per BM25 fan-out. + type vectorOnly interface { + VectorChannelOnly(query string, limit int) ([]string, search.ChannelTimings) + } + vectorOnlyBackend, vectorOnlyOK := backend.(vectorOnly) + bundleStart := time.Now() + bundles := bsb.SearchSymbolBundles(query, limit*2) + if timings != nil { + timings.BundleMS += time.Since(bundleStart).Milliseconds() + } + if len(bundles) > 0 { + bundleHandled = true + textResults = make([]search.SearchResult, 0, len(bundles)) + outSeed := make(map[string][]*graph.Edge, len(bundles)) + inSeed := make(map[string][]*graph.Edge, len(bundles)) + for _, b := range bundles { + if b.Node == nil { + continue + } + bundleNodeByID[b.Node.ID] = b.Node + textResults = append(textResults, search.SearchResult{ID: b.Node.ID, Score: b.Score}) + outSeed[b.Node.ID] = b.OutEdges + inSeed[b.Node.ID] = b.InEdges + } + // Seed the rerank context's edge caches so prepare() can + // skip its own batched fetch for the bundle-covered IDs. + // preSeeded=true is the contract that prepare's batched + // edge fetch is now redundant — see rerank.Context for the + // invariant the engine relies on (the next caller's + // candidate set is fully covered by these maps for the + // BM25 hits; vector / substring fallback hits are still + // served by the per-candidate accessor fallback). + if rctx != nil { + rctx.SeedEdgeCaches(inSeed, outSeed, true) + } + } + // Vector channel: only when the bundle path took the BM25 + // branch. Otherwise the fallback path below pulls both. + // VectorChannelOnly skips the BM25 re-run (the bundle already + // returned text hits + their full payload); a few hundred + // microseconds of embed + ANN, not a second FTS Cypher. + // + // opts.SkipVectorChannel suppresses the embed + ANN entirely. + // The MCP handler flips this on for identifier-shape queries + // (QueryClassSymbol / Path / Signature) where the rerank's + // classWeightTable already proves semantic contributes near- + // zero signal vs the BM25 channel — see classWeightTable in + // internal/search/rerank/query_kind.go. + if vectorOnlyOK && !opts.SkipVectorChannel { + vecIDs, stats := vectorOnlyBackend.VectorChannelOnly(query, limit*2) + vectorIDs = vecIDs + if timings != nil { + timings.EmbedMS += stats.EmbedMS + timings.VectorSearchMS += stats.VectorSearchMS + } + } + } + + // Legacy / fallback path: bundle backend absent OR returned no + // hits. Pull text + vector channels separately when the backend + // exposes them (HybridBackend). Otherwise treat plain Search() + // output as text-only. The wall-clock for the backend search + // call lands on the outer caller's BM25*MS bucket — measuring + // around the engine boundary captures the full per-call cost + // without double-counting against the post-call GetNodesByIDs / + // FindNodesByName / Fallback phases that this function + // instruments individually below. + if !bundleHandled { + type timedChan interface { + SearchChannelsTimed(query string, limit int) ([]search.SearchResult, []string, search.ChannelTimings) + } + switch { + case opts.SkipVectorChannel: + // Identifier-shape fast path: skip the vector channel + // (no embed, no ANN) and run text-only Search. The cost + // saved is the per-call embedder + vector index hit; the + // rerank's classWeightTable proves it's not earning its + // keep for these query classes. + textStart := time.Now() + textResults = backend.Search(query, limit*2) + if timings != nil { + timings.TextBackendMS += time.Since(textStart).Milliseconds() + } + default: + if tc, ok := backend.(timedChan); ok { + var stats search.ChannelTimings + textResults, vectorIDs, stats = tc.SearchChannelsTimed(query, limit*2) + if timings != nil { + timings.TextBackendMS += stats.TextMS + timings.EmbedMS += stats.EmbedMS + timings.VectorSearchMS += stats.VectorSearchMS + } + } else if cs, ok := backend.(search.ChannelSearcher); ok { + textStart := time.Now() + textResults, vectorIDs = cs.SearchChannels(query, limit*2) + if timings != nil { + timings.TextBackendMS += time.Since(textStart).Milliseconds() + } + } else { + textStart := time.Now() + textResults = backend.Search(query, limit*2) + if timings != nil { + timings.TextBackendMS += time.Since(textStart).Milliseconds() + } + } + } + } + + // Collect every ID NOT covered by the bundle path (vector hits + + // fallback path's text hits) and materialise them with one + // batched fetch. Empty IDs are tolerated — the batch lookup + // ignores them and the per-id insert short-circuits below. + idBatch := make([]string, 0, len(textResults)+len(vectorIDs)) + for _, r := range textResults { + if r.ID != "" { + if _, covered := bundleNodeByID[r.ID]; covered { + continue + } + idBatch = append(idBatch, r.ID) + } + } + for _, id := range vectorIDs { + if id != "" { + if _, covered := bundleNodeByID[id]; covered { + continue + } + idBatch = append(idBatch, id) + } + } + getNodesStart := time.Now() + nodeByID := e.g.GetNodesByIDs(idBatch) + if timings != nil { + timings.GetNodesMS += time.Since(getNodesStart).Milliseconds() + } + if nodeByID == nil { + // GetNodesByIDs returns nil for empty input — we still need a + // non-nil map below to merge the bundle's nodes into. + nodeByID = make(map[string]*graph.Node, len(bundleNodeByID)) + } + // Merge the bundle's already-materialised nodes into the same + // lookup map the per-candidate insert step below reads from. + for id, n := range bundleNodeByID { + nodeByID[id] = n } idx := make(map[string]int) // node ID → slice index for dedup @@ -475,7 +748,7 @@ func (e *Engine) gatherBackendCandidates(query string, limit int) []*rerank.Cand if id == "" { return } - node := e.g.GetNode(id) + node := nodeByID[id] if node == nil || node.Kind == graph.KindFile || node.Kind == graph.KindImport { return } @@ -510,50 +783,70 @@ func (e *Engine) gatherBackendCandidates(query string, limit int) []*rerank.Cand } // Exact-name matches that BM25 might rank low — splice them in at - // the tail of the text channel so they're still text-ranked. - for _, n := range e.g.FindNodesByName(query) { - if n.Kind == graph.KindFile || n.Kind == graph.KindImport { - continue + // the tail of the text channel so they're still text-ranked. The + // caller can suppress this when the query string is known to never + // match a literal Name (the combined-OR fan-out's concatenated bag + // of expansion terms, for example) — saves the Cypher round-trip + // that would unconditionally return zero rows. + if !opts.SkipExactNameSplice { + findNameStart := time.Now() + for _, n := range e.g.FindNodesByName(query) { + if n.Kind == graph.KindFile || n.Kind == graph.KindImport { + continue + } + if _, seen := idx[n.ID]; seen { + continue + } + idx[n.ID] = len(cands) + cands = append(cands, &rerank.Candidate{Node: n, TextRank: len(textResults), VectorRank: -1}) } - if _, seen := idx[n.ID]; seen { - continue + if timings != nil { + timings.FindNameMS += time.Since(findNameStart).Milliseconds() } - idx[n.ID] = len(cands) - cands = append(cands, &rerank.Candidate{Node: n, TextRank: len(textResults), VectorRank: -1}) } // Substring fallback for remaining slots — strictly TextRank=-1 // (the rerank pipeline still considers them via signature/recency - // signals, but BM25 can't speak to them). Matches are collected, - // sorted by ID, then truncated, so the candidate set does not - // depend on the randomised map-iteration order of AllNodes(). + // signals, but BM25 can't speak to them). The store-side + // FindNodesByNameContaining pushes the predicate into the backend + // index instead of materialising every node over cgo and filtering + // in Go — the old AllNodes loop is broken at Linux-kernel scale + // (10M+ symbols, hundreds of MB of nodes per query). We over-fetch + // by a small slack factor so dedup against existing cands still + // leaves room to fill `limit`. if len(cands) < limit { - lower := strings.ToLower(query) - var subMatches []*graph.Node - for _, n := range e.g.AllNodes() { + fallbackStart := time.Now() + fetch := (limit - len(cands)) * 2 + if fetch < limit { + fetch = limit + } + subMatches := e.g.FindNodesByNameContaining(query, fetch) + // Stable ordering — backends may return in catalog order, which + // is not a meaningful relevance signal here. + sort.Slice(subMatches, func(i, j int) bool { return subMatches[i].ID < subMatches[j].ID }) + for _, n := range subMatches { if n.Kind == graph.KindFile || n.Kind == graph.KindImport { continue } if _, seen := idx[n.ID]; seen { continue } - if strings.Contains(strings.ToLower(n.Name), lower) { - subMatches = append(subMatches, n) - } - } - sort.Slice(subMatches, func(i, j int) bool { return subMatches[i].ID < subMatches[j].ID }) - for _, n := range subMatches { idx[n.ID] = len(cands) cands = append(cands, &rerank.Candidate{Node: n, TextRank: -1, VectorRank: -1}) if len(cands) >= limit { break } } + if timings != nil { + timings.FallbackMS += time.Since(fallbackStart).Milliseconds() + } } // Bigram-overlap typo rescue. Same gates as the legacy path: // nothing else surfaced, query is one indivisible 4+ char token, - // backend can provide candidates. + // backend can provide candidates. The bigram backend also returns + // raw IDs — batch-materialise them too rather than fall back to + // per-id GetNode. if len(cands) == 0 && len(query) >= 4 && !strings.ContainsAny(query, " /.:_-") { if bg, ok := backend.(bigramProvider); ok { keys := len(query) - 1 @@ -561,18 +854,25 @@ func (e *Engine) gatherBackendCandidates(query string, limit int) []*rerank.Cand if minOverlap < 3 { minOverlap = 3 } - for _, id := range bg.BigramCandidates(query, minOverlap) { - if _, seen := idx[id]; seen { - continue - } - node := e.g.GetNode(id) - if node == nil || node.Kind == graph.KindFile || node.Kind == graph.KindImport { - continue - } - idx[id] = len(cands) - cands = append(cands, &rerank.Candidate{Node: node, TextRank: -1, VectorRank: -1}) - if len(cands) >= limit { - break + bigramIDs := bg.BigramCandidates(query, minOverlap) + // Skip the batch fetch entirely when the bigram backend + // returned nothing — otherwise we'd issue an empty Cypher + // round-trip. + if len(bigramIDs) > 0 { + bigramNodes := e.g.GetNodesByIDs(bigramIDs) + for _, id := range bigramIDs { + if _, seen := idx[id]; seen { + continue + } + node := bigramNodes[id] + if node == nil || node.Kind == graph.KindFile || node.Kind == graph.KindImport { + continue + } + idx[id] = len(cands) + cands = append(cands, &rerank.Candidate{Node: node, TextRank: -1, VectorRank: -1}) + if len(cands) >= limit { + break + } } } } @@ -725,18 +1025,11 @@ func (e *Engine) bfs(nodeID string, opts QueryOptions, forward bool, edgeKinds [ kindSet[k] = true } - visited := make(map[string]bool) + visited := map[string]bool{nodeID: true} var allNodes []*graph.Node var allEdges []*graph.Edge truncated := false - type item struct { - id string - depth int - } - queue := []item{{id: nodeID, depth: 0}} - visited[nodeID] = true - if n := e.g.GetNode(nodeID); n != nil { // The seed always enters the result, regardless of scope — // callers ask "what reaches X" with X already in mind. The @@ -744,92 +1037,147 @@ func (e *Engine) bfs(nodeID string, opts QueryOptions, forward bool, edgeKinds [ allNodes = append(allNodes, n) } - for len(queue) > 0 { - cur := queue[0] - queue = queue[1:] - - if cur.depth >= opts.Depth { - continue + // admit is the single place edge/node bookkeeping lives, shared by + // the batched and per-node expansion paths. It records the edge + // (unless the node budget is already full — the legacy code grew + // allEdges without bound, so a high-degree hub could pin gigabytes + // of edge structs), then admits a new, in-scope, non-test neighbour + // and returns its id to enqueue ("" = skip). + admit := func(edge *graph.Edge, neighborID string, neighbor *graph.Node) string { + // Skip unresolved/external targets. + if graph.IsUnresolvedTarget(neighborID) || strings.HasPrefix(neighborID, "external::") { + return "" } - - var edges []*graph.Edge - if bidir { - edges = append(e.g.GetOutEdges(cur.id), e.g.GetInEdges(cur.id)...) - } else if forward { - edges = e.g.GetOutEdges(cur.id) - } else { - edges = e.g.GetInEdges(cur.id) + // Once the node budget is full, stop recording edges too: the + // result is already truncated and an unbounded allEdges is the + // memory-blowup vector this guard closes. + if len(allNodes) >= opts.Limit { + truncated = true + return "" } + // ExcludeTests drops neighbours flagged as tests during a reverse + // traversal — a no-op for forward/bidirectional walks. + if opts.ExcludeTests && !forward && !bidir && isTestSource(neighbor) { + return "" + } + // Workspace/project scope: neighbours outside the bound scope are + // dropped along with the edge that pointed at them. + if opts.WorkspaceID != "" && neighbor != nil && !opts.ScopeAllows(neighbor) { + return "" + } + allEdges = append(allEdges, edge) + if visited[neighborID] { + return "" + } + visited[neighborID] = true + if neighbor == nil { + return "" + } + allNodes = append(allNodes, neighbor) + return neighborID + } - for _, edge := range edges { - if !bidir && !kindSet[edge.Kind] { - continue - } - - var neighborID string - if forward || bidir { - if edge.From == cur.id { - neighborID = edge.To - } else if bidir { - neighborID = edge.From - } else { + // A backend that implements graph.FrontierExpander (the ladybug + // store) returns a whole frontier's edges + neighbour nodes in one + // round-trip — no GetNode per edge, no meta decode. Bidirectional + // (cluster) walks and capability-less backends (the in-memory graph, + // whose reads are already O(1)) keep the per-node path. + expander, batched := e.g.(graph.FrontierExpander) + batched = batched && !bidir && len(edgeKinds) > 0 + + frontier := []string{nodeID} + for depth := 0; depth < opts.Depth && len(frontier) > 0 && len(allNodes) < opts.Limit; depth++ { + var next []string + if batched { + for _, h := range expander.ExpandFrontier(frontier, forward, edgeKinds, opts.Limit) { + if h.Edge == nil { continue } - } else { - if edge.To == cur.id { - neighborID = edge.From - } else { - continue + neighborID := h.Edge.To + if !forward { + neighborID = h.Edge.From } - } - - // Skip unresolved/external targets. - if strings.HasPrefix(neighborID, "unresolved::") || strings.HasPrefix(neighborID, "external::") { - continue - } - - // ExcludeTests drops neighbours flagged as tests during a - // reverse traversal — for forward traversals it's a no-op - // because callers asking "who depends on X" (reverse) are - // the only consumers of this filter today. - if opts.ExcludeTests && !forward && !bidir { - if n := e.g.GetNode(neighborID); isTestSource(n) { - continue + if id := admit(h.Edge, neighborID, h.Neighbor); id != "" { + next = append(next, id) } - } - - // Workspace/project scope. When opts.WorkspaceID is set, - // neighbours outside that scope are dropped along with the - // edge that pointed at them. Cross-workspace edges produced - // by the resolver only exist when an explicit - // cross_workspace_dep allows them, so this filter also - // acts as the query-time enforcement of "find_usages on a - // tuck symbol returns hits only from tuck". - if opts.WorkspaceID != "" { - if n := e.g.GetNode(neighborID); n != nil && !opts.ScopeAllows(n) { - continue + if len(allNodes) >= opts.Limit { + truncated = true + break } } - - allEdges = append(allEdges, edge) - - if visited[neighborID] { - continue + } else { + for _, cur := range frontier { + var edges []*graph.Edge + switch { + case bidir: + edges = append(e.g.GetOutEdges(cur), e.g.GetInEdges(cur)...) + case forward: + edges = e.g.GetOutEdges(cur) + default: + edges = e.g.GetInEdges(cur) + } + for _, edge := range edges { + if !bidir && !kindSet[edge.Kind] { + continue + } + var neighborID string + switch { + case forward || bidir: + if edge.From == cur { + neighborID = edge.To + } else if bidir { + neighborID = edge.From + } else { + continue + } + default: + if edge.To == cur { + neighborID = edge.From + } else { + continue + } + } + // One GetNode per neighbour (the legacy path fetched + // it twice — scope check, then materialise). + var neighbor *graph.Node + if !graph.IsUnresolvedTarget(neighborID) && !strings.HasPrefix(neighborID, "external::") { + neighbor = e.g.GetNode(neighborID) + } + if id := admit(edge, neighborID, neighbor); id != "" { + next = append(next, id) + } + if len(allNodes) >= opts.Limit { + truncated = true + break + } + } + if len(allNodes) >= opts.Limit { + break + } } - visited[neighborID] = true + } + frontier = next + } - n := e.g.GetNode(neighborID) - if n == nil { - continue + // ExpandFrontier returns meta-free neighbours; a full-detail caller + // (e.g. one reading Meta["signature"]) gets them re-hydrated in one + // batched round-trip. Brief callers (smart_context's ring, step-7) + // skip this — stripMeta would drop the meta anyway. + if batched && opts.Detail != "brief" && len(allNodes) > 1 { + if hyd, ok := e.g.(interface { + GetNodesByIDs(ids []string) map[string]*graph.Node + }); ok { + ids := make([]string, 0, len(allNodes)) + for _, n := range allNodes { + ids = append(ids, n.ID) } - - if len(allNodes) >= opts.Limit { - truncated = true - continue + if full := hyd.GetNodesByIDs(ids); full != nil { + for i, n := range allNodes { + if fn := full[n.ID]; fn != nil { + allNodes[i] = fn + } + } } - - allNodes = append(allNodes, n) - queue = append(queue, item{id: neighborID, depth: cur.depth + 1}) } } @@ -853,6 +1201,28 @@ func stripMeta(sg *SubGraph) { } } +// isUsageEdgeKind reports whether an edge kind counts as a "usage" +// for FindUsages — the same predicate the legacy inline if-chain +// evaluated. Hoisted into a function so the kind set can be reused +// across the pre-filter pass and the materialisation pass without +// drifting. +func isUsageEdgeKind(k graph.EdgeKind) bool { + switch k { + case graph.EdgeCalls, graph.EdgeReferences, + graph.EdgeInstantiates, + graph.EdgeReturns, graph.EdgeTypedAs, + graph.EdgeImplements, graph.EdgeExtends, + graph.EdgeComposes, + graph.EdgeProvides, graph.EdgeConsumes, + graph.EdgeReadsConfig, graph.EdgeWritesConfig, + graph.EdgeUsesEnv, graph.EdgeConfigures, + graph.EdgeMounts, graph.EdgeExposes, + graph.EdgeDependsOn: + return true + } + return false +} + // isTestSource reports whether a node was flagged as a test by the // indexer's test-edge pass. Used by QueryOptions.ExcludeTests to drop // callers/users that originate in tests, leaving production callers. @@ -865,14 +1235,29 @@ func isTestSource(n *graph.Node) bool { } func dedup(edges []*graph.Edge) []*graph.Edge { - seen := make(map[string]bool) - var out []*graph.Edge + if len(edges) == 0 { + return edges + } + // Struct key avoids the per-edge string concatenation the old + // implementation paid (e.From + "->" + e.To + ":" + kind) — on a + // 4 000-edge file the alloc storm dominated GetFileSymbols. + type dedupKey struct { + from string + to string + kind graph.EdgeKind + } + seen := make(map[dedupKey]struct{}, len(edges)) + out := make([]*graph.Edge, 0, len(edges)) for _, e := range edges { - key := e.From + "->" + e.To + ":" + string(e.Kind) - if !seen[key] { - seen[key] = true - out = append(out, e) + if e == nil { + continue + } + k := dedupKey{from: e.From, to: e.To, kind: e.Kind} + if _, ok := seen[k]; ok { + continue } + seen[k] = struct{}{} + out = append(out, e) } return out } diff --git a/internal/query/subgraph.go b/internal/query/subgraph.go index b7483574..d9265779 100644 --- a/internal/query/subgraph.go +++ b/internal/query/subgraph.go @@ -5,6 +5,7 @@ import ( "strings" "github.com/zzet/gortex/internal/graph" + "github.com/zzet/gortex/internal/search/rerank" ) // SubGraph is a JSON-serializable result from a graph query. @@ -60,6 +61,92 @@ type QueryOptions struct { // indexer's test-edge pass. Lets find_usages / get_callers answer // "who depends on X *in production*" without test-noise dilution. ExcludeTests bool `json:"exclude_tests,omitempty"` + + // SearchTimings, when non-nil, is populated by the search hot path + // (SearchSymbolsScoped → gatherBackendCandidates) with per-phase + // wall-clock breakdowns. Used by the MCP search_symbols handler's + // debug log line; nil disables instrumentation. Single-call: the + // caller MUST hand a fresh struct per query (the engine does not + // reset). Never serialised — `json:"-"` keeps the option struct + // JSON shape stable. + SearchTimings *SearchTimings `json:"-"` + + // RerankContext is the optional rerank context the engine uses when + // gathering bundle candidates: each bundle's in/out edges are + // seeded into the context's edge caches so the handler-side + // rerank.Pipeline.Rerank can skip its own batched edge fetch on + // the merged candidate set. Pass nil — the engine's gather path + // still works, the bundle's edges are just discarded after the + // per-call rerank. Never serialised. + RerankContext *rerank.Context `json:"-"` + + // SkipInnerRerank, when true, makes SearchSymbolsRanked skip its + // own per-call rerank.Pipeline.Rerank pass. Callers that fan a + // search across N expansion terms and merge the results themselves + // (the MCP search_symbols handler) re-run the rerank once on the + // merged candidate set with the full session-aware context — the + // inner per-call rerank is wasted work whose output is mostly + // discarded by the merge. Flipping this on collapses N+1 + // engine-side rerank invocations to zero. The merge-side rerank + // is the source of truth either way. + SkipInnerRerank bool `json:"-"` + + // SkipVectorChannel, when true, makes gatherBackendCandidates skip + // the vector channel entirely — no embedder call, no ANN search. + // Set by the MCP search_symbols handler on identifier-shape queries + // (QueryClassSymbol / QueryClassPath / QueryClassSignature) where + // the rerank's classWeightTable already proves the semantic + // channel contributes near-zero useful signal (multipliers 0.65 / + // 0.45 / 0.80 vs the baseline 1.00 for concept). Saves the embed + // + vector search round-trip on the common-case identifier lookup. + // The bundle path's vector-only branch and the legacy + // SearchChannels path both honour this flag. + SkipVectorChannel bool `json:"-"` + + // SkipExactNameSplice, when true, makes gatherBackendCandidates + // skip the FindNodesByName(query) splice-in. Set by callers that + // know the query string cannot match any exact node name — the + // fetchAndMergeBM25 fan-out's combined-OR call is the canonical + // case: a concatenated bag of expansion terms ("NewServer + // StartServer Server.Init …") can't be the literal Name of any + // node, so the FindNodesByName Cypher round-trip is wasted work. + // The primary query still runs the splice. + SkipExactNameSplice bool `json:"-"` +} + +// SearchTimings carries per-phase wall-clock measurements collected +// by the BM25 retrieval pipeline. Zero-valued fields mean the phase +// didn't run on this call (e.g. FallbackMS is 0 when the BM25 result +// already saturated the limit). +type SearchTimings struct { + BM25PrimaryMS int64 // time spent in the primary BM25 backend call + BM25ExpansionMS int64 // time spent across all expansion-term BM25 calls + GetNodesMS int64 // time spent materialising BM25/vector IDs via GetNodesByIDs + FindNameMS int64 // time spent on the FindNodesByName splice-in + FallbackMS int64 // time spent in the substring/name-contains fallback + // Sub-buckets of the BM25*MS totals — proves which phase inside + // the wrapper is actually slow. Accumulated across every + // primary + expansion BM25 invocation. + TextBackendMS int64 // strictly inside Backend.Search / text channel + EmbedMS int64 // inside embedder.Embed (vector path only) + VectorSearchMS int64 // inside vector.Search ANN call (vector path only) + EngineRerankMS int64 // inside rerank.Pipeline.Rerank in SearchSymbolsRanked + // BundleMS accumulates the wall-clock spent inside + // SymbolBundleSearcherBackend.SearchSymbolBundles (one Cypher per + // BM25 fan-out that returns Node + in/out edges in one bundle). + // When the backend supports bundles, the bundle path replaces the + // (TextBackend + GetNodes) sub-buckets; the bm25_backend_ms + // derivation in the handler subtracts BundleMS so the existing + // fields stay meaningful. + BundleMS int64 + // CacheHitRate is the fraction of post-merge candidates whose + // in/out edges were already in the rerank Context cache when the + // handler-side prepare() ran. 1.0 means every candidate was + // pre-seeded from a bundle; 0.0 means the rerank had to fetch + // every candidate's edges itself. Populated by the handler when + // the bundle path is active so the search_symbols debug log can + // surface how often the seeding actually catches. + CacheHitRate float64 } // ScopeAllows reports whether a node passes the workspace/project diff --git a/internal/query/walk.go b/internal/query/walk.go index cf35a1ad..7fb070bc 100644 --- a/internal/query/walk.go +++ b/internal/query/walk.go @@ -204,7 +204,7 @@ func (e *Engine) WalkBudgeted(startID string, opts WalkOptions) *SubGraph { neighborID = edge.From } - if strings.HasPrefix(neighborID, "unresolved::") || + if graph.IsUnresolvedTarget(neighborID) || strings.HasPrefix(neighborID, "external::") { continue } diff --git a/internal/reach/reach.go b/internal/reach/reach.go index ed9edcfd..b3d95fd4 100644 --- a/internal/reach/reach.go +++ b/internal/reach/reach.go @@ -105,7 +105,7 @@ var buildCounter uint64 // Safe to call repeatedly: existing reach_d* entries are overwritten // and the build counter advances each time so any consumer that read // an entry from a prior generation will fall back to a live walk. -func BuildIndex(g *graph.Graph) *Stats { +func BuildIndex(g graph.Store) *Stats { return BuildIndexCtx(context.Background(), g) } @@ -116,7 +116,7 @@ func BuildIndex(g *graph.Graph) *Stats { // longest stages on monorepo-scale graphs (~200 s on k8s with 150 k // impact seeds). Pure operator-visibility instrumentation: the per- // report call is cheap (no I/O when the reporter is the default no-op). -func BuildIndexCtx(ctx context.Context, g *graph.Graph) *Stats { +func BuildIndexCtx(ctx context.Context, g graph.Store) *Stats { if g == nil { return &Stats{} } @@ -146,6 +146,13 @@ func BuildIndexCtx(ctx context.Context, g *graph.Graph) *Stats { const reachProgressEvery = 1000 seedsDone := 0 + // Collect the seed nodes we stamp so we can persist the Meta back + // through the store in one batch at the end. On the in-memory + // backend the in-place stamp already persists (n is canonical); on + // disk backends (Ladybug) n is a GetNode reconstruction, so without + // the write-back the whole reach index would be computed and then + // thrown away. Mirrors the per-seed AddNode in Lookup's slow path. + stamped := make([]*graph.Node, 0, seedTotal) for _, n := range nodes { if n == nil || !ImpactSeedKind(n.Kind) { continue @@ -169,6 +176,7 @@ func BuildIndexCtx(ctx context.Context, g *graph.Graph) *Stats { setOrDeleteStrings(n.Meta, MetaReachD2Label, tiers[1].Labels) setOrDeleteStrings(n.Meta, MetaReachD3Label, tiers[2].Labels) + stamped = append(stamped, n) stats.NodesIndexed++ stats.EntriesD1 += len(tiers[0].IDs) stats.EntriesD2 += len(tiers[1].IDs) @@ -179,6 +187,12 @@ func BuildIndexCtx(ctx context.Context, g *graph.Graph) *Stats { reporter.Report("reachability index", seedsDone, seedTotal) } } + // Persist every stamped node's Meta back through the store in one + // batch (no-op-ish on the in-memory backend, the durable write on + // disk backends). AddBatch with no edges only upserts the nodes. + if len(stamped) > 0 { + g.AddBatch(stamped, nil) + } reporter.Report("reachability index", seedsDone, seedTotal) return stats } @@ -221,14 +235,31 @@ func setOrDeleteFloats(m map[string]any, key string, value []float64) { // filtered with ReachableEdge so the result matches AnalyzeImpact; // file / import nodes are walked through for fan-out but excluded // from the tier slices. -func compute(g *graph.Graph, seedID string) [3]tier { +func compute(g graph.Store, seedID string) [3]tier { var result [3]tier visited := map[string]struct{}{seedID: {}} current := []string{seedID} - for depth := 1; depth <= 3; depth++ { + for depth := 1; depth <= 3 && len(current) > 0; depth++ { + // Batch the whole BFS level's incoming-edge fetch into one + // backend round-trip. The per-node g.GetInEdges(id) form issued + // one Cypher query + cgo crossing per node on disk backends — an + // O(reachable-nodes) query storm that turned a single + // AnalyzeImpact live walk into a multi-minute (timeout) call on + // Ladybug. GetInEdgesByNodeIDs collapses it to one query per depth. + inEdges := g.GetInEdgesByNodeIDs(current) + + // First pass: discover this level's new From-nodes in + // deterministic (current-order, edge-order) order, recording the + // representative in-edge for each. + type cand struct { + from string + conf float64 + kind graph.EdgeKind + } var next []string + var cands []cand for _, id := range current { - for _, e := range g.GetInEdges(id) { + for _, e := range inEdges[id] { if !ReachableEdge(e.Kind) { continue } @@ -237,17 +268,30 @@ func compute(g *graph.Graph, seedID string) [3]tier { } visited[e.From] = struct{}{} next = append(next, e.From) + cands = append(cands, cand{from: e.From, conf: e.Confidence, kind: e.Kind}) + } + } - if n := g.GetNode(e.From); n == nil || - n.Kind == graph.KindFile || n.Kind == graph.KindImport { - continue - } - slot := depth - 1 - result[slot].IDs = append(result[slot].IDs, e.From) - result[slot].Conf = append(result[slot].Conf, e.Confidence) - result[slot].Labels = append(result[slot].Labels, - graph.ConfidenceLabelFor(e.Kind, e.Confidence)) + // Batch the node-kind lookups too — the original called + // g.GetNode(e.From) once per discovered node (a second per-node + // query storm on disk backends). File / import nodes are still + // walked through for fan-out (they stay in `next`) but excluded + // from the result tiers, exactly as before. + ids := make([]string, len(cands)) + for i := range cands { + ids[i] = cands[i].from + } + nodes := g.GetNodesByIDs(ids) + slot := depth - 1 + for _, c := range cands { + n := nodes[c.from] + if n == nil || n.Kind == graph.KindFile || n.Kind == graph.KindImport { + continue } + result[slot].IDs = append(result[slot].IDs, c.from) + result[slot].Conf = append(result[slot].Conf, c.conf) + result[slot].Labels = append(result[slot].Labels, + graph.ConfidenceLabelFor(c.kind, c.conf)) } current = next } @@ -287,7 +331,7 @@ func sortTierByID(t *tier) { // and bumps the build counter so any cached lookups dated to a prior // generation are invalidated. Use when the graph topology has shifted // so far that a full rebuild is cheaper than incremental invalidation. -func ClearIndex(g *graph.Graph) { +func ClearIndex(g graph.Store) { if g == nil { return } @@ -339,7 +383,7 @@ type Entry struct { // given seed, then caches forever. BuildIndex remains available for // `gortex enrich reach` (explicit prebuild) and for callers that // want to pay the cost up front under controlled conditions. -func Lookup(g *graph.Graph, seedID string) (d1, d2, d3 []Entry, hit bool) { +func Lookup(g graph.Store, seedID string) (d1, d2, d3 []Entry, hit bool) { if g == nil { return nil, nil, nil, false } @@ -386,6 +430,17 @@ func Lookup(g *graph.Graph, seedID string) (d1, d2, d3 []Entry, hit bool) { setOrDeleteStrings(n.Meta, MetaReachD2Label, tiers[1].Labels) setOrDeleteStrings(n.Meta, MetaReachD3Label, tiers[2].Labels) + // Persist the freshly-stamped Meta through the store. On the + // in-memory backend n is the canonical node, so the mutations above + // already stuck — AddNode re-inserts the same pointer idempotently. + // On disk backends (Ladybug) n is a per-call reconstruction returned + // by GetNode, so the in-place stamp would otherwise be discarded the + // moment this function returns: the lazy reach cache would never + // survive a single query, forcing a full recompute on every + // AnalyzeImpact / explain_change_impact / get_callers call. AddNode + // upserts the Meta column so the cache actually sticks. + g.AddNode(n) + d1 = readTier(n.Meta, MetaReachD1, MetaReachD1Conf, MetaReachD1Label) d2 = readTier(n.Meta, MetaReachD2, MetaReachD2Conf, MetaReachD2Label) d3 = readTier(n.Meta, MetaReachD3, MetaReachD3Conf, MetaReachD3Label) diff --git a/internal/releases/releases.go b/internal/releases/releases.go index 5d19a785..2a31a33f 100644 --- a/internal/releases/releases.go +++ b/internal/releases/releases.go @@ -37,8 +37,26 @@ import ( // unavailable. Errors silently produce an empty list — releases // enrichment is best-effort like blame. func ListTags(repoRoot string) []string { - cmd := exec.Command("git", "-C", repoRoot, - "for-each-ref", "--sort=creatordate", "--format=%(refname:short)", "refs/tags/") + return ListTagsOnBranch(repoRoot, "") +} + +// ListTagsOnBranch is ListTags scoped to tags reachable from `branch`. +// Empty branch means "every tag in the repo", matching ListTags. +// +// Restricting to a single branch is the canonical defence against +// feature-branch tags polluting the release timeline: tags that were +// only ever pushed on a topic branch (a "v0.0.0-test" tag from a +// rebase scratch, for instance) shouldn't appear in the persisted +// release order. Pass the repo's default branch ("origin/main", +// "main", …) when callers want that semantic. +func ListTagsOnBranch(repoRoot, branch string) []string { + args := []string{"-C", repoRoot, "for-each-ref", + "--sort=creatordate", "--format=%(refname:short)"} + if strings.TrimSpace(branch) != "" { + args = append(args, "--merged="+branch) + } + args = append(args, "refs/tags/") + cmd := exec.Command("git", args...) out, err := cmd.Output() if err != nil { return nil @@ -104,7 +122,7 @@ func ReleaseNodeID(repoPrefix, tag string) string { // // Errors from individual git invocations are tolerated — a broken // ref shouldn't kill enrichment for the rest of the tag set. -func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { +func EnrichGraph(g graph.Store, repoRoot string) (int, error) { return EnrichGraphWithRepoPrefix(g, repoRoot, "") } @@ -112,11 +130,24 @@ func EnrichGraph(g *graph.Graph, repoRoot string) (int, error) { // EnrichGraph. EnrichGraph delegates to it with an empty prefix; the // multi-repo enricher passes the per-repo prefix so KindRelease IDs // stay collision-free across repos. -func EnrichGraphWithRepoPrefix(g *graph.Graph, repoRoot, repoPrefix string) (int, error) { +// +// Walks every tag in the repo. Use EnrichGraphForBranch when callers +// want to restrict the timeline to tags reachable from a specific +// branch — typically the default branch — so topic-branch tags don't +// pollute the persisted history. +func EnrichGraphWithRepoPrefix(g graph.Store, repoRoot, repoPrefix string) (int, error) { + return EnrichGraphForBranch(g, repoRoot, repoPrefix, "") +} + +// EnrichGraphForBranch is EnrichGraphWithRepoPrefix scoped to tags +// reachable from `branch`. Empty branch means "every tag", matching +// the legacy behaviour. Mutations round-trip through g.AddNode so +// LadyBug-backed stores persist the result. +func EnrichGraphForBranch(g graph.Store, repoRoot, repoPrefix, branch string) (int, error) { if g == nil || repoRoot == "" { return 0, nil } - tags := ListTags(repoRoot) + tags := ListTagsOnBranch(repoRoot, branch) if len(tags) == 0 { return 0, nil } @@ -189,6 +220,12 @@ func EnrichGraphWithRepoPrefix(g *graph.Graph, repoRoot, repoPrefix string) (int n.Meta = map[string]any{} } n.Meta["added_in"] = tag + // Re-upsert so LadyBug-backed stores persist the Meta change. + // In-memory stores treat this as a no-op (the pointer is + // already in the graph); the disk-backed implementations need + // the AddNode call to round-trip Meta through their write + // path. Mirrors the churn enricher. + g.AddNode(n) enriched++ } return enriched, nil diff --git a/internal/resolver/backend_resolver.go b/internal/resolver/backend_resolver.go new file mode 100644 index 00000000..03e06f39 --- /dev/null +++ b/internal/resolver/backend_resolver.go @@ -0,0 +1,23 @@ +package resolver + +import ( + "os" + "strings" +) + +// backendResolverEnabled reports whether the resolver should consult +// graph.BackendResolver before running its Go-side worker pool. +// Default on for the ladybug-only daemon: the backend resolver runs +// one Cypher per rule rather than one round-trip per unresolved edge. +// With the multi-repo encoding exposing 100k+ `unresolved::*` edges +// at warmup, the per-edge Go path is the difference between a sub- +// 10-minute warmup and a hang / OOM. Set GORTEX_BACKEND_RESOLVER=0 +// to opt back out for the edge case where a small in-memory corpus +// can be heuristically resolved faster in RAM. +func backendResolverEnabled() bool { + v := os.Getenv("GORTEX_BACKEND_RESOLVER") + if v == "0" || strings.EqualFold(v, "false") { + return false + } + return true +} diff --git a/internal/resolver/bare_name_scope_bind.go b/internal/resolver/bare_name_scope_bind.go new file mode 100644 index 00000000..9f1a2822 --- /dev/null +++ b/internal/resolver/bare_name_scope_bind.go @@ -0,0 +1,195 @@ +package resolver + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// scopeNode is the per-binding payload of the owner-keyed scope +// index built by bindBareNameScopeRefs. Kept as a named struct so +// the bind helpers can share the same signature. +type scopeNode struct { + id string + name string + startLine int + kind graph.NodeKind +} + +// bindBareNameScopeRefs rewrites `unresolved::` edges whose +// source is inside a function scope (or IS a function) onto the +// matching KindLocal / KindParam node that the enclosing function +// declares. Pre-#77 there was nothing to bind to — locals were +// edge-endpoint-only — so the resolver always fell through to +// `unresolved::*`. With #77's KindLocal materialisation the scope is +// now first-class and we can do the bind. +// +// Two precedence rules govern the choice when more than one candidate +// matches the name: +// +// 1. KindLocal beats KindParam — Go shadowing semantics, a local +// declared with the same name as a parameter takes over from its +// declaration line onwards. +// 2. Among KindLocal candidates the most recently declared one before +// the reference line wins (the standard "last shadow in scope" +// rule). The edge's Line field is the reference site; we filter +// candidates to StartLine <= reference line and pick the maximum +// StartLine. +// +// Ambiguous cases that don't resolve to one winner (e.g. two locals +// with the same Name on the same StartLine, or no candidate before +// the reference line) are left untouched so the downstream `unresolved` +// audit can still surface them. +// +// Scope today is Go-only — TypeScript / Python don't materialise +// locals yet, so their unresolved bare-name edges have no candidate +// to bind to. The pass naturally degenerates to a no-op for those +// languages because the candidate index will be empty for their +// owners. +func (r *Resolver) bindBareNameScopeRefs() { + // Index every KindLocal / KindParam by enclosing-function ID. Done + // once up front so the per-edge bind is an O(matching-name) walk + // rather than a graph-wide FindNodesByName. + owned := map[string][]scopeNode{} + for n := range r.graph.NodesByKind(graph.KindLocal) { + owner := enclosingFunctionForBinding(n.ID) + if owner == "" { + continue + } + owned[owner] = append(owned[owner], scopeNode{ + id: n.ID, name: n.Name, startLine: n.StartLine, kind: graph.KindLocal, + }) + } + for n := range r.graph.NodesByKind(graph.KindParam) { + owner := enclosingFunctionForBinding(n.ID) + if owner == "" { + continue + } + owned[owner] = append(owned[owner], scopeNode{ + id: n.ID, name: n.Name, startLine: n.StartLine, kind: graph.KindParam, + }) + } + if len(owned) == 0 { + return + } + + var batch []graph.EdgeReindex + for e := range r.graph.EdgesByKind(graph.EdgeReads) { + if rewrote := r.tryBindBareName(e, owned); rewrote != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: rewrote}) + } + } + for e := range r.graph.EdgesByKind(graph.EdgeReferences) { + if rewrote := r.tryBindBareName(e, owned); rewrote != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: rewrote}) + } + } + // EdgeArgOf and EdgeValueFlow carry the same shape — `unresolved::` + // is the dataflow source/target the parser couldn't bind. + for e := range r.graph.EdgesByKind(graph.EdgeArgOf) { + if rewrote := r.tryBindBareName(e, owned); rewrote != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: rewrote}) + } + } + for e := range r.graph.EdgesByKind(graph.EdgeValueFlow) { + if rewrote := r.tryBindBareName(e, owned); rewrote != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: rewrote}) + } + } + if len(batch) > 0 { + r.graph.ReindexEdges(batch) + } +} + +// tryBindBareName tries to rewrite e.To from `unresolved::` to a +// matching in-scope KindLocal/KindParam ID. Returns the original To +// value when a rewrite happened (caller batches it for ReindexEdges) +// or "" when the edge was left alone. +func (r *Resolver) tryBindBareName(e *graph.Edge, owned map[string][]scopeNode) string { + if e == nil || !graph.IsUnresolvedTarget(e.To) { + return "" + } + name := graph.UnresolvedName(e.To) + if name == "" || strings.ContainsAny(name, ".*:#") { + // Not a bare identifier — leave to other passes (qualified + // names, *.method, etc.). + return "" + } + ownerID := enclosingFunctionForBinding(e.From) + if ownerID == "" { + return "" + } + candidates := owned[ownerID] + if len(candidates) == 0 { + return "" + } + chosen := pickInScopeBinding(candidates, name, e.Line) + if chosen == "" || chosen == e.To { + return "" + } + oldTo := e.To + e.To = chosen + return oldTo +} + +// pickInScopeBinding implements the precedence rules: +// - prefer KindLocal over KindParam (Go shadowing), +// - among KindLocal, pick the latest StartLine that's still <= refLine, +// - if multiple candidates match the same maximum StartLine, return "" +// (ambiguous — leave the edge unresolved so the audit surfaces it). +// +// owned is the per-owner scope-node slice; name is the bare identifier +// from the edge target; refLine is the edge's line (the reference +// site). Returns the chosen ID, or "" when no unambiguous winner. +func pickInScopeBinding(owned []scopeNode, name string, refLine int) string { + var bestLocal struct { + id string + line int + dups int + } + var paramID string + for _, c := range owned { + if c.name != name { + continue + } + if c.kind == graph.KindLocal { + if refLine > 0 && c.startLine > refLine { + // Declared after the reference — can't be bound here. + continue + } + switch { + case c.startLine > bestLocal.line: + bestLocal.id = c.id + bestLocal.line = c.startLine + bestLocal.dups = 0 + case c.startLine == bestLocal.line && c.id != bestLocal.id: + bestLocal.dups++ + } + } else if c.kind == graph.KindParam { + if paramID != "" && paramID != c.id { + // Two params with the same name in the same function + // shouldn't happen but defensive — abstain. + paramID = "" + } else { + paramID = c.id + } + } + } + if bestLocal.id != "" && bestLocal.dups == 0 { + return bestLocal.id + } + return paramID +} + +// enclosingFunctionForBinding strips the per-binding suffix added by +// the Go extractor (`#local:`, `#param:`, `#closure`, `#tparam:`) to +// recover the owner function/method ID. If `id` has no suffix it's +// returned unchanged — the caller is already a function/method node +// directly (the per-edge From is the function itself for things like +// the `external::foo` import edge inside `func Foo()`). +func enclosingFunctionForBinding(id string) string { + if i := strings.Index(id, "#"); i > 0 { + return id[:i] + } + return id +} diff --git a/internal/resolver/bare_name_scope_bind_test.go b/internal/resolver/bare_name_scope_bind_test.go new file mode 100644 index 00000000..98db3f6b --- /dev/null +++ b/internal/resolver/bare_name_scope_bind_test.go @@ -0,0 +1,200 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/zzet/gortex/internal/graph" +) + +// TestBindBareNameScopeRefs_LocalWins covers the headline case: a +// function declares a KindLocal `key1`; an EdgeReads to +// `unresolved::key1` originating from that function's body should be +// rewritten to point at the KindLocal node. +func TestBindBareNameScopeRefs_LocalWins(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + localID := owner + "#local:key1@+3" + g.AddNode(&graph.Node{ + ID: localID, Kind: graph.KindLocal, Name: "key1", + FilePath: "pkg/foo.go", StartLine: 3, EndLine: 3, Language: "go", + }) + g.AddEdge(&graph.Edge{From: localID, To: owner, Kind: graph.EdgeMemberOf, FilePath: "pkg/foo.go", Line: 3}) + + edge := &graph.Edge{ + From: owner, To: "unresolved::key1", + Kind: graph.EdgeReads, FilePath: "pkg/foo.go", Line: 5, + } + g.AddEdge(edge) + + r := New(g) + r.bindBareNameScopeRefs() + + assert.Equal(t, localID, edge.To, "EdgeReads must be rewritten to the in-scope KindLocal") +} + +// TestBindBareNameScopeRefs_FromBindingResolvesToOwner — the From of +// the edge is itself a per-binding ID (`#local:x@+N`); the +// pass should strip the suffix to recover the enclosing function and +// still bind correctly. +func TestBindBareNameScopeRefs_FromBindingResolvesToOwner(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + keyID := owner + "#local:key@+2" + g.AddNode(&graph.Node{ID: keyID, Kind: graph.KindLocal, Name: "key", FilePath: "pkg/foo.go", StartLine: 2, Language: "go"}) + g.AddEdge(&graph.Edge{From: keyID, To: owner, Kind: graph.EdgeMemberOf}) + + from := owner + "#local:out@+5" + g.AddNode(&graph.Node{ID: from, Kind: graph.KindLocal, Name: "out", FilePath: "pkg/foo.go", StartLine: 5, Language: "go"}) + g.AddEdge(&graph.Edge{From: from, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: from, To: "unresolved::key", Kind: graph.EdgeValueFlow, Line: 5} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, keyID, edge.To, "From with #local: suffix must still resolve via enclosing function") +} + +// TestBindBareNameScopeRefs_ParamFallback covers the Go-shadowing +// fallback: when no local matches, the parameter with the same name +// wins. +func TestBindBareNameScopeRefs_ParamFallback(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + paramID := owner + "#param:req" + g.AddNode(&graph.Node{ID: paramID, Kind: graph.KindParam, Name: "req", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: paramID, To: owner, Kind: graph.EdgeParamOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::req", Kind: graph.EdgeReads, Line: 3} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, paramID, edge.To, "no matching local — param with same name must take over") +} + +// TestBindBareNameScopeRefs_LocalShadowsParam — both a param and a +// local share the same name; the local wins (Go shadowing). +func TestBindBareNameScopeRefs_LocalShadowsParam(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + paramID := owner + "#param:x" + g.AddNode(&graph.Node{ID: paramID, Kind: graph.KindParam, Name: "x", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: paramID, To: owner, Kind: graph.EdgeParamOf}) + + localID := owner + "#local:x@+4" + g.AddNode(&graph.Node{ID: localID, Kind: graph.KindLocal, Name: "x", FilePath: "pkg/foo.go", StartLine: 4, Language: "go"}) + g.AddEdge(&graph.Edge{From: localID, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::x", Kind: graph.EdgeReads, Line: 6} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, localID, edge.To, "KindLocal must shadow KindParam with the same name") +} + +// TestBindBareNameScopeRefs_RefBeforeDeclLeftAlone — a reference +// whose line is BEFORE the local's StartLine can't possibly bind to +// that local. The pass must leave the edge unresolved rather than +// reach backwards. +func TestBindBareNameScopeRefs_RefBeforeDeclLeftAlone(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + localID := owner + "#local:tmp@+10" + g.AddNode(&graph.Node{ID: localID, Kind: graph.KindLocal, Name: "tmp", FilePath: "pkg/foo.go", StartLine: 10, Language: "go"}) + g.AddEdge(&graph.Edge{From: localID, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::tmp", Kind: graph.EdgeReads, Line: 3} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, "unresolved::tmp", edge.To, "reference before declaration must not bind") +} + +// TestBindBareNameScopeRefs_LatestShadowWins covers the standard "last +// shadow in scope" rule when two locals share a name across scopes: +// the binding declared on the higher line (closer to the reference) +// wins. +func TestBindBareNameScopeRefs_LatestShadowWins(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + earlier := owner + "#local:err@+2" + g.AddNode(&graph.Node{ID: earlier, Kind: graph.KindLocal, Name: "err", FilePath: "pkg/foo.go", StartLine: 2, Language: "go"}) + g.AddEdge(&graph.Edge{From: earlier, To: owner, Kind: graph.EdgeMemberOf}) + + later := owner + "#local:err@+8" + g.AddNode(&graph.Node{ID: later, Kind: graph.KindLocal, Name: "err", FilePath: "pkg/foo.go", StartLine: 8, Language: "go"}) + g.AddEdge(&graph.Edge{From: later, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::err", Kind: graph.EdgeReads, Line: 12} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, later, edge.To, "the most recent shadow before the reference line must win") +} + +// TestBindBareNameScopeRefs_AmbiguousLeftAlone — two locals with the +// same name declared on the same line (shouldn't happen in valid Go +// but defensive): the pass must leave the edge unresolved rather +// than pick an arbitrary winner. +func TestBindBareNameScopeRefs_AmbiguousLeftAlone(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + a := owner + "#local:err@+5" + b := owner + "#local:err@+5#1" + g.AddNode(&graph.Node{ID: a, Kind: graph.KindLocal, Name: "err", FilePath: "pkg/foo.go", StartLine: 5, Language: "go"}) + g.AddNode(&graph.Node{ID: b, Kind: graph.KindLocal, Name: "err", FilePath: "pkg/foo.go", StartLine: 5, Language: "go"}) + g.AddEdge(&graph.Edge{From: a, To: owner, Kind: graph.EdgeMemberOf}) + g.AddEdge(&graph.Edge{From: b, To: owner, Kind: graph.EdgeMemberOf}) + + edge := &graph.Edge{From: owner, To: "unresolved::err", Kind: graph.EdgeReads, Line: 7} + g.AddEdge(edge) + + New(g).bindBareNameScopeRefs() + assert.Equal(t, "unresolved::err", edge.To, "ambiguous candidates on same line must leave the edge unresolved") +} + +// TestBindBareNameScopeRefs_QualifiedNotTouched ensures the pass only +// fires on bare names — qualified shapes (`*.Method`, `pkg.Name`, +// `unresolved::pyrel::...`) are left to other passes. +func TestBindBareNameScopeRefs_QualifiedNotTouched(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + // Even if a local matches the unqualified part, the qualified + // shapes must be left alone. + g.AddNode(&graph.Node{ID: owner + "#local:Foo@+2", Kind: graph.KindLocal, Name: "Foo", FilePath: "pkg/foo.go", StartLine: 2, Language: "go"}) + g.AddEdge(&graph.Edge{From: owner + "#local:Foo@+2", To: owner, Kind: graph.EdgeMemberOf}) + + keep := []*graph.Edge{ + {From: owner, To: "unresolved::*.Foo", Kind: graph.EdgeReads, Line: 5}, + {From: owner, To: "unresolved::pkg.Foo", Kind: graph.EdgeReads, Line: 6}, + {From: owner, To: "unresolved::pyrel::./foo", Kind: graph.EdgeReads, Line: 7}, + } + for _, e := range keep { + g.AddEdge(e) + } + + New(g).bindBareNameScopeRefs() + for _, e := range keep { + assert.True(t, + e.To == "unresolved::*.Foo" || e.To == "unresolved::pkg.Foo" || e.To == "unresolved::pyrel::./foo", + "qualified shape %q must stay untouched", e.To, + ) + } +} diff --git a/internal/resolver/bench_test.go b/internal/resolver/bench_test.go index bbce2a3e..8ea93f6a 100644 --- a/internal/resolver/bench_test.go +++ b/internal/resolver/bench_test.go @@ -8,7 +8,7 @@ import ( ) // buildResolverGraph creates a graph with unresolved edges for benchmarking. -func buildResolverGraph(files, symsPerFile int) (*graph.Graph, *Resolver) { +func buildResolverGraph(files, symsPerFile int) (graph.Store, *Resolver) { g := graph.New() // Create file nodes with functions, types, and methods. diff --git a/internal/resolver/concurrent_test.go b/internal/resolver/concurrent_test.go index 682f33c1..b06ee542 100644 --- a/internal/resolver/concurrent_test.go +++ b/internal/resolver/concurrent_test.go @@ -98,7 +98,7 @@ func TestResolver_CrossRepoResolver_SerializeOnGraph(t *testing.T) { // one unresolved edge so the resolver actually has work to do during // the race test. The shape doesn't matter — only that buildDirIndexes // observes >0 file nodes and the resolveEdge inner loop runs. -func buildSmallGraph(t *testing.T) *graph.Graph { +func buildSmallGraph(t *testing.T) graph.Store { t.Helper() g := graph.New() for _, fp := range []string{"repo-a/lib/a.go", "repo-a/lib/b.go", "repo-b/main.go"} { diff --git a/internal/resolver/cross_pkg_call_guard_test.go b/internal/resolver/cross_pkg_call_guard_test.go index db98107c..080e8095 100644 --- a/internal/resolver/cross_pkg_call_guard_test.go +++ b/internal/resolver/cross_pkg_call_guard_test.go @@ -14,7 +14,7 @@ import ( // faithful end-to-end harness for the resolver tests below: a real // extractor produces the unresolved edges, then ResolveAll runs against // them exactly as it does on a live index. -func buildGraphFromSources(t *testing.T, files map[string]string) *graph.Graph { +func buildGraphFromSources(t *testing.T, files map[string]string) graph.Store { t.Helper() g := graph.New() ts := languages.NewTypeScriptExtractor() @@ -50,7 +50,7 @@ func buildGraphFromSources(t *testing.T, files map[string]string) *graph.Graph { // callEdgeTo returns the resolved To-end of the call/reference edge that // leaves fromID at the given 1-based line. Empty string when no such // edge exists. -func callEdgeTo(g *graph.Graph, fromID string, line int) string { +func callEdgeTo(g graph.Store, fromID string, line int) string { for _, e := range g.GetOutEdges(fromID) { if (e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences) && e.Line == line { return e.To diff --git a/internal/resolver/cross_pkg_guard.go b/internal/resolver/cross_pkg_guard.go index 060651ed..d4591772 100644 --- a/internal/resolver/cross_pkg_guard.go +++ b/internal/resolver/cross_pkg_guard.go @@ -44,7 +44,13 @@ func (r *Resolver) guardCrossPackageCallEdges(jobs []reindexJob, closure map[str if len(jobs) == 0 { return 0 } - reverted := 0 + // Collect both mutation lists across the whole pass and apply them + // via the batched Store methods at the end. Per-edge + // SetEdgeProvenance + ReindexEdge in the body would otherwise pay + // two ACID round-trips per reverted edge against disk backends — + // catastrophic on a 30k-job pass. + var provBatch []graph.EdgeProvenanceUpdate + var reindexBatch []graph.EdgeReindex for i := range jobs { j := &jobs[i] if !isCallLikeEdge(j.kind) { @@ -71,7 +77,7 @@ func (r *Resolver) guardCrossPackageCallEdges(jobs []reindexJob, closure map[str continue } callerFile := r.edgeCallerFile(j.edge) - target := r.graph.GetNode(j.newTo) + target := r.cachedGetNode(j.newTo) if callerFile == "" || target == nil { continue } @@ -80,19 +86,24 @@ func (r *Resolver) guardCrossPackageCallEdges(jobs []reindexJob, closure map[str } // Not reachable — revert to the unresolved placeholder and // re-index against the resolved target we are abandoning. - // Drop the resolution provenance through SetEdgeProvenance so - // the reverted edge's identity change is counted; the logical - // key still carries the resolved target at this point, which - // is fine — SetEdgeProvenance keys the revision on Origin - // alone. The target revert + re-bucket follows. + // SetEdgeProvenance("") drops the resolution provenance so + // the reverted edge's identity change is counted; the target + // revert + re-bucket follows. Both go in their respective + // batches so the whole pass commits in two chunks instead of + // 2×N per-edge transactions. oldResolved := j.edge.To - r.graph.SetEdgeProvenance(j.edge, "") + provBatch = append(provBatch, graph.EdgeProvenanceUpdate{Edge: j.edge, NewOrigin: ""}) j.edge.To = j.oldTo j.edge.Confidence = 0 - r.graph.ReindexEdge(j.edge, oldResolved) - reverted++ + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: j.edge, OldTo: oldResolved}) } - return reverted + if len(provBatch) > 0 { + r.graph.SetEdgeProvenanceBatch(provBatch) + } + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) + } + return len(reindexBatch) } // isBareNameCallTarget reports whether an unresolved edge target is a @@ -127,8 +138,13 @@ func isCallLikeEdge(k graph.EdgeKind) bool { // edgeCallerFile returns the file path of the node that owns the edge's // From end. Empty when the caller node is unknown. +// +// Hot path: called once per cross-package-guarded edge. The pre-warmed +// per-pass cache populated in ResolveAll holds every From ID across the +// pending slice, so this call is a map lookup during a ResolveAll pass +// and a direct store call elsewhere. func (r *Resolver) edgeCallerFile(e *graph.Edge) string { - if n := r.graph.GetNode(e.From); n != nil && n.FilePath != "" { + if n := r.cachedGetNode(e.From); n != nil && n.FilePath != "" { return n.FilePath } return e.FilePath @@ -180,21 +196,18 @@ func (r *Resolver) buildImportClosure() map[string]map[string]struct{} { } set[dir] = struct{}{} } - for _, n := range r.graph.AllNodes() { - if n.Kind == graph.KindFile && n.FilePath != "" { + for n := range r.graph.NodesByKind(graph.KindFile) { + if n.FilePath != "" { add(n.FilePath, filepath.Dir(n.FilePath)) } } - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } + for e := range r.graph.EdgesByKind(graph.EdgeImports) { // Skip imports still pointing at an unresolved placeholder or an // out-of-repo stub — neither names an in-repo directory that a // name-only call candidate could legitimately live in. if strings.HasPrefix(e.To, unresolvedPrefix) || strings.HasPrefix(e.To, "external::") || - strings.HasPrefix(e.To, "stdlib::") || + graph.IsStdlibStub(e.To) || strings.HasPrefix(e.To, "dep::") { continue } diff --git a/internal/resolver/cross_repo.go b/internal/resolver/cross_repo.go index 87edf078..344f2388 100644 --- a/internal/resolver/cross_repo.go +++ b/internal/resolver/cross_repo.go @@ -62,7 +62,7 @@ type CrossWorkspaceDepLookup func(sourceWorkspaceID string) []CrossWorkspaceDepR // the target workspace via `cross_workspace_deps` AND, for import // edges, the import path has a declared-module prefix. type CrossRepoResolver struct { - graph *graph.Graph + graph graph.Store dirIndex map[string][]*graph.Node lastDirIndex map[string][]*graph.Node // reachableReposByFile maps a caller file's ID to the set of repo @@ -98,7 +98,7 @@ type CrossRepoResolver struct { } // NewCrossRepo creates a CrossRepoResolver for the given graph. -func NewCrossRepo(g *graph.Graph) *CrossRepoResolver { +func NewCrossRepo(g graph.Store) *CrossRepoResolver { return &CrossRepoResolver{graph: g, mu: g.ResolveMutex()} } @@ -187,12 +187,15 @@ func (cr *CrossRepoResolver) ResolveAll() *CrossRepoStats { stats := &CrossRepoStats{ByRepo: make(map[string]int)} - edges := cr.graph.AllEdges() - for _, e := range edges { - if !strings.HasPrefix(e.To, unresolvedPrefix) { - continue - } - cr.resolveEdge(e, stats) + // Predicate-shaped read: disk backends only enumerate the + // "unresolved::*" slice (the only one this pass mutates). Batch + // mutations to commit in chunks at the end. + var reindexBatch []graph.EdgeReindex + for e := range cr.graph.EdgesWithUnresolvedTarget() { + cr.resolveEdge(e, stats, &reindexBatch) + } + if len(reindexBatch) > 0 { + cr.graph.ReindexEdges(reindexBatch) } // Materialise the cross_repo_* edge layer over the freshly lifted // calls / implements / extends edges. @@ -215,15 +218,21 @@ func (cr *CrossRepoResolver) ResolveForRepo(repoPrefix string) *CrossRepoStats { stats := &CrossRepoStats{ByRepo: make(map[string]int)} - nodes := cr.graph.GetRepoNodes(repoPrefix) - for _, n := range nodes { - edges := cr.graph.GetOutEdges(n.ID) - for _, e := range edges { - if !strings.HasPrefix(e.To, unresolvedPrefix) { - continue - } - cr.resolveEdge(e, stats) + var reindexBatch []graph.EdgeReindex + // One backend query for every out-edge from this repo's nodes, + // instead of GetRepoNodes followed by GetOutEdges per node. On + // disk backends (Ladybug, SQLite, DuckDB) the per-node loop + // was O(repo_nodes) round-trips per pass — single-digit minutes + // of warmup on a multi-repo workspace where this method runs + // once per tracked repo. + for _, e := range cr.graph.GetRepoEdges(repoPrefix) { + if !strings.HasPrefix(e.To, unresolvedPrefix) { + continue } + cr.resolveEdge(e, stats, &reindexBatch) + } + if len(reindexBatch) > 0 { + cr.graph.ReindexEdges(reindexBatch) } // Materialise the cross_repo_* edge layer. The pass is graph-wide // (cheap relative to a resolve pass) so an edge into repoPrefix @@ -246,13 +255,9 @@ func (cr *CrossRepoResolver) ResolveForRepo(repoPrefix string) *CrossRepoStats { // These maps are torn down via clearDirIndexes when the pass completes // so we don't keep ~N pointers alive between resolves. func (cr *CrossRepoResolver) buildDirIndexes() { - nodes := cr.graph.AllNodes() - cr.dirIndex = make(map[string][]*graph.Node, len(nodes)/4) - cr.lastDirIndex = make(map[string][]*graph.Node, len(nodes)/4) - for _, n := range nodes { - if n.Kind != graph.KindFile { - continue - } + cr.dirIndex = make(map[string][]*graph.Node, 128) + cr.lastDirIndex = make(map[string][]*graph.Node, 128) + for n := range cr.graph.NodesByKind(graph.KindFile) { dir := filepath.Dir(n.FilePath) cr.dirIndex[dir] = append(cr.dirIndex[dir], n) last := lastPathComponent(dir) @@ -267,12 +272,8 @@ func (cr *CrossRepoResolver) buildDirIndexes() { // by callerRepo, so the same dep node reachable here is the one in the // importing file's own go.mod. func (cr *CrossRepoResolver) buildDepModuleIndex() { - nodes := cr.graph.AllNodes() by := make(map[string][]depModuleEntry) - for _, n := range nodes { - if n.Kind != graph.KindContract { - continue - } + for n := range cr.graph.NodesByKind(graph.KindContract) { if !strings.HasPrefix(n.ID, "dep::") { continue } @@ -324,10 +325,7 @@ func (cr *CrossRepoResolver) clearDirIndexes() { // graph is settled enough to be trustworthy evidence. func (cr *CrossRepoResolver) buildReachableReposIndex() { idx := make(map[string]map[string]struct{}) - for _, e := range cr.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } + for e := range cr.graph.EdgesByKind(graph.EdgeImports) { // Only resolved imports carry evidence — an unresolved import // target tells us nothing about which repo the caller reaches. to := cr.graph.GetNode(e.To) @@ -387,7 +385,13 @@ func (cr *CrossRepoResolver) callerFileID(e *graph.Edge) string { return e.FilePath } -func (cr *CrossRepoResolver) resolveEdge(e *graph.Edge, stats *CrossRepoStats) { +// resolveEdge dispatches one unresolved edge through the cross-repo +// resolution paths and, when the resolution lifted the To target, +// appends a re-bind job to batch instead of committing a per-edge +// ReindexEdge transaction. The caller flushes the accumulated batch +// after the whole pass via ReindexEdges so disk backends amortise +// the commit cost. +func (cr *CrossRepoResolver) resolveEdge(e *graph.Edge, stats *CrossRepoStats, batch *[]graph.EdgeReindex) { oldTo := e.To target := strings.TrimPrefix(e.To, unresolvedPrefix) @@ -410,7 +414,7 @@ func (cr *CrossRepoResolver) resolveEdge(e *graph.Edge, stats *CrossRepoStats) { } if e.To != oldTo { - cr.graph.ReindexEdge(e, oldTo) + *batch = append(*batch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) } } @@ -563,10 +567,7 @@ func (cr *CrossRepoResolver) resolveImport(e *graph.Edge, importPath string, sta } } } else { - for _, n := range cr.graph.AllNodes() { - if n.Kind != graph.KindFile { - continue - } + for n := range cr.graph.NodesByKind(graph.KindFile) { dir := filepath.Dir(n.FilePath) if strings.HasSuffix(dir, lastPathComponent(importPath)) || dir == importPath { consider(n) diff --git a/internal/resolver/cross_repo_edges.go b/internal/resolver/cross_repo_edges.go index aafaedcd..e3382ad3 100644 --- a/internal/resolver/cross_repo_edges.go +++ b/internal/resolver/cross_repo_edges.go @@ -25,12 +25,13 @@ import "github.com/zzet/gortex/internal/graph" // // Returns the count of cross-repo relationships found this pass — the // number of parallel edges that exist after it, modulo graph dedup. -func DetectCrossRepoEdges(g *graph.Graph) int { +func DetectCrossRepoEdges(g graph.Store) int { if g == nil { return 0 } emitted := 0 - for _, e := range g.AllEdges() { + for _, row := range crossRepoCandidates(g) { + e := row.Edge if e == nil { continue } @@ -38,21 +39,6 @@ func DetectCrossRepoEdges(g *graph.Graph) int { if !ok { continue } - from := g.GetNode(e.From) - to := g.GetNode(e.To) - if from == nil || to == nil { - // Unresolved / external / stdlib / dep stub targets never - // have a graph node — they cannot be cross-repo. - continue - } - if from.RepoPrefix == "" || to.RepoPrefix == "" { - // Single-repo graph (no prefixes) — nothing crosses a - // boundary. Also covers a node whose repo wasn't stamped. - continue - } - if from.RepoPrefix == to.RepoPrefix { - continue - } // Keep the bool flag on the base edge consistent with the // dedicated kind — existing consumers (smart_context's // cross_repo_dependencies, the Cypher / GraphML exporters) read @@ -71,11 +57,62 @@ func DetectCrossRepoEdges(g *graph.Graph) int { CrossRepo: true, Meta: map[string]any{ "base_kind": string(e.Kind), - "source_repo": from.RepoPrefix, - "target_repo": to.RepoPrefix, + "source_repo": row.FromRepo, + "target_repo": row.ToRepo, }, }) emitted++ } return emitted } + +// crossRepoCandidates returns every edge whose Kind has a parallel +// cross_repo_* kind AND whose endpoints carry two distinct, non-empty +// RepoPrefix values. Routed through the storage layer's +// CrossRepoCandidates capability when the backend implements it (one +// Cypher join with the kind + repo-prefix filters in WHERE); falls +// back to the AllEdges + per-edge GetNode walk otherwise. +// +// The base-kind set is derived from graph.CrossRepoKindFor by +// iterating the in-process registry — the disk backend uses the same +// kind list verbatim so single-repo graphs return no rows without a +// whole-table scan. +func crossRepoCandidates(g graph.Store) []graph.CrossRepoCandidateRow { + baseKinds := graph.BaseKindsForCrossRepo() + if cap, ok := g.(graph.CrossRepoCandidates); ok { + return cap.CrossRepoCandidates(baseKinds) + } + if len(baseKinds) == 0 { + return nil + } + kset := make(map[graph.EdgeKind]struct{}, len(baseKinds)) + for _, k := range baseKinds { + kset[k] = struct{}{} + } + var out []graph.CrossRepoCandidateRow + for _, e := range g.AllEdges() { + if e == nil { + continue + } + if _, ok := kset[e.Kind]; !ok { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.RepoPrefix == "" || to.RepoPrefix == "" { + continue + } + if from.RepoPrefix == to.RepoPrefix { + continue + } + out = append(out, graph.CrossRepoCandidateRow{ + Edge: e, + FromRepo: from.RepoPrefix, + ToRepo: to.RepoPrefix, + }) + } + return out +} diff --git a/internal/resolver/cross_repo_edges_test.go b/internal/resolver/cross_repo_edges_test.go index 51e7961d..fac8519c 100644 --- a/internal/resolver/cross_repo_edges_test.go +++ b/internal/resolver/cross_repo_edges_test.go @@ -9,7 +9,7 @@ import ( // countOutEdgesByKind returns how many out-edges of the given kind the // node fromID has. -func countOutEdgesByKind(g *graph.Graph, fromID string, kind graph.EdgeKind) int { +func countOutEdgesByKind(g graph.Store, fromID string, kind graph.EdgeKind) int { n := 0 for _, e := range g.GetOutEdges(fromID) { if e.Kind == kind { @@ -21,7 +21,7 @@ func countOutEdgesByKind(g *graph.Graph, fromID string, kind graph.EdgeKind) int // firstOutEdgeByKind returns the first out-edge of fromID with the given // kind, or nil. -func firstOutEdgeByKind(g *graph.Graph, fromID string, kind graph.EdgeKind) *graph.Edge { +func firstOutEdgeByKind(g graph.Store, fromID string, kind graph.EdgeKind) *graph.Edge { for _, e := range g.GetOutEdges(fromID) { if e.Kind == kind { return e diff --git a/internal/resolver/cross_repo_test.go b/internal/resolver/cross_repo_test.go index cba906ff..b4d3407a 100644 --- a/internal/resolver/cross_repo_test.go +++ b/internal/resolver/cross_repo_test.go @@ -18,7 +18,7 @@ import ( // without it, a bare name like `Helper` could land on any repo that // happens to define a `Helper`, which is the exact name-collision // false-positive class this guards against. -func wireImport(g *graph.Graph, callerFile, targetRepo, targetFile string) { +func wireImport(g graph.Store, callerFile, targetRepo, targetFile string) { g.AddNode(&graph.Node{ ID: targetFile, Kind: graph.KindFile, Name: targetFile, FilePath: targetFile, Language: "go", RepoPrefix: targetRepo, diff --git a/internal/resolver/dep_module_test.go b/internal/resolver/dep_module_test.go index 54cc998f..511be7d2 100644 --- a/internal/resolver/dep_module_test.go +++ b/internal/resolver/dep_module_test.go @@ -10,7 +10,7 @@ import ( // addDepNode is a tiny helper to materialise a dep:: contract // node the way GoModExtractor + commitInlinedContractToGraph would. -func addDepNode(t *testing.T, g *graph.Graph, repoPrefix, modulePath string) { +func addDepNode(t *testing.T, g graph.Store, repoPrefix, modulePath string) { t.Helper() g.AddNode(&graph.Node{ ID: "dep::" + modulePath, diff --git a/internal/resolver/external_call_attribution.go b/internal/resolver/external_call_attribution.go new file mode 100644 index 00000000..a4c0584b --- /dev/null +++ b/internal/resolver/external_call_attribution.go @@ -0,0 +1,216 @@ +package resolver + +import ( + "path" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// attributeGoExternalCalls materialises a KindFunction node for every +// unique `stdlib::::` / `dep::::` +// / `external::::` edge target, plus a KindModule +// parent for each owning import path. Without this pass the targets +// are stubs in storage backends that enforce rel-table FK (Ladybug) +// and invisible nodes in the in-memory backend, so a query like +// `find_usages(stdlib::encoding/json::Marshal)` +// can't surface "every function in this codebase that calls +// json.Marshal" — the destination doesn't exist as a graph node. +// +// Mirrors the Python / Dart attributeNonGoModuleImports pass for Go. +// Runs after resolveExtern (which classifies extern targets into the +// three prefix buckets) so we materialise the post-classification +// state rather than the pre-classification `unresolved::extern::*` +// shape. +// +// ID conventions: +// - Module node: `module::go:` — shared across every +// repo that imports the same path. Carries +// Meta["ecosystem"]="go" and Meta["import_path"]=. +// Meta["role"]="stdlib" for stdlib paths. +// - Symbol node: the original `stdlib::*` / `dep::*` / +// `external::*` ID stays the symbol's ID so existing edges land +// on it without rewriting. Carries Meta["external"]=true and +// Meta["module_path"]=. +// - EdgeMemberOf: symbol → module so `get_callers` on the module +// surfaces every symbol used from that package. +// +// All AddNode / AddEdge calls are idempotent on ID, so a second run +// of this pass (incremental ResolveFile re-invocation) is a no-op. +func (r *Resolver) attributeGoExternalCalls() { + // Scan every edge whose target sits in one of the three external + // prefixes. Collect unique (repoPrefix, prefix, importPath, symbol) + // tuples so we materialise each one once even when many edges + // reference the same target. repoPrefix is included because + // stdlib stubs are per-repo (see internal/graph/stub.go) — two + // repos on different Go SDK versions emit semantically distinct + // `::stdlib::fmt::Errorf` and `::stdlib::fmt::Errorf` + // stubs that MUST round-trip through this attribution pass as + // distinct nodes, not collide into one. + type extKey struct { + repoPrefix, prefix, importPath, symbol string + } + seen := map[extKey]struct{}{} + depEdgesScan := func(kind graph.EdgeKind) { + for e := range r.graph.EdgesByKind(kind) { + if e.To == "" { + continue + } + prefix, importPath, symbol := splitGoExternalTarget(e.To) + if prefix == "" { + continue + } + seen[extKey{graph.StubRepoPrefix(e.To), prefix, importPath, symbol}] = struct{}{} + } + } + // Same edge-kind set as attributeGoBuiltins — anywhere an + // extern-prefixed target can show up. + for _, k := range []graph.EdgeKind{ + graph.EdgeCalls, + graph.EdgeReferences, + graph.EdgeReads, + graph.EdgeArgOf, + graph.EdgeValueFlow, + graph.EdgeReturnsTo, + graph.EdgeTypedAs, + graph.EdgeReturns, + graph.EdgeInstantiates, + graph.EdgeCaptures, + graph.EdgeThrows, + } { + depEdgesScan(k) + } + if len(seen) == 0 { + return + } + + // Materialise the parent KindModule for each unique import path, + // then the per-symbol KindFunction. Module-side dedupe is via + // the `modules` map; the per-symbol nodes are unique by (prefix, + // path, symbol) by construction. + // Module IDs are also per-repo now — a module node carries the + // same SDK-version sensitivity its symbols do. Key includes the + // repo prefix so two repos importing the same path get distinct + // module nodes. + type modKey struct{ repoPrefix, importPath string } + modules := map[modKey]string{} + for k := range seen { + modKey := modKey{repoPrefix: k.repoPrefix, importPath: k.importPath} + moduleID, ok := modules[modKey] + if !ok { + // Ecosystem + path are ONE stub segment joined by a single + // colon (`go:`), matching the npm convention + // (`module::npm:`) and every module-id consumer + // (tools_analyze_external_calls). Passing them as two + // StubID parts would emit `module::go::` (double + // colon) — the form that broke the attribution tests. + moduleID = graph.StubID(k.repoPrefix, graph.StubKindModule, "go:"+k.importPath) + modules[modKey] = moduleID + role := "external" + switch k.prefix { + case "stdlib::": + role = "stdlib" + case "dep::": + role = "dep" + } + r.graph.AddNode(&graph.Node{ + ID: moduleID, + Kind: graph.KindModule, + Name: lastImportSegment(k.importPath), + Language: "go", + Meta: map[string]any{ + "ecosystem": "go", + "role": role, + "import_path": k.importPath, + }, + }) + } + var symbolID string + switch k.prefix { + case "stdlib::": + symbolID = graph.StubID(k.repoPrefix, graph.StubKindStdlib, k.importPath, k.symbol) + default: + // dep:: / external:: keep their legacy unprefixed form for + // now — they aren't covered by the stub-prefix migration + // (different module paths already provide repo-level + // distinction; same version pinning is enforced by go.mod + // per-repo). + symbolID = k.prefix + k.importPath + "::" + k.symbol + } + r.graph.AddNode(&graph.Node{ + ID: symbolID, + Kind: graph.KindFunction, + Name: k.symbol, + Language: "go", + Meta: map[string]any{ + "external": true, + "module_path": k.importPath, + "module_role": map[string]string{ + "stdlib::": "stdlib", + "dep::": "dep", + "external::": "external", + }[k.prefix], + }, + }) + // EdgeMemberOf: symbol → module. AddEdge is idempotent on the + // edge-key tuple so a re-run doesn't duplicate. + r.graph.AddEdge(&graph.Edge{ + From: symbolID, + To: moduleID, + Kind: graph.EdgeMemberOf, + Origin: graph.OriginASTResolved, + }) + } +} + +// splitGoExternalTarget recognises the three external-target prefixes +// the resolver emits after resolveExtern. Returns the prefix +// (`stdlib::` / `dep::` / `external::`), the import path, and the +// symbol name. Returns ("", "", "") for any other shape so the pass +// can skip it cleanly. +// +// The stdlib case is matched via graph.IsStdlibStub so both the +// legacy `stdlib::fmt::Errorf` shape and the per-repo-prefixed +// `::stdlib::fmt::Errorf` shape (see internal/graph/stub.go) +// route the same way. The returned bucket label stays `stdlib::` for +// downstream `k.prefix == "stdlib::"` comparisons. +func splitGoExternalTarget(target string) (prefix, importPath, symbol string) { + var body string + switch { + case graph.IsStdlibStub(target): + prefix = "stdlib::" + body = graph.StubRest(target) + case strings.HasPrefix(target, "dep::"): + prefix = "dep::" + body = strings.TrimPrefix(target, prefix) + case strings.HasPrefix(target, "external::"): + prefix = "external::" + body = strings.TrimPrefix(target, prefix) + default: + return "", "", "" + } + // The body shape produced by resolveExtern is + // `::`. Split on the LAST `::` because import + // paths can include slashes but not `::`, so the rightmost + // separator is always between path and symbol. + sep := strings.LastIndex(body, "::") + if sep < 0 { + // `external::os` style (just the package, no symbol — + // the resolveImport path). Treat the whole body as the path + // and leave symbol empty so we still materialise the module + // node but skip the symbol. + return prefix, body, "" + } + return prefix, body[:sep], body[sep+2:] +} + +// lastImportSegment returns the rightmost path component, used as +// the human-readable Name on the KindModule node. For +// `github.com/stretchr/testify/assert` the segment is `assert`; for +// `encoding/json` it's `json`; for `fmt` it's `fmt`. +func lastImportSegment(importPath string) string { + if importPath == "" { + return "" + } + return path.Base(importPath) +} diff --git a/internal/resolver/external_call_attribution_test.go b/internal/resolver/external_call_attribution_test.go new file mode 100644 index 00000000..473722fd --- /dev/null +++ b/internal/resolver/external_call_attribution_test.go @@ -0,0 +1,141 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +func TestAttributeGoExternalCalls_StdlibFunctionMaterialised(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + // Post-resolveExtern shape: an edge directly to stdlib::fmt::Sprintf. + edge := &graph.Edge{From: owner, To: "stdlib::fmt::Sprintf", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 5} + g.AddEdge(edge) + + New(g).attributeGoExternalCalls() + + // The symbol becomes a KindFunction with the right metadata. + sym := g.GetNode("stdlib::fmt::Sprintf") + require.NotNil(t, sym, "stdlib symbol must be materialised as a node") + assert.Equal(t, graph.KindFunction, sym.Kind) + assert.Equal(t, "Sprintf", sym.Name) + assert.Equal(t, "go", sym.Language) + assert.Equal(t, true, sym.Meta["external"]) + assert.Equal(t, "fmt", sym.Meta["module_path"]) + assert.Equal(t, "stdlib", sym.Meta["module_role"]) + + // And a KindModule parent under module::go:fmt with role=stdlib. + mod := g.GetNode("module::go:fmt") + require.NotNil(t, mod, "module parent must be materialised") + assert.Equal(t, graph.KindModule, mod.Kind) + assert.Equal(t, "fmt", mod.Name) + assert.Equal(t, "stdlib", mod.Meta["role"]) + assert.Equal(t, "go", mod.Meta["ecosystem"]) + + // EdgeMemberOf: symbol -> module. + var foundLink bool + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + if e.From == "stdlib::fmt::Sprintf" && e.To == "module::go:fmt" { + foundLink = true + } + } + assert.True(t, foundLink, "symbol must be linked to its module via EdgeMemberOf") +} + +func TestAttributeGoExternalCalls_DepUsesFullImportPath(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: owner, To: "dep::github.com/stretchr/testify/assert::True", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 7}) + + New(g).attributeGoExternalCalls() + + sym := g.GetNode("dep::github.com/stretchr/testify/assert::True") + require.NotNil(t, sym) + assert.Equal(t, "True", sym.Name) + assert.Equal(t, "github.com/stretchr/testify/assert", sym.Meta["module_path"]) + assert.Equal(t, "dep", sym.Meta["module_role"]) + + mod := g.GetNode("module::go:github.com/stretchr/testify/assert") + require.NotNil(t, mod) + assert.Equal(t, "assert", mod.Name, "module name must be the last path segment, not the full import path") + assert.Equal(t, "dep", mod.Meta["role"]) +} + +func TestAttributeGoExternalCalls_ModuleNodeSharedAcrossSymbols(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + // Three different functions from the same stdlib package — all + // should attach to ONE module node, not three. + for _, sym := range []string{"Marshal", "Unmarshal", "RawMessage"} { + g.AddEdge(&graph.Edge{ + From: owner, To: "stdlib::encoding/json::" + sym, + Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 1, + }) + } + + New(g).attributeGoExternalCalls() + + count := 0 + for n := range g.NodesByKind(graph.KindModule) { + if n.ID == "module::go:encoding/json" { + count++ + } + } + assert.Equal(t, 1, count, "exactly one KindModule per import path") +} + +func TestAttributeGoExternalCalls_IdempotentOnRerun(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: owner, To: "stdlib::os::Open", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 1}) + + r := New(g) + r.attributeGoExternalCalls() + r.attributeGoExternalCalls() // second run must not duplicate + + syms := 0 + for n := range g.NodesByKind(graph.KindFunction) { + if n.ID == "stdlib::os::Open" { + syms++ + } + } + assert.Equal(t, 1, syms, "second pass must not duplicate the symbol node") + + memberEdges := 0 + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + if e.From == "stdlib::os::Open" && e.To == "module::go:os" { + memberEdges++ + } + } + assert.Equal(t, 1, memberEdges, "second pass must not duplicate the membership edge") +} + +func TestAttributeGoExternalCalls_NonExternEdgesIgnored(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + // Real intra-repo call — must not be touched. + g.AddNode(&graph.Node{ID: "pkg/bar.go::Helper", Kind: graph.KindFunction, Name: "Helper", FilePath: "pkg/bar.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: owner, To: "pkg/bar.go::Helper", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 1}) + // And an unresolved bare name — also not in scope for this pass. + g.AddEdge(&graph.Edge{From: owner, To: "unresolved::doSomething", Kind: graph.EdgeCalls, FilePath: "pkg/foo.go", Line: 2}) + + before := []string{} + for n := range g.NodesByKind(graph.KindModule) { + before = append(before, n.ID) + } + New(g).attributeGoExternalCalls() + after := []string{} + for n := range g.NodesByKind(graph.KindModule) { + after = append(after, n.ID) + } + assert.Equal(t, before, after, "no module nodes should be created when there are no extern-prefixed targets") +} diff --git a/internal/resolver/external_calls.go b/internal/resolver/external_calls.go index d776c8e5..b953a3d2 100644 --- a/internal/resolver/external_calls.go +++ b/internal/resolver/external_calls.go @@ -67,7 +67,7 @@ const externalCallPrefix = "external-call::" // the external hop visible. Enabled is the opt-in gate // (`.gortex.yaml::index::synthesize_external_calls`); when false the // pass is a no-op and the graph is untouched. -func SynthesizeExternalCalls(g *graph.Graph, enabled bool) int { +func SynthesizeExternalCalls(g graph.Store, enabled bool) int { if g == nil || !enabled { return 0 } @@ -81,8 +81,20 @@ func SynthesizeExternalCalls(g *graph.Graph, enabled bool) int { defer mu.Unlock() synthesized := 0 - for _, e := range g.AllEdges() { - if e == nil || !isCallLikeEdge(e.Kind) { + var reindexBatch []graph.EdgeReindex + // First sweep: collect every candidate edge and the From IDs we'll + // need to read Language off. Narrow to the call-like edge kinds + // server-side via EdgesByKinds — AllEdges scanned the whole bucket + // just to filter Kind Go-side. + type candidate struct { + edge *graph.Edge + ecosystem, importPath string + } + var candidates []candidate + fromIDSet := map[string]struct{}{} + callKinds := []graph.EdgeKind{graph.EdgeCalls, graph.EdgeReferences} + for e := range edgesByKinds(g, callKinds) { + if e == nil { continue } // Already pointing at a synthetic node — a prior run of this @@ -97,17 +109,35 @@ func SynthesizeExternalCalls(g *graph.Graph, enabled bool) int { if !ok { continue } - callerLang := edgeCallerLanguage(g, e) - if isLanguageStdlib(callerLang, importPath) { + candidates = append(candidates, candidate{edge: e, ecosystem: ecosystem, importPath: importPath}) + if e.From != "" { + fromIDSet[e.From] = struct{}{} + } + } + fromList := make([]string, 0, len(fromIDSet)) + for id := range fromIDSet { + fromList = append(fromList, id) + } + callerNodes := g.GetNodesByIDs(fromList) + + for _, c := range candidates { + e := c.edge + callerLang := "" + if from := callerNodes[e.From]; from != nil && from.Language != "" { + callerLang = from.Language + } else { + callerLang = langFamilyFromExt(e.FilePath) + } + if isLanguageStdlib(callerLang, c.importPath) { // Language built-in / standard library — noise. Leave the // edge on its bookkeeping-string terminal; a stdlib hop is // not a cross-system call worth a call-chain node. continue } - nodeID := externalCallNodeID(ecosystem, importPath) + nodeID := externalCallNodeID(c.ecosystem, c.importPath) if g.GetNode(nodeID) == nil { - g.AddNode(newExternalCallNode(nodeID, ecosystem, importPath, callerLang)) + g.AddNode(newExternalCallNode(nodeID, c.ecosystem, c.importPath, callerLang)) } oldTo := e.To @@ -124,9 +154,12 @@ func SynthesizeExternalCalls(g *graph.Graph, enabled bool) int { e.Meta = map[string]any{} } e.Meta["external_call"] = true - g.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) synthesized++ } + if len(reindexBatch) > 0 { + g.ReindexEdges(reindexBatch) + } return synthesized } @@ -155,8 +188,11 @@ func parseExternalCallTarget(target string) (ecosystem, importPath string, ok bo return "", "", false } return "dep", path, true - case strings.HasPrefix(target, "stdlib::"): - path := importPathOfExtern(strings.TrimPrefix(target, "stdlib::")) + case graph.IsStdlibStub(target): + // Handles both legacy `stdlib::::` and the + // per-repo-prefixed `::stdlib::::` shape + // (see internal/graph/stub.go). + path := importPathOfExtern(graph.StubRest(target)) if path == "" { return "", "", false } @@ -218,16 +254,6 @@ func newExternalCallNode(nodeID, ecosystem, importPath, callerLang string) *grap } } -// edgeCallerLanguage returns the source language of the node that owns -// the call edge's From end, falling back to the file extension of the -// edge's own FilePath when the caller node carries no Language. -func edgeCallerLanguage(g *graph.Graph, e *graph.Edge) string { - if from := g.GetNode(e.From); from != nil && from.Language != "" { - return from.Language - } - return langFamilyFromExt(e.FilePath) -} - // langFamilyFromExt maps a file extension to the coarse language label // stored on graph nodes. Distinct from builtins.go::langFromFilePath, // which collapses ts→ts/js→js for the built-in method tables; here we diff --git a/internal/resolver/external_calls_test.go b/internal/resolver/external_calls_test.go index f4afcd33..7af3d4d9 100644 --- a/internal/resolver/external_calls_test.go +++ b/internal/resolver/external_calls_test.go @@ -17,7 +17,7 @@ import ( // builder spans every ecosystem the external-call synthesis pass // classifies, so one table can exercise Go modules, pip packages, and // npm packages through the same real extract → resolve pipeline. -func buildMultiLangGraph(t *testing.T, files map[string]string) *graph.Graph { +func buildMultiLangGraph(t *testing.T, files map[string]string) graph.Store { t.Helper() g := graph.New() for path, src := range files { @@ -58,7 +58,7 @@ func buildMultiLangGraph(t *testing.T, files map[string]string) *graph.Graph { // with — and then the opt-in external-call synthesis pass. It mirrors // the indexer settle point: synthesis runs strictly after resolution + // guard, so the test exercises the same ordering the daemon uses. -func resolveAndSynthesize(g *graph.Graph, enabled bool) int { +func resolveAndSynthesize(g graph.Store, enabled bool) int { New(g).ResolveAll() return SynthesizeExternalCalls(g, enabled) } @@ -66,7 +66,7 @@ func resolveAndSynthesize(g *graph.Graph, enabled bool) int { // callTargetsFrom collects the To-end of every call/reference edge // leaving fromID, so a test can assert on the post-resolution shape of // a caller's outbound calls. -func callTargetsFrom(g *graph.Graph, fromID string) []string { +func callTargetsFrom(g graph.Store, fromID string) []string { var out []string for _, e := range g.GetOutEdges(fromID) { if e.Kind == graph.EdgeCalls || e.Kind == graph.EdgeReferences { diff --git a/internal/resolver/generic_param_bind.go b/internal/resolver/generic_param_bind.go new file mode 100644 index 00000000..18d8e0e9 --- /dev/null +++ b/internal/resolver/generic_param_bind.go @@ -0,0 +1,99 @@ +package resolver + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// bindGenericParamRefs rewrites `unresolved::` edges where the +// name is a generic type parameter declared by the source's +// enclosing function. The Go extractor already materialises +// KindGenericParam nodes with IDs `#tparam:` and an +// EdgeMemberOf back to the owner — the resolver just hasn't been +// consulting them when an in-body reference (`var x T`, return type +// `T`, etc.) lands as `unresolved::T`. +// +// Side benefit beyond stub reduction: `find_usages` on a generic +// type parameter starts working — *"where in this generic function +// is T used?"* — which is a real refactoring query. +// +// Scope is per-function: a function's tparams are visible only +// inside its body. The owner-keyed index built here lets each edge +// resolve in O(1) without re-walking the graph. +func (r *Resolver) bindGenericParamRefs() { + // owner-function ID → set of tparam-name → tparam-node-id. + owned := map[string]map[string]string{} + for n := range r.graph.NodesByKind(graph.KindGenericParam) { + if n.Language != "go" || n.Name == "" { + continue + } + owner := enclosingFunctionForBinding(n.ID) + if owner == "" || owner == n.ID { + continue + } + set, ok := owned[owner] + if !ok { + set = map[string]string{} + owned[owner] = set + } + // Don't overwrite — two tparams with the same name in the + // same function shouldn't happen in valid Go, but be defensive. + if _, dup := set[n.Name]; dup { + set[n.Name] = "" + continue + } + set[n.Name] = n.ID + } + if len(owned) == 0 { + return + } + + var batch []graph.EdgeReindex + // We don't know up front which edge kinds carry type-param refs: + // EdgeReferences for `var x T`, EdgeTypedAs for parameters typed + // as T, EdgeReturns for return signature, EdgeInstantiates for + // generic instantiation expressions. Walk the union. + for _, k := range []graph.EdgeKind{ + graph.EdgeReferences, + graph.EdgeTypedAs, + graph.EdgeReturns, + graph.EdgeInstantiates, + } { + for e := range r.graph.EdgesByKind(k) { + if old := r.tryBindGenericParam(e, owned); old != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: old}) + } + } + } + if len(batch) > 0 { + r.graph.ReindexEdges(batch) + } +} + +// tryBindGenericParam returns the old To value (for batched reindex) +// when the edge was rewritten, or "" when left alone. +func (r *Resolver) tryBindGenericParam(e *graph.Edge, owned map[string]map[string]string) string { + if e == nil || !strings.HasPrefix(e.To, "unresolved::") { + return "" + } + name := strings.TrimPrefix(e.To, "unresolved::") + if name == "" || strings.ContainsAny(name, ".*:#") { + return "" + } + ownerID := enclosingFunctionForBinding(e.From) + if ownerID == "" { + return "" + } + set := owned[ownerID] + if len(set) == 0 { + return "" + } + target, ok := set[name] + if !ok || target == "" || target == e.To { + return "" + } + oldTo := e.To + e.To = target + return oldTo +} diff --git a/internal/resolver/generic_param_bind_test.go b/internal/resolver/generic_param_bind_test.go new file mode 100644 index 00000000..2d41b6c6 --- /dev/null +++ b/internal/resolver/generic_param_bind_test.go @@ -0,0 +1,71 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/zzet/gortex/internal/graph" +) + +func TestBindGenericParamRefs_RewritesTRefToTParam(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Map" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Map", FilePath: "pkg/foo.go", Language: "go"}) + + tparamID := owner + "#tparam:T" + g.AddNode(&graph.Node{ID: tparamID, Kind: graph.KindGenericParam, Name: "T", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: tparamID, To: owner, Kind: graph.EdgeMemberOf}) + + // `var x T` inside Map's body — EdgeTypedAs from a local-ish + // source to the unresolved-T target. + from := owner + "#local:x@+3" + g.AddNode(&graph.Node{ID: from, Kind: graph.KindLocal, Name: "x", FilePath: "pkg/foo.go", StartLine: 3, Language: "go"}) + edge := &graph.Edge{From: from, To: "unresolved::T", Kind: graph.EdgeTypedAs, Line: 3} + g.AddEdge(edge) + + New(g).bindGenericParamRefs() + assert.Equal(t, tparamID, edge.To, "var x T must bind to the function's KindGenericParam T") +} + +func TestBindGenericParamRefs_OtherFunctionsLeftAlone(t *testing.T) { + g := graph.New() + // Function A declares tparam T. + a := "pkg/a.go::A" + g.AddNode(&graph.Node{ID: a, Kind: graph.KindFunction, Name: "A", FilePath: "pkg/a.go", Language: "go"}) + g.AddNode(&graph.Node{ID: a + "#tparam:T", Kind: graph.KindGenericParam, Name: "T", FilePath: "pkg/a.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: a + "#tparam:T", To: a, Kind: graph.EdgeMemberOf}) + + // Function B has its OWN body and references `T`, but doesn't + // declare it. Pass must NOT bind to A's tparam. + b := "pkg/b.go::B" + g.AddNode(&graph.Node{ID: b, Kind: graph.KindFunction, Name: "B", FilePath: "pkg/b.go", Language: "go"}) + edge := &graph.Edge{From: b, To: "unresolved::T", Kind: graph.EdgeReferences, Line: 1} + g.AddEdge(edge) + + New(g).bindGenericParamRefs() + assert.Equal(t, "unresolved::T", edge.To, "must not cross-bind to another function's tparam") +} + +func TestBindGenericParamRefs_QualifiedShapesIgnored(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + g.AddNode(&graph.Node{ID: owner + "#tparam:T", Kind: graph.KindGenericParam, Name: "T", FilePath: "pkg/foo.go", Language: "go"}) + g.AddEdge(&graph.Edge{From: owner + "#tparam:T", To: owner, Kind: graph.EdgeMemberOf}) + + keep := []*graph.Edge{ + {From: owner, To: "unresolved::*.T", Kind: graph.EdgeReferences, Line: 1}, + {From: owner, To: "unresolved::pkg.T", Kind: graph.EdgeReferences, Line: 2}, + } + for _, e := range keep { + g.AddEdge(e) + } + New(g).bindGenericParamRefs() + for _, e := range keep { + assert.True(t, + e.To == "unresolved::*.T" || e.To == "unresolved::pkg.T", + "qualified shape %q must be left alone", e.To, + ) + } +} diff --git a/internal/resolver/go_builtins_attribution.go b/internal/resolver/go_builtins_attribution.go new file mode 100644 index 00000000..1e58468a --- /dev/null +++ b/internal/resolver/go_builtins_attribution.go @@ -0,0 +1,181 @@ +package resolver + +import ( + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// goBuiltinFuncs is the complete set of pre-declared Go built-in +// functions. Source: https://pkg.go.dev/builtin (functions section). +// Kept in sync with the language spec — when a new builtin lands +// (e.g. clear / min / max in Go 1.21) add it here. +var goBuiltinFuncs = map[string]struct{}{ + "append": {}, "cap": {}, "clear": {}, "close": {}, "complex": {}, + "copy": {}, "delete": {}, "imag": {}, "len": {}, "make": {}, + "max": {}, "min": {}, "new": {}, "panic": {}, "print": {}, + "println": {}, "real": {}, "recover": {}, +} + +// goBuiltinTypes is the complete set of pre-declared Go built-in +// types. Source: https://pkg.go.dev/builtin (types section). +var goBuiltinTypes = map[string]struct{}{ + "any": {}, "bool": {}, "byte": {}, "comparable": {}, + "complex64": {}, "complex128": {}, "error": {}, + "float32": {}, "float64": {}, + "int": {}, "int8": {}, "int16": {}, "int32": {}, "int64": {}, + "rune": {}, "string": {}, + "uint": {}, "uint8": {}, "uint16": {}, "uint32": {}, "uint64": {}, + "uintptr": {}, +} + +// goBuiltinConsts is the set of pre-declared Go constants (true, +// false, iota, nil). Mostly emitted for completeness — `true` / +// `false` rarely show up as unresolved edge targets in practice +// because the parser handles them inline. +var goBuiltinConsts = map[string]struct{}{ + "true": {}, "false": {}, "iota": {}, "nil": {}, +} + +// attributeGoBuiltins rewrites `unresolved::` edges whose name +// is a Go language intrinsic onto the canonical `builtin::go::*` ID, +// and materialises a single KindBuiltin node per unique builtin so +// the rewritten edges land at a real graph node instead of a +// rel-table FK stub. Mirrors the existing builtin::py / builtin::ts +// classifier in internal/resolver/builtins.go but completes the +// pattern by also creating nodes for the targets — so +// `find_usages(builtin::go::type::float64)` answers "every variable +// typed as float64 in this codebase", and the Ladybug stub +// inflation drops by ~50k rows on a gortex-scale Go codebase. +// +// Three ID namespaces under `builtin::go::`: +// +// functions: builtin::go:: (append, len, make, ...) +// types: builtin::go::type:: (string, int, float64, ...) +// constants: builtin::go::const:: (true, false, iota, nil) +// +// Functions get the shortest namespace because their fan-in is the +// biggest and the shorter ID is what most downstream `find_usages` +// queries will type. +func (r *Resolver) attributeGoBuiltins() { + materialised := map[string]struct{}{} + var batch []graph.EdgeReindex + + // Every edge kind a builtin can be the target of. Type-system + // edges (typed_as / returns) carry type references; call / + // arg-of / value-flow carry function or const references. + for _, k := range []graph.EdgeKind{ + graph.EdgeCalls, + graph.EdgeReferences, + graph.EdgeReads, + graph.EdgeArgOf, + graph.EdgeValueFlow, + graph.EdgeReturnsTo, + graph.EdgeTypedAs, + graph.EdgeReturns, + graph.EdgeInstantiates, + graph.EdgeCaptures, + graph.EdgeThrows, + } { + for e := range r.graph.EdgesByKind(k) { + if old := r.tryAttributeGoBuiltin(e, materialised); old != "" { + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: old}) + } + } + } + if len(batch) > 0 { + r.graph.ReindexEdges(batch) + } +} + +// tryAttributeGoBuiltin checks if e.To is `unresolved::` +// where bareName is a Go builtin and the source language is Go (the +// source is inside a Go function / file). On a match it materialises +// the target node (once per unique ID), rewrites e.To, and returns +// the old To value for the batched reindex. Returns "" when the edge +// is left alone. +func (r *Resolver) tryAttributeGoBuiltin(e *graph.Edge, materialised map[string]struct{}) string { + if e == nil || !strings.HasPrefix(e.To, "unresolved::") { + return "" + } + name := strings.TrimPrefix(e.To, "unresolved::") + if name == "" || strings.ContainsAny(name, ".*:#") { + return "" + } + // Only attribute when the source is Go. Without this guard a + // Python reference to a local named `len` would get re-targeted + // at Go's builtin `len`, which would be obviously wrong. + if !r.fromIsGo(e.From) { + return "" + } + newID, kind, builtinKind := goBuiltinTarget(r.callerRepoPrefix(e), name) + if newID == "" { + return "" + } + if _, ok := materialised[newID]; !ok { + // AddNode is idempotent on ID, so even a second + // concurrent pass would not duplicate the row. + r.graph.AddNode(&graph.Node{ + ID: newID, + Kind: kind, + Name: name, + Language: "go", + Meta: map[string]any{ + "builtin": true, + "builtin_kind": builtinKind, + }, + }) + materialised[newID] = struct{}{} + } + oldTo := e.To + e.To = newID + return oldTo +} + +// goBuiltinTarget classifies a bare identifier as one of Go's +// intrinsics. Returns the canonical builtin::go:: ID (per-repo +// prefixed via graph.StubID — see internal/graph/stub.go for why +// two repos can disagree on what's a builtin), the NodeKind to +// materialise it under (always KindBuiltin), and a meta tag +// recording which subspace (func / type / const) it belongs to. +// Returns ("", "", "") when the name is not a Go builtin. +// repoPrefix is the owning repo's RepoPrefix (empty in +// single-repo / legacy callers). +func goBuiltinTarget(repoPrefix, name string) (id string, kind graph.NodeKind, builtinKind string) { + if _, ok := goBuiltinFuncs[name]; ok { + return graph.StubID(repoPrefix, graph.StubKindBuiltin, "go", name), graph.KindBuiltin, "func" + } + if _, ok := goBuiltinTypes[name]; ok { + return graph.StubID(repoPrefix, graph.StubKindBuiltin, "go", "type", name), graph.KindBuiltin, "type" + } + if _, ok := goBuiltinConsts[name]; ok { + return graph.StubID(repoPrefix, graph.StubKindBuiltin, "go", "const", name), graph.KindBuiltin, "const" + } + return "", "", "" +} + +// fromIsGo reports whether the source endpoint of an edge sits +// inside Go code. Uses the From's enclosing function (via the same +// suffix-stripping helper bare-name binding uses) — Go is the only +// language whose IDs follow the `file.go::Func` convention with a +// `.go` extension, so a path-based check is both cheap and reliable. +func (r *Resolver) fromIsGo(fromID string) bool { + owner := enclosingFunctionForBinding(fromID) + if owner == "" { + return false + } + if i := strings.Index(owner, "::"); i > 0 { + // `pkg/foo.go::Func` shape — peek at the file extension. + head := owner[:i] + if strings.HasSuffix(head, ".go") { + return true + } + } + // Fall back to looking up the owner node and checking its + // Language. More expensive but covers edge cases where the ID + // doesn't follow the `.go::Func` pattern. + if n := r.graph.GetNode(owner); n != nil && n.Language == "go" { + return true + } + return false +} diff --git a/internal/resolver/go_builtins_attribution_test.go b/internal/resolver/go_builtins_attribution_test.go new file mode 100644 index 00000000..48cc0f45 --- /dev/null +++ b/internal/resolver/go_builtins_attribution_test.go @@ -0,0 +1,115 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +func TestAttributeGoBuiltins_FunctionCall(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Run" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Run", FilePath: "pkg/foo.go", Language: "go"}) + edge := &graph.Edge{From: owner, To: "unresolved::append", Kind: graph.EdgeArgOf, FilePath: "pkg/foo.go", Line: 5} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "builtin::go::append", edge.To, + "call to `append` must retarget onto builtin::go::append") + n := g.GetNode("builtin::go::append") + require.NotNil(t, n, "KindBuiltin node must be materialised") + assert.Equal(t, graph.KindBuiltin, n.Kind) + assert.Equal(t, "append", n.Name) + assert.Equal(t, "go", n.Language) + assert.Equal(t, true, n.Meta["builtin"]) + assert.Equal(t, "func", n.Meta["builtin_kind"]) +} + +func TestAttributeGoBuiltins_Type(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::Handler" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "Handler", FilePath: "pkg/foo.go", Language: "go"}) + + paramID := owner + "#param:s" + g.AddNode(&graph.Node{ID: paramID, Kind: graph.KindParam, Name: "s", FilePath: "pkg/foo.go", Language: "go"}) + edge := &graph.Edge{From: paramID, To: "unresolved::string", Kind: graph.EdgeTypedAs, FilePath: "pkg/foo.go", Line: 1} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "builtin::go::type::string", edge.To, + "typed_as `string` must retarget onto builtin::go::type::string") + n := g.GetNode("builtin::go::type::string") + require.NotNil(t, n) + assert.Equal(t, graph.KindBuiltin, n.Kind) + assert.Equal(t, "type", n.Meta["builtin_kind"]) +} + +func TestAttributeGoBuiltins_DedupedAcrossManyEdges(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + + // Many calls to len from the same function. + for i := 1; i <= 5; i++ { + g.AddEdge(&graph.Edge{From: owner, To: "unresolved::len", Kind: graph.EdgeArgOf, FilePath: "pkg/foo.go", Line: i}) + } + + New(g).attributeGoBuiltins() + + // Exactly one KindBuiltin node should be created regardless of + // how many edges referenced it. + count := 0 + for n := range g.NodesByKind(graph.KindBuiltin) { + if n.ID == "builtin::go::len" { + count++ + } + } + assert.Equal(t, 1, count, "exactly one KindBuiltin per unique builtin") +} + +func TestAttributeGoBuiltins_NonGoLeftAlone(t *testing.T) { + g := graph.New() + // A Python source emitting a reference to `len` (Python builtin) + // — must NOT get attributed to Go's `builtin::go::len`. + owner := "pkg/app.py::process" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "process", FilePath: "pkg/app.py", Language: "python"}) + edge := &graph.Edge{From: owner, To: "unresolved::len", Kind: graph.EdgeArgOf, FilePath: "pkg/app.py", Line: 1} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "unresolved::len", edge.To, + "Python source must NOT cross-bind to Go's len builtin") +} + +func TestAttributeGoBuiltins_UnknownNameLeftAlone(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + edge := &graph.Edge{From: owner, To: "unresolved::myCustomFunc", Kind: graph.EdgeArgOf, FilePath: "pkg/foo.go", Line: 1} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "unresolved::myCustomFunc", edge.To, + "non-builtin names must stay unresolved") +} + +func TestAttributeGoBuiltins_QualifiedShapeLeftAlone(t *testing.T) { + g := graph.New() + owner := "pkg/foo.go::F" + g.AddNode(&graph.Node{ID: owner, Kind: graph.KindFunction, Name: "F", FilePath: "pkg/foo.go", Language: "go"}) + + // `*.len` is qualified — leave to other passes. + edge := &graph.Edge{From: owner, To: "unresolved::*.len", Kind: graph.EdgeArgOf, FilePath: "pkg/foo.go", Line: 1} + g.AddEdge(edge) + + New(g).attributeGoBuiltins() + + assert.Equal(t, "unresolved::*.len", edge.To, "qualified `*.len` shape must be left alone") +} diff --git a/internal/resolver/grpc_stub_calls.go b/internal/resolver/grpc_stub_calls.go index cc4f2b2a..0b94a3b1 100644 --- a/internal/resolver/grpc_stub_calls.go +++ b/internal/resolver/grpc_stub_calls.go @@ -50,15 +50,25 @@ const grpcStubPrefix = unresolvedPrefix + "grpc::" // // Returns the number of grpc.stub edges pointing at a resolved handler // after the pass. -func ResolveGRPCStubCalls(g *graph.Graph) int { +func ResolveGRPCStubCalls(g graph.Store) int { if g == nil { return 0 } idx := buildGRPCHandlerIndex(g) resolved := 0 - for _, e := range g.AllEdges() { - if e == nil || e.Kind != graph.EdgeCalls || e.Meta == nil { + var reindexBatch []graph.EdgeReindex + // First pass: collect every grpc.stub edge plus the From IDs we'll + // need to read RepoPrefix off, so the per-edge GetNode below + // collapses to a single GetNodesByIDs batch on disk backends. + type stubEdge struct { + edge *graph.Edge + service, method string + } + var stubs []stubEdge + fromIDs := make(map[string]struct{}) + for e := range g.EdgesByKind(graph.EdgeCalls) { + if e == nil || e.Meta == nil { continue } if v, _ := e.Meta["via"].(string); v != "grpc.stub" { @@ -69,16 +79,28 @@ func ResolveGRPCStubCalls(g *graph.Graph) int { if service == "" || method == "" { continue } + stubs = append(stubs, stubEdge{edge: e, service: service, method: method}) + if e.From != "" { + fromIDs[e.From] = struct{}{} + } + } + fromList := make([]string, 0, len(fromIDs)) + for id := range fromIDs { + fromList = append(fromList, id) + } + callerNodes := g.GetNodesByIDs(fromList) + for _, s := range stubs { + e := s.edge callerRepo := "" - if from := g.GetNode(e.From); from != nil { + if from := callerNodes[e.From]; from != nil { callerRepo = from.RepoPrefix } - handlerID, origin, conf := idx.lookup(service, method, callerRepo) + handlerID, origin, conf := idx.lookup(s.service, s.method, callerRepo) want := handlerID if want == "" { - want = grpcStubPlaceholder(service, method) + want = grpcStubPlaceholder(s.service, s.method) } if e.To == want { if handlerID != "" { @@ -104,7 +126,10 @@ func ResolveGRPCStubCalls(g *graph.Graph) int { e.ConfidenceLabel = "" delete(e.Meta, "grpc_resolution") } - g.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) + } + if len(reindexBatch) > 0 { + g.ReindexEdges(reindexBatch) } return resolved } @@ -138,10 +163,11 @@ func (idx *grpcHandlerIndex) lookup(service, method, callerRepo string) (id, ori // buildGRPCHandlerIndex walks the graph once and indexes server-side // gRPC handler methods by service, via both discovery signals. -func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { +func buildGRPCHandlerIndex(g graph.Store) *grpcHandlerIndex { typesByName := map[string][]*graph.Node{} ifacesByName := map[string][]*graph.Node{} - for _, n := range g.AllNodes() { + typeAndIfaceNodes := nodesByKindsOrAll(g, graph.KindType, graph.KindInterface) + for _, n := range typeAndIfaceNodes { switch n.Kind { case graph.KindType: typesByName[n.Name] = append(typesByName[n.Name], n) @@ -151,28 +177,42 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { } // methodsByType: type node ID → its method nodes (via EdgeMemberOf). - // implementorsByIface: interface node ID → implementing type node IDs. + // Use the MemberMethodsByType capability — projects only the four + // columns we read (id/name/file/line) per row, no per-edge GetNode. + rawMembers := memberMethodInfosByType(g) methodsByType := map[string][]*graph.Node{} + for typeID, infos := range rawMembers { + nodes := make([]*graph.Node, 0, len(infos)) + for _, m := range infos { + nodes = append(nodes, &graph.Node{ + ID: m.MethodID, + Kind: graph.KindMethod, + Name: m.Name, + FilePath: m.FilePath, + StartLine: m.StartLine, + RepoPrefix: m.RepoPrefix, + }) + } + methodsByType[typeID] = nodes + } + + // implementorsByIface: interface node ID → implementing type node + // IDs. Pull only EdgeImplements; the From IDs are kept as-is for the + // later impl filter (Unimplemented*). implementorsByIface := map[string][]string{} var registrations []*graph.Edge - for _, e := range g.AllEdges() { + for e := range g.EdgesByKind(graph.EdgeImplements) { if e == nil { continue } - switch e.Kind { - case graph.EdgeMemberOf: - mn := g.GetNode(e.From) - if mn != nil && mn.Kind == graph.KindMethod { - methodsByType[e.To] = append(methodsByType[e.To], mn) - } - case graph.EdgeImplements: - implementorsByIface[e.To] = append(implementorsByIface[e.To], e.From) - case graph.EdgeCalls: - if e.Meta != nil { - if svc, _ := e.Meta["grpc_register_service"].(string); svc != "" { - registrations = append(registrations, e) - } - } + implementorsByIface[e.To] = append(implementorsByIface[e.To], e.From) + } + for e := range g.EdgesByKind(graph.EdgeCalls) { + if e == nil || e.Meta == nil { + continue + } + if svc, _ := e.Meta["grpc_register_service"].(string); svc != "" { + registrations = append(registrations, e) } } @@ -181,6 +221,17 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { iface: map[string][]*graph.Node{}, } + // Prefetch the From nodes for every registration call so the + // per-registration repo / dir lookup collapses to a single batch + // GetNodesByIDs on disk backends. + regFromIDs := make([]string, 0, len(registrations)) + for _, e := range registrations { + if e.From != "" { + regFromIDs = append(regFromIDs, e.From) + } + } + regFromNodes := g.GetNodesByIDs(regFromIDs) + // Signal 1: registration calls. Resolve the impl type named by the // registration's second argument, then index its methods. for _, e := range registrations { @@ -190,7 +241,7 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { continue } regRepo, regDir := "", "" - if from := g.GetNode(e.From); from != nil { + if from := regFromNodes[e.From]; from != nil { regRepo = from.RepoPrefix regDir = grpcParentDir(from.FilePath) } @@ -201,6 +252,29 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { idx.registration[service] = append(idx.registration[service], methodsByType[typeNode.ID]...) } + // Prefetch every implementor type referenced by a `Server` + // interface so the per-implementor GetNode in Signal 2 collapses to + // a batch. + implTypeIDs := make(map[string]struct{}) + for name, ifaceNodes := range ifacesByName { + const sfx = "Server" + if len(name) <= len(sfx) || !strings.HasSuffix(name, sfx) { + continue + } + for _, ifn := range ifaceNodes { + for _, typeID := range implementorsByIface[ifn.ID] { + if typeID != "" { + implTypeIDs[typeID] = struct{}{} + } + } + } + } + implTypeList := make([]string, 0, len(implTypeIDs)) + for id := range implTypeIDs { + implTypeList = append(implTypeList, id) + } + implTypeNodes := g.GetNodesByIDs(implTypeList) + // Signal 2: the `Server` interface and the concrete types // that implement it. The generated `UnimplementedServer` // stub also implements the interface — skip it so the fallback @@ -213,7 +287,7 @@ func buildGRPCHandlerIndex(g *graph.Graph) *grpcHandlerIndex { service := name[:len(name)-len(sfx)] for _, ifn := range ifaceNodes { for _, typeID := range implementorsByIface[ifn.ID] { - tn := g.GetNode(typeID) + tn := implTypeNodes[typeID] if tn == nil || strings.HasPrefix(tn.Name, "Unimplemented") { continue } diff --git a/internal/resolver/grpc_stub_calls_test.go b/internal/resolver/grpc_stub_calls_test.go index 76cbcbf1..6bbb314e 100644 --- a/internal/resolver/grpc_stub_calls_test.go +++ b/internal/resolver/grpc_stub_calls_test.go @@ -14,7 +14,7 @@ import ( // grpc.stub call edge, and a server-side handler discoverable via // registration and/or interface satisfaction. type grpcTestGraph struct { - g *graph.Graph + g graph.Store } func newGRPCTestGraph() *grpcTestGraph { return &grpcTestGraph{g: graph.New()} } diff --git a/internal/resolver/method_receiver_rebind.go b/internal/resolver/method_receiver_rebind.go new file mode 100644 index 00000000..524510d2 --- /dev/null +++ b/internal/resolver/method_receiver_rebind.go @@ -0,0 +1,95 @@ +package resolver + +import ( + "path/filepath" + "strings" + + "github.com/zzet/gortex/internal/graph" +) + +// rebindGoMethodReceivers fixes Go EdgeMemberOf edges whose target is +// a phantom `::TypeName` ID — the artefact of the Go +// extractor building the receiver-type endpoint from the method's own +// file rather than the file the type is actually declared in. Methods +// spread across multiple files in the same package each emit a +// different `::Type` target even though they all logically +// belong to the single type node defined elsewhere. +// +// Without this pass: +// - ladybug materialises phantom Node rows to satisfy the +// rel-table FK on every cross-file method-receiver edge; +// - InferImplements builds a typeID → method-set map keyed on the +// phantom IDs, so a type whose methods span N files appears as N +// partial types each with a fraction of the real method set, and +// interface satisfaction is under-detected; +// - find_implementations / get_class_hierarchy / get_callers over +// interface methods all return partial results for cross-file- +// method types (which is most of any non-trivial Go codebase). +// +// Algorithm: index every Go KindType / KindInterface node by +// (filepath.Dir(file), name); walk EdgeMemberOf; for each Go method +// whose To doesn't resolve, look up (its file's dir, type name); if +// exactly one match, rewrite edge.To to the canonical type ID via +// ReindexEdges (one batched commit instead of per-edge round-trips). +// +// Scope: Go only — other languages (Java / TS / Python) group methods +// inside the class body in the same file, so the cross-file pattern +// doesn't arise. The method node's Language gates the rebind. +func (r *Resolver) rebindGoMethodReceivers() { + type pkgKey struct{ pkg, name string } + typesIdx := make(map[pkgKey]string) + for _, kind := range []graph.NodeKind{graph.KindType, graph.KindInterface} { + for n := range r.graph.NodesByKind(kind) { + if n.Language != "go" || n.Name == "" || n.FilePath == "" { + continue + } + k := pkgKey{filepath.Dir(n.FilePath), n.Name} + if existing, ok := typesIdx[k]; ok && existing != n.ID { + // Two distinct type nodes with the same name in the + // same package directory shouldn't happen in valid Go, + // but guard against it — leave the edge alone rather + // than pick an arbitrary winner. + typesIdx[k] = "" + continue + } + typesIdx[k] = n.ID + } + } + if len(typesIdx) == 0 { + return + } + var batch []graph.EdgeReindex + for e := range r.graph.EdgesByKind(graph.EdgeMemberOf) { + method := r.graph.GetNode(e.From) + if method == nil || method.Language != "go" || method.Kind != graph.KindMethod { + continue + } + // Already resolves to a real type node — same-file methods + // land here. Nothing to do. + if n := r.graph.GetNode(e.To); n != nil && (n.Kind == graph.KindType || n.Kind == graph.KindInterface) { + continue + } + // Parse `::`. The split is on the LAST + // `::` so paths embedded in the ID (none in Go, but stay + // defensive) can't trip us up. + i := strings.LastIndex(e.To, "::") + if i <= 0 { + continue + } + file := e.To[:i] + typeName := e.To[i+2:] + if file == "" || typeName == "" { + continue + } + canonicalID, ok := typesIdx[pkgKey{filepath.Dir(file), typeName}] + if !ok || canonicalID == "" || canonicalID == e.To { + continue + } + oldTo := e.To + e.To = canonicalID + batch = append(batch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) + } + if len(batch) > 0 { + r.graph.ReindexEdges(batch) + } +} diff --git a/internal/resolver/method_receiver_rebind_test.go b/internal/resolver/method_receiver_rebind_test.go new file mode 100644 index 00000000..9222bf5b --- /dev/null +++ b/internal/resolver/method_receiver_rebind_test.go @@ -0,0 +1,135 @@ +package resolver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/zzet/gortex/internal/graph" +) + +// TestRebindGoMethodReceivers_CollapsesCrossFileMethods is the +// regression for the Go extractor emitting EdgeMemberOf targets as +// ::TypeName. When methods on the same type live in +// different files of the same package, the parser produces a phantom +// type ID per method-file; the rebind pass must collapse them onto +// the canonical ::TypeName node so InferImplements and the +// downstream MCP tools (find_implementations, class_hierarchy) see +// the consolidated method set. +func TestRebindGoMethodReceivers_CollapsesCrossFileMethods(t *testing.T) { + g := graph.New() + + // Type defined in indexer.go. + typeID := "internal/indexer/indexer.go::Indexer" + g.AddNode(&graph.Node{ + ID: typeID, Kind: graph.KindType, Name: "Indexer", + FilePath: "internal/indexer/indexer.go", Language: "go", + }) + + // Method declared in a *different* file in the same package — the + // parser emits a phantom receiver target. + methodID := "internal/indexer/crash_isolation.go::Indexer.crashIsolationEnabled" + g.AddNode(&graph.Node{ + ID: methodID, Kind: graph.KindMethod, Name: "crashIsolationEnabled", + FilePath: "internal/indexer/crash_isolation.go", Language: "go", + }) + phantomTarget := "internal/indexer/crash_isolation.go::Indexer" + memberEdge := &graph.Edge{ + From: methodID, To: phantomTarget, Kind: graph.EdgeMemberOf, + FilePath: "internal/indexer/crash_isolation.go", Line: 23, + } + g.AddEdge(memberEdge) + + // Sanity: pre-pass the phantom target has no real node. + require.Nil(t, g.GetNode(phantomTarget), "phantom target must not exist as a real node") + + r := New(g) + r.rebindGoMethodReceivers() + + // Post-pass: the edge points at the canonical type node. + assert.Equal(t, typeID, memberEdge.To, + "EdgeMemberOf must be rewritten from ::Type to canonical ::Type") + + // And the same-file method on the type works too — covered by not + // breaking a control case: + g2 := graph.New() + g2.AddNode(&graph.Node{ + ID: "pkg/foo.go::Foo", Kind: graph.KindType, Name: "Foo", + FilePath: "pkg/foo.go", Language: "go", + }) + g2.AddNode(&graph.Node{ + ID: "pkg/foo.go::Foo.Bar", Kind: graph.KindMethod, Name: "Bar", + FilePath: "pkg/foo.go", Language: "go", + }) + sameFileEdge := &graph.Edge{ + From: "pkg/foo.go::Foo.Bar", To: "pkg/foo.go::Foo", + Kind: graph.EdgeMemberOf, FilePath: "pkg/foo.go", Line: 5, + } + g2.AddEdge(sameFileEdge) + + New(g2).rebindGoMethodReceivers() + assert.Equal(t, "pkg/foo.go::Foo", sameFileEdge.To, + "same-file method edge must be left unchanged") +} + +// TestRebindGoMethodReceivers_LanguageGated guards against the pass +// rewriting non-Go EdgeMemberOf edges. Java/TS/Python group methods +// in the class body so their EdgeMemberOf targets are already +// in-file; we don't want the pass touching them. +func TestRebindGoMethodReceivers_LanguageGated(t *testing.T) { + g := graph.New() + + // A type and a method in the same Go package — would normally be + // a rebind candidate. + g.AddNode(&graph.Node{ + ID: "pkg/types.go::Server", Kind: graph.KindType, Name: "Server", + FilePath: "pkg/types.go", Language: "go", + }) + // But the METHOD is declared as TypeScript (e.g. a TS extractor + // that emits the same EdgeMemberOf shape for some bridging + // reason). Pass must leave it alone. + tsMethod := &graph.Node{ + ID: "pkg/handler.ts::Server.serve", Kind: graph.KindMethod, Name: "serve", + FilePath: "pkg/handler.ts", Language: "typescript", + } + g.AddNode(tsMethod) + edge := &graph.Edge{ + From: tsMethod.ID, To: "pkg/handler.ts::Server", + Kind: graph.EdgeMemberOf, FilePath: "pkg/handler.ts", Line: 1, + } + g.AddEdge(edge) + + New(g).rebindGoMethodReceivers() + assert.Equal(t, "pkg/handler.ts::Server", edge.To, + "non-Go method edge must NOT be rewritten by the Go-only rebind pass") +} + +// TestRebindGoMethodReceivers_AmbiguousNameSkipped guards against the +// pass picking an arbitrary winner when two distinct types share the +// same name in the same package (shouldn't happen in valid Go, but +// the pass should leave the phantom alone rather than mis-bind). +func TestRebindGoMethodReceivers_AmbiguousNameSkipped(t *testing.T) { + g := graph.New() + g.AddNode(&graph.Node{ + ID: "pkg/a.go::Dup", Kind: graph.KindType, Name: "Dup", + FilePath: "pkg/a.go", Language: "go", + }) + g.AddNode(&graph.Node{ + ID: "pkg/b.go::Dup", Kind: graph.KindType, Name: "Dup", + FilePath: "pkg/b.go", Language: "go", + }) + g.AddNode(&graph.Node{ + ID: "pkg/c.go::Dup.M", Kind: graph.KindMethod, Name: "M", + FilePath: "pkg/c.go", Language: "go", + }) + edge := &graph.Edge{ + From: "pkg/c.go::Dup.M", To: "pkg/c.go::Dup", + Kind: graph.EdgeMemberOf, FilePath: "pkg/c.go", Line: 1, + } + g.AddEdge(edge) + + New(g).rebindGoMethodReceivers() + assert.Equal(t, "pkg/c.go::Dup", edge.To, + "ambiguous type name in same package must leave the edge phantom rather than guess") +} diff --git a/internal/resolver/module_attribution.go b/internal/resolver/module_attribution.go index 1b16f795..9a425b5e 100644 --- a/internal/resolver/module_attribution.go +++ b/internal/resolver/module_attribution.go @@ -39,10 +39,7 @@ func (r *Resolver) attributeNonGoModuleImports() { moduleSeeds := map[string]moduleSeed{} dependsSeen := map[string]map[string]struct{}{} // fileID → set of moduleIDs - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } + for e := range r.graph.EdgesByKind(graph.EdgeImports) { if !strings.HasPrefix(e.To, "external::") { continue } @@ -72,21 +69,46 @@ func (r *Resolver) attributeNonGoModuleImports() { } // Materialise module nodes first; later loops assume the - // node exists when we add EdgeDependsOnModule. + // node exists when we add EdgeDependsOnModule. Batch the + // presence check via GetNodesByIDs so disk backends do one + // indexed SELECT IN (...) instead of one per-seed GetNode. + seedIDs := make([]string, 0, len(moduleSeeds)) + for id := range moduleSeeds { + seedIDs = append(seedIDs, id) + } + existing := r.graph.GetNodesByIDs(seedIDs) for _, seed := range moduleSeeds { - if r.graph.GetNode(seed.id) != nil { + if _, ok := existing[seed.id]; ok { continue } r.graph.AddNode(buildNonGoModuleNode(seed)) } - // Rewrite each EdgeImports target and re-bucket via - // ReindexEdge so find_usages on the new module sees the - // caller file. + // Pre-build a set of every (fileID, moduleID) pair the graph + // already has an EdgeDependsOnModule edge for. The old code + // called hasDependsOnModule per rewrite, which on a disk backend + // fans out to N per-file GetOutEdges queries (50k+ on a + // gortex-scale pass). One EdgesByKind scan is an indexed range + // read on every backend, plus a Go-side map build that turns + // the per-rewrite check into a constant-time lookup. + existingDepends := make(map[string]map[string]struct{}) + for e := range r.graph.EdgesByKind(graph.EdgeDependsOnModule) { + set := existingDepends[e.From] + if set == nil { + set = make(map[string]struct{}) + existingDepends[e.From] = set + } + set[e.To] = struct{}{} + } + + // Rewrite each EdgeImports target and collect the re-bucket + // jobs into one batch so disk backends commit in chunks rather + // than once per import rewrite. + reindexBatch := make([]graph.EdgeReindex, 0, len(rewrites)) for _, p := range rewrites { p.edge.To = p.moduleID p.edge.Origin = graph.OriginASTResolved - r.graph.ReindexEdge(p.edge, p.oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: p.edge, OldTo: p.oldTo}) set, ok := dependsSeen[p.edge.From] if !ok { @@ -99,9 +121,12 @@ func (r *Resolver) attributeNonGoModuleImports() { set[p.moduleID] = struct{}{} // Avoid emitting a duplicate EdgeDependsOnModule when an // earlier pass already wired one (e.g. cold + warm - // indexing of the same file). - if r.hasDependsOnModule(p.edge.From, p.moduleID) { - continue + // indexing of the same file). Constant-time map lookup + // against the pre-built existingDepends index. + if existing, ok := existingDepends[p.edge.From]; ok { + if _, dup := existing[p.moduleID]; dup { + continue + } } r.graph.AddEdge(&graph.Edge{ From: p.edge.From, @@ -114,31 +139,21 @@ func (r *Resolver) attributeNonGoModuleImports() { Origin: graph.OriginASTResolved, }) } + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) + } } // collectFileLanguages walks KindFile nodes once and returns // (file ID → language) for the per-edge dispatch above. func (r *Resolver) collectFileLanguages() map[string]string { out := map[string]string{} - for _, n := range r.graph.AllNodes() { - if n.Kind == graph.KindFile { - out[n.ID] = n.Language - } + for n := range r.graph.NodesByKind(graph.KindFile) { + out[n.ID] = n.Language } return out } -// hasDependsOnModule reports whether the file already has an -// outgoing EdgeDependsOnModule pointing at moduleID. -func (r *Resolver) hasDependsOnModule(fileID, moduleID string) bool { - for _, e := range r.graph.GetOutEdges(fileID) { - if e.Kind == graph.EdgeDependsOnModule && e.To == moduleID { - return true - } - } - return false -} - // nonGoImportToModuleID maps a (language, importPath) pair to its // canonical KindModule ID. The second return value is the module's // own language tag (used at materialisation time so a stdlib module diff --git a/internal/resolver/module_attribution_test.go b/internal/resolver/module_attribution_test.go index f6b72d66..1a8f139d 100644 --- a/internal/resolver/module_attribution_test.go +++ b/internal/resolver/module_attribution_test.go @@ -11,7 +11,7 @@ import ( // seedFile adds a KindFile node with the given language to the // graph; tests use it to drive the language-aware attribution pass. -func seedFile(g *graph.Graph, fileID, language string) { +func seedFile(g graph.Store, fileID, language string) { g.AddNode(&graph.Node{ ID: fileID, Kind: graph.KindFile, Name: fileID, FilePath: fileID, Language: language, @@ -21,7 +21,7 @@ func seedFile(g *graph.Graph, fileID, language string) { // seedExternalImport drops in an EdgeImports edge that's already // landed at an `external::*` target — the post-pass inputs we want // to exercise. -func seedExternalImport(g *graph.Graph, fileID, importPath string) *graph.Edge { +func seedExternalImport(g graph.Store, fileID, importPath string) *graph.Edge { e := &graph.Edge{ From: fileID, To: "external::" + importPath, @@ -179,7 +179,7 @@ func TestAttributeNonGo_IdempotentOnSecondPass(t *testing.T) { // outEdgesOfKind is a small filter over Graph.GetOutEdges for the // assertions above; declared here to keep the test file self- // contained. -func outEdgesOfKind(g *graph.Graph, fileID string, kind graph.EdgeKind) []*graph.Edge { +func outEdgesOfKind(g graph.Store, fileID string, kind graph.EdgeKind) []*graph.Edge { var out []*graph.Edge for _, e := range g.GetOutEdges(fileID) { if e.Kind == kind { diff --git a/internal/resolver/relative_imports.go b/internal/resolver/relative_imports.go index b87b8419..6ad0f936 100644 --- a/internal/resolver/relative_imports.go +++ b/internal/resolver/relative_imports.go @@ -22,10 +22,54 @@ import ( // module-attribution pass can decide what to do with them. func (r *Resolver) resolveRelativeImports() { fileLang := r.collectFileLanguages() - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue + var reindexBatch []graph.EdgeReindex + + // Pre-build a map of every KindFile node's ID. The relative- + // import resolvers below check 1-2 candidate IDs per edge to + // decide whether a target file exists; doing that as a per-edge + // GetNode (a per-edge round-trip on a disk backend) is what made + // this pass dominate disk-backed resolve time. One NodesByKind scan + // materialises the set once at indexed cost; lookups become + // O(1) map hits. + fileIDs := make(map[string]struct{}, 1024) + for n := range r.graph.NodesByKind(graph.KindFile) { + if n != nil && n.ID != "" { + fileIDs[n.ID] = struct{}{} + } + } + resolvePython := func(stem string) string { + if !strings.Contains(stem, "/") { + return "" + } + for _, cand := range []string{stem + ".py", stem + "/__init__.py"} { + if _, ok := fileIDs[cand]; ok { + return cand + } + } + return "" + } + resolveDart := func(importingFile, uri string) string { + if uri == "" || strings.HasPrefix(uri, "dart:") || strings.HasPrefix(uri, "package:") { + return "" + } + dir := "" + if i := strings.LastIndex(importingFile, "/"); i >= 0 { + dir = importingFile[:i] + } + target := joinRelativePath(dir, uri) + if target == "" { + return "" } + if _, ok := fileIDs[target]; ok { + return target + } + return "" + } + + // EdgesByKind pushes the "kind = imports" filter into the store; + // disk backends only enumerate import edges instead of every + // edge in the graph. + for e := range r.graph.EdgesByKind(graph.EdgeImports) { lang, ok := fileLang[e.From] if !ok { continue @@ -38,7 +82,7 @@ func (r *Resolver) resolveRelativeImports() { // Always resolvable via internal-file lookup. path = strings.TrimPrefix(e.To, "unresolved::pyrel::") if lang == "python" { - resolved = resolvePythonRelativeImport(r.graph, path) + resolved = resolvePython(path) } case strings.HasPrefix(e.To, "external::"): // Fallthrough path for Dart relative URIs the main @@ -48,9 +92,9 @@ func (r *Resolver) resolveRelativeImports() { path = strings.TrimPrefix(e.To, "external::") switch lang { case "python": - resolved = resolvePythonRelativeImport(r.graph, path) + resolved = resolvePython(path) case "dart": - resolved = resolveDartRelativeImport(r.graph, e.From, path) + resolved = resolveDart(e.From, path) } default: continue @@ -62,57 +106,18 @@ func (r *Resolver) resolveRelativeImports() { if strings.HasPrefix(e.To, "unresolved::pyrel::") { oldTo := e.To e.To = "external::" + path - r.graph.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) } continue } oldTo := e.To e.To = resolved e.Origin = graph.OriginASTResolved - r.graph.ReindexEdge(e, oldTo) - } -} - -// resolvePythonRelativeImport maps a project-rooted Python file-path -// stem ("app/util", "pkg/sub") to the matching `KindFile` node ID. -// Tries `.py` first, then `/__init__.py` (package). Returns -// "" if no candidate exists in the graph or if `stem` doesn't look like -// a relative-import stem (no slash separator — those are absolute -// module references handled by attributeNonGoModuleImports). -func resolvePythonRelativeImport(g *graph.Graph, stem string) string { - if !strings.Contains(stem, "/") { - return "" - } - for _, cand := range []string{stem + ".py", stem + "/__init__.py"} { - if n := g.GetNode(cand); n != nil && n.Kind == graph.KindFile { - return n.ID - } - } - return "" -} - -// resolveDartRelativeImport joins a relative Dart import URI against -// the importing file's directory and returns the matching `KindFile` -// node ID. Paths starting with `dart:` or `package:` are caller- -// validated to belong to the module-attribution pass and are skipped -// here. Returns "" when the resolved path escapes the repo root or -// when the target file is not in the graph. -func resolveDartRelativeImport(g *graph.Graph, importingFile, uri string) string { - if uri == "" || strings.HasPrefix(uri, "dart:") || strings.HasPrefix(uri, "package:") { - return "" - } - dir := "" - if i := strings.LastIndex(importingFile, "/"); i >= 0 { - dir = importingFile[:i] - } - target := joinRelativePath(dir, uri) - if target == "" { - return "" + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) } - if n := g.GetNode(target); n != nil && n.Kind == graph.KindFile { - return n.ID + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) } - return "" } // joinRelativePath joins a relative URI onto a directory and collapses diff --git a/internal/resolver/resolver.go b/internal/resolver/resolver.go index a99f79c1..499670ed 100644 --- a/internal/resolver/resolver.go +++ b/internal/resolver/resolver.go @@ -1,6 +1,7 @@ package resolver import ( + "iter" "path/filepath" "runtime" "sort" @@ -35,7 +36,7 @@ type ResolveStats struct { // Indexer.IndexFile) crash the daemon with "concurrent map writes" // in buildDirIndexes. type Resolver struct { - graph *graph.Graph + graph graph.Store dirIndex map[string][]*graph.Node lastDirIndex map[string][]*graph.Node // providesForIdx maps `provides_for: AbstractName` (from @Module @@ -68,7 +69,7 @@ type Resolver struct { // pass, torn down at the end. depModuleIndex map[string][]depModuleEntry // mu serialises resolution phases against the shared graph. - // Pointer so every Resolver built from the same *graph.Graph + // Pointer so every Resolver built from the same graph.Store // locks the same mutex — necessary for MultiIndexer's per-repo // goroutines, each of which spawns its own Resolver instance. // Without the shared lock, concurrent ResolveAll passes race on @@ -76,6 +77,19 @@ type Resolver struct { // goroutine iterates via graph.AllEdges()). mu *sync.Mutex + // lookupCache holds per-pass batched results from GetNodesByIDs / + // FindNodesByNames. Populated by ResolveAll/ResolveFile before + // the worker fan-out and cleared on return. Workers consult these + // maps first; misses fall through to the underlying Store. + // + // Without the cache, the resolver fires ~3-10 store point lookups + // per pending edge — across 10-30k unresolved edges that's 100k+ + // queries, each one a round trip on disk backends (~ms each). + // With the cache the same information lands in two batched + // queries per pass. + nodeByID map[string]*graph.Node + nodesByName map[string][]*graph.Node + // lspHelper, when non-nil, is consulted before falling back to // AST heuristics for cross-file dispatch in languages whose // helper-reported extensions match (today: TS/JS/JSX/TSX via @@ -121,14 +135,41 @@ type depModuleEntry struct { node *graph.Node } -// New creates a Resolver for the given graph. The returned Resolver -// shares graph.ResolveMutex() with every other Resolver built from -// the same Graph, so their ResolveAll / ResolveFile calls serialise -// end-to-end. -func New(g *graph.Graph) *Resolver { +// New creates a Resolver for the given store. The returned Resolver +// shares store.ResolveMutex() with every other Resolver built from +// the same Store, so their ResolveAll / ResolveFile calls serialise +// end-to-end across cross-repo / temporal / external passes. +func New(g graph.Store) *Resolver { return &Resolver{graph: g, mu: g.ResolveMutex()} } +// SetGraph retargets the Resolver at a different Store. The indexer's +// in-memory shadow-swap path needs this: the Resolver is constructed +// against the disk Store at indexer-New time, but during IndexCtx the +// indexer reassigns its own graph pointer to an in-memory shadow. +// Without SetGraph the Resolver kept reading the (empty) disk Store +// and short-circuited on len(pending) == 0, silently disabling every +// resolver pass for backends that opt into the shadow swap. +// +// Holds the resolve mutex so a concurrent ResolveAll / ResolveFile +// can't observe a half-rotated graph reference, and switches mu to +// the new store's resolve mutex so subsequent passes serialise +// against any Resolver built directly on the new Store. +func (r *Resolver) SetGraph(g graph.Store) { + if g == nil { + return + } + oldMu := r.mu + if oldMu != nil { + oldMu.Lock() + } + r.graph = g + r.mu = g.ResolveMutex() + if oldMu != nil { + oldMu.Unlock() + } +} + // ResolveAll resolves all unresolved edges in the graph. // // Edge resolution is partitioned across runtime.NumCPU() workers. @@ -159,20 +200,58 @@ func (r *Resolver) ResolveAll() *ResolveStats { defer r.clearReachabilityIndex() defer r.clearLSPIndex() - edges := r.graph.AllEdges() - // Pre-filter to the unresolved subset so workers don't burn time - // re-walking the whole edge slice — ~95% of edges in a settled - // graph are already resolved. - pending := edges[:0:0] - for _, e := range edges { - if strings.HasPrefix(e.To, unresolvedPrefix) { - pending = append(pending, e) + // Backend-delegated resolution: when the store implements + // graph.BackendResolver AND the GORTEX_BACKEND_RESOLVER env var + // is set, drain the bulk-tractable subset of the resolver's + // work via a sequence of Cypher / SQL / Datalog statements that + // run inside the backend engine. ResolveAllBulk chains the + // per-rule methods (SameFile → SamePackage → ImportAware → …) + // in precision-descending order, so higher-precision rules bind + // first and unique-name fallback only resolves what nothing + // more specific covered. + // + // This is the disk-only / large-repo path: when the in-memory + // shadow swap is disabled, the resolver's ~100k+ per-edge round + // trips dominate wall time. The bulk pass typically drains + // 50-80% of pending edges before the Go worker pool runs, and + // the remaining set fits cheaply into a single per-pass + // warmLookupCache. Errors are non-fatal — the Go resolver + // always re-runs on whatever's left. + if backendResolverEnabled() { + if br, ok := r.graph.(graph.BackendResolver); ok { + if n, err := br.ResolveAllBulk(); err != nil { + // Non-fatal: the Go path resolves the same edges + // correctly, just slower. + _ = n + } } } + + // Use the predicate-shaped Store method so disk backends scan + // only the contiguous "unresolved::*" slice instead of pulling + // the whole edges table back to the client and filtering in Go. + // In-memory keeps the same cost as the old AllEdges()+prefix-check + // loop. + var pending []*graph.Edge + for e := range r.graph.EdgesWithUnresolvedTarget() { + pending = append(pending, e) + } if len(pending) == 0 { return &ResolveStats{} } + // Pre-warm the per-pass lookup cache. The resolver workers below + // will call store.GetNode for endpoints and store.FindNodesByName + // for resolution candidates — across 10-30k pending edges that's + // 100k+ individual queries on a disk backend + // (hundreds of seconds wall time). Collecting the + // IDs / names upfront and batch-loading them collapses those + // queries to ~10 chunked SELECT IN statements. Cleared on return + // via defer so callers outside ResolveAll see the empty caches and + // fall through to the underlying store on every lookup. + r.warmLookupCache(pending) + defer r.clearLookupCache() + workers := runtime.NumCPU() if workers < 1 { workers = 1 @@ -237,6 +316,18 @@ func (r *Resolver) ResolveAll() *ResolveStats { // the race entirely; it costs ~5% of resolver wall time on a // 12k-edge vscode pass and buys a clean -race run plus simpler // reasoning. + // Collect every mutation across all workers into one slice and hand + // the whole batch to ReindexEdges. Disk-backed stores commit per + // chunk inside the implementation; the in-memory store loops + // through the existing per-edge code. Per-edge ReindexEdge was the + // resolver's bottleneck against bbolt (10k+ ACID round-trips); the + // batch form folds it to ≤(N/5000) commits without changing any + // observable semantics. + totalJobs := 0 + for i := range perWorkerJobs { + totalJobs += len(perWorkerJobs[i]) + } + reindexBatch := make([]graph.EdgeReindex, 0, totalJobs) for i := range perWorkerJobs { for _, j := range perWorkerJobs[i] { j.edge.To = j.newTo @@ -245,9 +336,10 @@ func (r *Resolver) ResolveAll() *ResolveStats { j.edge.Confidence = j.confidence j.edge.Origin = j.origin j.edge.Meta = j.meta - r.graph.ReindexEdge(j.edge, j.oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: j.edge, OldTo: j.oldTo}) } } + r.graph.ReindexEdges(reindexBatch) // Cross-package name-match guard. The heuristic fallbacks above can // resolve a call by name alone to a candidate in a package the @@ -263,6 +355,54 @@ func (r *Resolver) ResolveAll() *ResolveStats { } } + // Rebind cross-file Go method receivers onto the canonical type + // node ID. The Go extractor builds the EdgeMemberOf target as + // `::TypeName` because it parses one file at a time; + // methods declared in files other than the type's defining file + // point at a phantom ID until this pass collapses them onto the + // real `::TypeName` node. See rebindGoMethodReceivers + // for the full rationale (InferImplements + find_implementations + // + class_hierarchy correctness all ride on this). + r.rebindGoMethodReceivers() + + // Scope-aware bare-name binding. Walks `unresolved::` edges + // whose source is inside a function and rewrites them onto the + // matching KindLocal / KindParam node when exactly one in-scope + // binding wins under the Go shadowing rules. Without this pass + // the worker-pool fallback would scan FindNodesByName(name) + // across the whole graph and fall through to `unresolved::*` for + // every common identifier (err / data / src / ...). The bind + // uses #77's KindLocal nodes — pre-#77 there was nothing to + // bind to. + r.bindBareNameScopeRefs() + + // Bind in-body references to a function's own generic type + // parameters (`var x T`, `func F[T any]() T { ... }`) onto the + // pre-existing KindGenericParam nodes — without this pass they + // stayed as `unresolved::T` even though the parser had already + // materialised the tparam node. + r.bindGenericParamRefs() + + // Attribute Go language intrinsics (append / len / make / string + // / int / ...) to canonical `builtin::go::*` IDs and materialise + // one KindBuiltin node per unique builtin. Eliminates ~50k of + // the bare-name `unresolved::*` population on a Go-heavy + // codebase and turns the analytics queries that need these + // targets (`find_usages(builtin::go::type::float64)` for + // type-drift analysis) into one-hop lookups. + r.attributeGoBuiltins() + + // Materialise stdlib / dep / external call targets as + // KindFunction nodes with KindModule parents so cross-package + // queries (`find_usages(stdlib::fmt::Sprintf)`, + // `get_callers(dep::github.com/stretchr/testify/assert::True)`, + // "what's our usage surface on encoding/json") become one-hop + // lookups. Must run AFTER resolveExtern (which classifies + // `unresolved::extern::*` into the stdlib/dep/external buckets) + // so we materialise the post-classification state, not the + // pre-classification shape. + r.attributeGoExternalCalls() + // Relative-import resolution for Python and Dart files. Runs // before module attribution so internal-target stems never get // mis-mapped to a phantom pypi/pub package. @@ -301,13 +441,11 @@ func (r *Resolver) ResolveAll() *ResolveStats { // - lastDirIndex keys on the last path component of that directory // so an import of "logger" matches any file under .../logger/. func (r *Resolver) buildDirIndexes() { - nodes := r.graph.AllNodes() - r.dirIndex = make(map[string][]*graph.Node, len(nodes)/4) - r.lastDirIndex = make(map[string][]*graph.Node, len(nodes)/4) - for _, n := range nodes { - if n.Kind != graph.KindFile { - continue - } + r.dirIndex = make(map[string][]*graph.Node, 128) + r.lastDirIndex = make(map[string][]*graph.Node, 128) + // NodesByKind pushes the file-kind filter into the store; disk + // backends iterate just the file nodes instead of every node. + for n := range r.graph.NodesByKind(graph.KindFile) { dir := filepath.Dir(n.FilePath) r.dirIndex[dir] = append(r.dirIndex[dir], n) last := lastPathComponent(dir) @@ -322,6 +460,103 @@ func (r *Resolver) clearDirIndexes() { r.lastDirIndex = nil } +// warmLookupCache batches the per-edge GetNode / FindNodesByName +// queries the worker loop would otherwise fire serially. We collect +// every From/To node ID across the pending slice and the bare +// identifier name embedded in each `unresolved::*` target, then issue +// the two batched queries the Store exposes. Workers consult the +// resulting maps via cachedGetNode / cachedFindNodesByName; misses +// fall through to the underlying store. +func (r *Resolver) warmLookupCache(pending []*graph.Edge) { + if len(pending) == 0 { + return + } + idSet := make(map[string]struct{}, len(pending)*2) + nameSet := make(map[string]struct{}, len(pending)) + for _, e := range pending { + if e == nil { + continue + } + if e.From != "" { + idSet[e.From] = struct{}{} + } + // e.To at this point still carries the "unresolved::" prefix; + // pre-loading by that string isn't useful (no node has that + // id). We seed the name cache from the embedded identifier so + // the worker's FindNodesByName hit lands in the cache. + if name := identifierFromTarget(e.To); name != "" { + nameSet[name] = struct{}{} + } + } + ids := make([]string, 0, len(idSet)) + for id := range idSet { + ids = append(ids, id) + } + names := make([]string, 0, len(nameSet)) + for n := range nameSet { + names = append(names, n) + } + r.nodeByID = r.graph.GetNodesByIDs(ids) + r.nodesByName = r.graph.FindNodesByNames(names) + // Fold every candidate node returned by the name lookup into the + // id cache too: when a worker picks a candidate and the + // downstream guard (cross_pkg / cross_repo) calls GetNode on the + // chosen target, the cache should hit instead of falling through + // to a per-id store call. + if r.nodeByID == nil && len(r.nodesByName) > 0 { + r.nodeByID = make(map[string]*graph.Node, len(r.nodesByName)) + } + for _, hits := range r.nodesByName { + for _, n := range hits { + if n == nil || n.ID == "" { + continue + } + if _, ok := r.nodeByID[n.ID]; !ok { + r.nodeByID[n.ID] = n + } + } + } +} + +func (r *Resolver) clearLookupCache() { + r.nodeByID = nil + r.nodesByName = nil +} + +// cachedGetNode returns the node for id, consulting the per-pass +// lookup cache first and falling through to the underlying store on +// miss. The cache is a positive-only fast path — absence means "not +// pre-warmed", not "doesn't exist", so a miss still asks the store. +// Outside a ResolveAll pass the cache is nil and every call goes +// straight to the store. +func (r *Resolver) cachedGetNode(id string) *graph.Node { + if id == "" { + return nil + } + if r.nodeByID != nil { + if n, ok := r.nodeByID[id]; ok { + return n + } + } + return r.graph.GetNode(id) +} + +// cachedFindNodesByName returns the candidates for name, consulting +// the per-pass cache first and falling through to the store on miss. +// Returns the in-cache slice directly when hit — callers MUST treat +// the result as read-only. +func (r *Resolver) cachedFindNodesByName(name string) []*graph.Node { + if name == "" { + return nil + } + if r.nodesByName != nil { + if hits, ok := r.nodesByName[name]; ok { + return hits + } + } + return r.graph.FindNodesByName(name) +} + // buildDepModuleIndex collects every dep:: contract node // (one per non-indirect `require` line in a tracked go.mod) and groups // them by the owning repo's prefix so resolveImport can bridge a Go @@ -335,12 +570,8 @@ func (r *Resolver) clearDirIndexes() { // repo — those resolve through the cross-repo file graph instead and // have no module path embedded in the ID. func (r *Resolver) buildDepModuleIndex() { - nodes := r.graph.AllNodes() by := make(map[string][]depModuleEntry) - for _, n := range nodes { - if n.Kind != graph.KindContract { - continue - } + for n := range r.graph.NodesByKind(graph.KindContract) { if !strings.HasPrefix(n.ID, "dep::") { continue } @@ -396,20 +627,24 @@ func (r *Resolver) ResolveFile(filePath string) *ResolveStats { stats := &ResolveStats{} // Get all nodes in the file, then check their outgoing edges. - // Single-threaded path — apply ReindexEdge inline as before. - // Resolved edges are also recorded as jobs so the cross-package - // guard can re-check (and, if needed, revert) the weak-tier ones. + // Single-threaded path — collect mutations into a batch and flush + // in one ReindexEdges call after the file's edges are walked, so a + // per-file ResolveFile pass produces one Tx commit on disk + // backends instead of one per resolved edge. Resolved edges are + // also recorded as jobs so the cross-package guard can re-check + // (and, if needed, revert) the weak-tier ones. var jobs []reindexJob + var reindexBatch []graph.EdgeReindex nodes := r.graph.GetFileNodes(filePath) for _, n := range nodes { edges := r.graph.GetOutEdges(n.ID) for _, e := range edges { - if !strings.HasPrefix(e.To, unresolvedPrefix) { + if !graph.IsUnresolvedTarget(e.To) { continue } oldTo, changed := r.resolveEdge(e, stats) if changed { - r.graph.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) jobs = append(jobs, reindexJob{ edge: e, oldTo: oldTo, @@ -421,6 +656,9 @@ func (r *Resolver) ResolveFile(filePath string) *ResolveStats { } } } + if len(reindexBatch) > 0 { + r.graph.ReindexEdges(reindexBatch) + } // Cross-package name-match guard — same contract as in ResolveAll. if len(jobs) > 0 { @@ -435,6 +673,21 @@ func (r *Resolver) ResolveFile(filePath string) *ResolveStats { } } } + + // Re-run the attribution passes that ResolveAll runs. ResolveFile + // handles incremental updates — a re-parse of one file emits + // fresh `unresolved::` edges that haven't been seen by these + // passes yet, so without re-running them the incremental graph + // diverges from a cold re-index (caught by + // TestIncrementalReindex_ConvergesToFullIndex). Each pass is + // idempotent on already-rewritten edges (the `unresolved::` + // prefix check makes a second sweep a no-op). + r.rebindGoMethodReceivers() + r.bindBareNameScopeRefs() + r.bindGenericParamRefs() + r.attributeGoBuiltins() + r.attributeGoExternalCalls() + return stats } @@ -524,7 +777,18 @@ func releaseResolverClone(clone *graph.Edge) { // ResolveAll). When nothing changed the returned bool is false. func (r *Resolver) resolveEdge(e *graph.Edge, stats *ResolveStats) (oldTo string, changed bool) { oldTo = e.To - target := strings.TrimPrefix(e.To, unresolvedPrefix) + // graph.UnresolvedName handles both `unresolved::Name` (legacy) + // and `::unresolved::Name` (multi-repo COPY rewrite). + // strings.TrimPrefix only stripped the bare form, leaving every + // multi-repo edge with target=full-id and no downstream pattern + // match — that was the root cause of find_usages returning zero + // callers across the whole gortex repo. + target := graph.UnresolvedName(e.To) + if target == "" { + // Not an unresolved stub at all — fall through with the raw + // id so the pattern dispatch below sees the original value. + target = strings.TrimPrefix(e.To, unresolvedPrefix) + } // Resolve-time LSP hot-path. Consulted for TS/JS/JSX/TSX files // (and any other languages a future helper claims via @@ -633,7 +897,7 @@ func (r *Resolver) resolveEdge(e *graph.Edge, stats *ResolveStats) (oldTo string // every CLI-wired command and command-table entry looks // like dead code. if e.Kind == graph.EdgeReads && e.To != before { - if n := r.graph.GetNode(e.To); n != nil && (n.Kind == graph.KindFunction || n.Kind == graph.KindMethod) { + if n := r.cachedGetNode(e.To); n != nil && (n.Kind == graph.KindFunction || n.Kind == graph.KindMethod) { e.Kind = graph.EdgeReferences } } @@ -671,8 +935,11 @@ func (r *Resolver) resolveExtern(e *graph.Edge, spec string, stats *ResolveStats // Pass 1: does the symbol live in a file under this import path? // Reuse dirIndex populated by buildDirIndexes — no extra scan. + // cachedFindNodesByName lands in the per-pass batch cache for + // the common worker hot path; falls through to the store when + // called outside ResolveAll. callerRepo := r.callerRepoPrefix(e) - candidates := r.graph.FindNodesByName(symbol) + candidates := r.cachedFindNodesByName(symbol) for _, c := range candidates { if c.Kind != graph.KindFunction && c.Kind != graph.KindMethod && c.Kind != graph.KindType && c.Kind != graph.KindInterface { continue @@ -702,12 +969,15 @@ func (r *Resolver) resolveExtern(e *graph.Edge, spec string, stats *ResolveStats // Pass 2: classify the import path. "stdlib::" when the path looks // like a Go stdlib package (no dot in the first segment and not a // known module vendor prefix). "dep::" otherwise. Callers can treat - // both as external for edge-walk purposes. - prefix := "dep::" + // both as external for edge-walk purposes. The stdlib stub carries + // the caller's repo prefix (see internal/graph/stub.go) so two repos + // pinned to different Go SDK versions get distinct fmt::Errorf nodes + // instead of one shared, version-conflated terminal. if isStdlibLike(importPath) { - prefix = "stdlib::" + e.To = graph.StubID(callerRepo, graph.StubKindStdlib, importPath, symbol) + } else { + e.To = "dep::" + importPath + "::" + symbol } - e.To = prefix + importPath + "::" + symbol stats.External++ } @@ -805,10 +1075,7 @@ func (r *Resolver) resolveImport(e *graph.Edge, importPath string, stats *Resolv } } } else { - for _, n := range r.graph.AllNodes() { - if n.Kind != graph.KindFile { - continue - } + for n := range r.graph.NodesByKind(graph.KindFile) { dir := filepath.Dir(n.FilePath) if strings.HasSuffix(dir, lastPathComponent(importPath)) || dir == importPath { consider(n) @@ -1325,7 +1592,7 @@ func (r *Resolver) applyBuiltinIfKnown(e *graph.Edge, methodName string, stats * if !ok { return false } - e.To = "builtin::" + lang + "::" + category + "::" + methodName + e.To = graph.StubID(r.callerRepoPrefix(e), graph.StubKindBuiltin, lang, category, methodName) stats.External++ return true } @@ -1372,8 +1639,8 @@ func (r *Resolver) resolveTokenRef(e *graph.Edge, name string, stats *ResolveSta // comparisons that found nothing (vscode has zero NestJS modules). func (r *Resolver) buildProvidesForIndex() { idx := make(map[string]map[string]struct{}) - for _, ed := range r.graph.AllEdges() { - if ed.Kind != graph.EdgeProvides || ed.Meta == nil { + for ed := range r.graph.EdgesByKind(graph.EdgeProvides) { + if ed.Meta == nil { continue } pf, _ := ed.Meta["provides_for"].(string) @@ -1385,8 +1652,8 @@ func (r *Resolver) buildProvidesForIndex() { } to := ed.To var name string - if strings.HasPrefix(to, "unresolved::") { - name = strings.TrimPrefix(to, "unresolved::") + if graph.IsUnresolvedTarget(to) { + name = graph.UnresolvedName(to) } else if cut := strings.LastIndex(to, "::"); cut >= 0 { name = to[cut+2:] } else { @@ -1430,21 +1697,15 @@ func (r *Resolver) buildReachabilityIndex() { } // Seed with each indexed file's own directory. - for _, n := range r.graph.AllNodes() { - if n.Kind != graph.KindFile { - continue - } + for n := range r.graph.NodesByKind(graph.KindFile) { addDir(n.ID, filepath.Dir(n.FilePath)) } - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeImports { - continue - } + for e := range r.graph.EdgesByKind(graph.EdgeImports) { var importedDir string switch { - case strings.HasPrefix(e.To, "unresolved::import::"): - path := strings.TrimPrefix(e.To, "unresolved::import::") + case graph.IsUnresolvedTarget(e.To) && strings.HasPrefix(graph.UnresolvedName(e.To), "import::"): + path := strings.TrimPrefix(graph.UnresolvedName(e.To), "import::") if files := r.dirIndex[path]; len(files) > 0 { importedDir = filepath.Dir(files[0].FilePath) } else if last := lastPathComponent(path); last != "" { @@ -1531,6 +1792,202 @@ func nodeReceiverType(n *graph.Node) string { return "" } +// memberMethodInfosByType returns the storage layer's per-type member +// method projection verbatim. Routed through MemberMethodsByType when +// the backend implements it; falls back to an EdgesByKind + +// per-edge GetNode walk that synthesises matching info rows. +func memberMethodInfosByType(g graph.Store) map[string][]graph.MemberMethodInfo { + if cap, ok := g.(graph.MemberMethodsByType); ok { + return cap.MemberMethodsByType() + } + out := map[string][]graph.MemberMethodInfo{} + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + method := g.GetNode(e.From) + if method == nil || method.Kind != graph.KindMethod { + continue + } + out[e.To] = append(out[e.To], graph.MemberMethodInfo{ + MethodID: method.ID, + Name: method.Name, + FilePath: method.FilePath, + StartLine: method.StartLine, + RepoPrefix: method.RepoPrefix, + }) + } + return out +} + +// edgesByKinds yields every edge whose Kind is in the given set, +// using the EdgesByKindsScanner capability when the backend +// implements it (one Cypher IN-list scan) and falling back to a +// chain of per-kind EdgesByKind iterators otherwise. +func edgesByKinds(g graph.Store, kinds []graph.EdgeKind) iter.Seq[*graph.Edge] { + if scan, ok := g.(graph.EdgesByKindsScanner); ok { + return scan.EdgesByKinds(kinds) + } + return func(yield func(*graph.Edge) bool) { + for _, k := range kinds { + for e := range g.EdgesByKind(k) { + if !yield(e) { + return + } + } + } + } +} + +// nodesByKindsOrAll returns every node whose Kind is in the given +// set, using the NodesByKindsScanner capability when the backend +// implements it (a single Cypher kind-IN scan, one C-string column +// per row) and falling back to AllNodes + Go-side filter otherwise. +func nodesByKindsOrAll(g graph.Store, kinds ...graph.NodeKind) []*graph.Node { + if scan, ok := g.(graph.NodesByKindsScanner); ok { + return scan.NodesByKinds(kinds) + } + set := make(map[graph.NodeKind]struct{}, len(kinds)) + for _, k := range kinds { + set[k] = struct{}{} + } + var out []*graph.Node + for _, n := range g.AllNodes() { + if n == nil { + continue + } + if _, ok := set[n.Kind]; ok { + out = append(out, n) + } + } + return out +} + +// memberMethodsByType returns typeID → method-name-set for every +// EdgeMemberOf edge whose source is a KindMethod node. Routed through +// the storage layer's MemberMethodsByType capability when the backend +// implements it (one Cypher join, server-side), falling back to the +// EdgesByKind + per-edge GetNode loop the resolver used before the +// capability landed. Used by InferImplements (and shaped to match its +// existing map[string]map[string]bool API). +func memberMethodsByType(g graph.Store) map[string]map[string]bool { + if cap, ok := g.(graph.MemberMethodsByType); ok { + raw := cap.MemberMethodsByType() + if len(raw) == 0 { + return nil + } + out := make(map[string]map[string]bool, len(raw)) + for typeID, methods := range raw { + set := make(map[string]bool, len(methods)) + for _, m := range methods { + set[m.Name] = true + } + out[typeID] = set + } + return out + } + out := map[string]map[string]bool{} + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + methodNode := g.GetNode(e.From) + if methodNode == nil || methodNode.Kind != graph.KindMethod { + continue + } + if out[e.To] == nil { + out[e.To] = make(map[string]bool) + } + out[e.To][methodNode.Name] = true + } + return out +} + +// memberMethodNodesByType returns typeID → name → method-node for +// every EdgeMemberOf edge whose source is a KindMethod node. Routed +// through the storage layer's MemberMethodsByType capability when the +// backend implements it (the projection ships only the four columns +// the consumer reads — ID / Name / FilePath / StartLine — packed into +// a synthetic *Node that carries no Meta / QualName / Language); falls +// back to the EdgesByKind + per-edge GetNode loop otherwise. Used by +// InferOverrides which keys methods by name and reads ID/FilePath/ +// StartLine off the node when it emits an EdgeOverrides edge. +func memberMethodNodesByType(g graph.Store) map[string]map[string]*graph.Node { + if cap, ok := g.(graph.MemberMethodsByType); ok { + raw := cap.MemberMethodsByType() + if len(raw) == 0 { + return nil + } + out := make(map[string]map[string]*graph.Node, len(raw)) + for typeID, methods := range raw { + set := make(map[string]*graph.Node, len(methods)) + for _, m := range methods { + set[m.Name] = &graph.Node{ + ID: m.MethodID, + Kind: graph.KindMethod, + Name: m.Name, + FilePath: m.FilePath, + StartLine: m.StartLine, + RepoPrefix: m.RepoPrefix, + } + } + out[typeID] = set + } + return out + } + out := map[string]map[string]*graph.Node{} + for e := range g.EdgesByKind(graph.EdgeMemberOf) { + method := g.GetNode(e.From) + if method == nil || method.Kind != graph.KindMethod { + continue + } + set := out[e.To] + if set == nil { + set = make(map[string]*graph.Node) + out[e.To] = set + } + set[method.Name] = method + } + return out +} + +// structuralParentEdges returns every EdgeExtends / EdgeImplements / +// EdgeComposes edge whose endpoints are both KindType / KindInterface, +// projected as the (FromID, ToID, Origin) tuples InferOverrides +// consumes. Routed through the storage layer's StructuralParentEdges +// capability when the backend implements it (one Cypher join with +// kind filters on both sides — no per-edge GetNode); falls back to +// the AllEdges + per-edge GetNode walk otherwise. +func structuralParentEdges(g graph.Store) []graph.StructuralParentEdgeRow { + if cap, ok := g.(graph.StructuralParentEdges); ok { + return cap.StructuralParentEdges() + } + parentKinds := map[graph.EdgeKind]bool{ + graph.EdgeExtends: true, + graph.EdgeImplements: true, + graph.EdgeComposes: true, + } + var out []graph.StructuralParentEdgeRow + for _, e := range g.AllEdges() { + if e == nil || !parentKinds[e.Kind] { + continue + } + from := g.GetNode(e.From) + to := g.GetNode(e.To) + if from == nil || to == nil { + continue + } + if from.Kind != graph.KindType && from.Kind != graph.KindInterface { + continue + } + if to.Kind != graph.KindType && to.Kind != graph.KindInterface { + continue + } + out = append(out, graph.StructuralParentEdgeRow{ + FromID: from.ID, + ToID: to.ID, + FromKind: from.Kind, + ToKind: to.Kind, + Origin: e.Origin, + }) + } + return out +} + // InferImplements detects structural interface satisfaction by comparing // method sets and adds EdgeImplements edges from types to interfaces. // Returns the number of edges added. @@ -1543,11 +2000,7 @@ func (r *Resolver) InferImplements() int { } var ifaces []ifaceInfo - allNodes := r.graph.AllNodes() - for _, n := range allNodes { - if n.Kind != graph.KindInterface { - continue - } + for n := range r.graph.NodesByKind(graph.KindInterface) { if n.Meta == nil { continue } @@ -1580,23 +2033,7 @@ func (r *Resolver) InferImplements() int { } // Step 2: Build map of type ID -> set of method names via EdgeMemberOf edges. - typeMethods := make(map[string]map[string]bool) - allEdges := r.graph.AllEdges() - for _, e := range allEdges { - if e.Kind != graph.EdgeMemberOf { - continue - } - // EdgeMemberOf: From=method, To=type - methodNode := r.graph.GetNode(e.From) - if methodNode == nil || methodNode.Kind != graph.KindMethod { - continue - } - typeID := e.To - if typeMethods[typeID] == nil { - typeMethods[typeID] = make(map[string]bool) - } - typeMethods[typeID][methodNode.Name] = true - } + typeMethods := memberMethodsByType(r.graph) // Step 3: For each type, check if its method set satisfies each interface. // @@ -1616,6 +2053,12 @@ func (r *Resolver) InferImplements() int { typeList = append(typeList, tid) } + // Prefetch every type node referenced by EdgeMemberOf in one batch + // before the workers spin up — on disk backends a per-worker + // GetNode(typeID) was an N+1 over cgo that the workers' parallelism + // could not hide. + typeNodes := r.graph.GetNodesByIDs(typeList) + workers := runtime.NumCPU() if workers < 1 { workers = 1 @@ -1645,7 +2088,7 @@ func (r *Resolver) InferImplements() int { var out []pair for _, typeID := range slice { methods := typeMethods[typeID] - typeNode := r.graph.GetNode(typeID) + typeNode := typeNodes[typeID] if typeNode == nil || (typeNode.Kind != graph.KindType && typeNode.Kind != graph.KindInterface) { continue } @@ -1723,22 +2166,7 @@ func (r *Resolver) InferOverrides() int { defer r.mu.Unlock() // Step 1: index methods by their owning type via EdgeMemberOf. - typeMembers := make(map[string]map[string]*graph.Node) // typeID → name → method node - for _, e := range r.graph.AllEdges() { - if e.Kind != graph.EdgeMemberOf { - continue - } - method := r.graph.GetNode(e.From) - if method == nil || method.Kind != graph.KindMethod { - continue - } - set := typeMembers[e.To] - if set == nil { - set = make(map[string]*graph.Node) - typeMembers[e.To] = set - } - set[method.Name] = method - } + typeMembers := memberMethodNodesByType(r.graph) // typeID → name → method node if len(typeMembers) == 0 { return 0 } @@ -1747,33 +2175,17 @@ func (r *Resolver) InferOverrides() int { // edge, walk the child's methods and emit EdgeOverrides where the // parent has a same-named method. Skip if the override edge // already exists. - parentKinds := map[graph.EdgeKind]bool{ - graph.EdgeExtends: true, - graph.EdgeImplements: true, - graph.EdgeComposes: true, - } type overridePair struct { from, to *graph.Node origin string } var pending []overridePair - for _, e := range r.graph.AllEdges() { - if !parentKinds[e.Kind] { - continue - } - child := r.graph.GetNode(e.From) - parent := r.graph.GetNode(e.To) - if child == nil || parent == nil || child.ID == parent.ID { + for _, row := range structuralParentEdges(r.graph) { + if row.FromID == row.ToID { continue } - if child.Kind != graph.KindType && child.Kind != graph.KindInterface { - continue - } - if parent.Kind != graph.KindType && parent.Kind != graph.KindInterface { - continue - } - childMethods := typeMembers[child.ID] - parentMethods := typeMembers[parent.ID] + childMethods := typeMembers[row.FromID] + parentMethods := typeMembers[row.ToID] if len(childMethods) == 0 || len(parentMethods) == 0 { continue } @@ -1781,10 +2193,10 @@ func (r *Resolver) InferOverrides() int { // the override edge so blast-radius queries can filter by // min_tier consistently. origin := graph.OriginASTInferred - if e.Origin == graph.OriginASTResolved { + if row.Origin == graph.OriginASTResolved { origin = graph.OriginASTResolved - } else if rank := graph.OriginRank(e.Origin); rank >= graph.OriginRank(graph.OriginLSPDispatch) { - origin = e.Origin + } else if rank := graph.OriginRank(row.Origin); rank >= graph.OriginRank(graph.OriginLSPDispatch) { + origin = row.Origin } for name, cm := range childMethods { pm, ok := parentMethods[name] @@ -1796,6 +2208,7 @@ func (r *Resolver) InferOverrides() int { } added := 0 + var provBatch []graph.EdgeProvenanceUpdate for _, p := range pending { // Skip when the edge already exists. dup := false @@ -1803,11 +2216,13 @@ func (r *Resolver) InferOverrides() int { if existing.Kind == graph.EdgeOverrides && existing.To == p.to.ID { dup = true // Upgrade the provenance of the existing override edge - // through SetEdgeProvenance so the identity change is - // counted — a bare existing.Origin write would bypass - // the revision counter. + // through SetEdgeProvenanceBatch so the identity change + // is counted — a bare existing.Origin write would + // bypass the revision counter. Batched so a large + // hierarchy pass commits its provenance bumps in + // chunks on disk backends. if graph.OriginRank(existing.Origin) < graph.OriginRank(p.origin) { - r.graph.SetEdgeProvenance(existing, p.origin) + provBatch = append(provBatch, graph.EdgeProvenanceUpdate{Edge: existing, NewOrigin: p.origin}) } break } @@ -1827,6 +2242,9 @@ func (r *Resolver) InferOverrides() int { }) added++ } + if len(provBatch) > 0 { + r.graph.SetEdgeProvenanceBatch(provBatch) + } return added } diff --git a/internal/resolver/temporal_calls.go b/internal/resolver/temporal_calls.go index af4b7ee7..03003e17 100644 --- a/internal/resolver/temporal_calls.go +++ b/internal/resolver/temporal_calls.go @@ -72,7 +72,7 @@ const ( // // Returns the number of temporal.stub edges pointing at a resolved // handler after the pass. -func ResolveTemporalCalls(g *graph.Graph) int { +func ResolveTemporalCalls(g graph.Store) int { if g == nil { return 0 } @@ -87,8 +87,17 @@ func ResolveTemporalCalls(g *graph.Graph) int { defer mu.Unlock() idx := buildTemporalIndex(g) resolved := 0 - for _, e := range g.AllEdges() { - if e == nil || e.Kind != graph.EdgeCalls || e.Meta == nil { + var reindexBatch []graph.EdgeReindex + // First sweep: collect stub edges and the From IDs we need so the + // per-edge GetNode below collapses to one batch lookup. + type stubEdge struct { + edge *graph.Edge + kind, name string + } + var stubs []stubEdge + fromIDSet := map[string]struct{}{} + for e := range g.EdgesByKind(graph.EdgeCalls) { + if e == nil || e.Meta == nil { continue } if v, _ := e.Meta["via"].(string); v != "temporal.stub" { @@ -99,16 +108,28 @@ func ResolveTemporalCalls(g *graph.Graph) int { if kind == "" || name == "" { continue } + stubs = append(stubs, stubEdge{edge: e, kind: kind, name: name}) + if e.From != "" { + fromIDSet[e.From] = struct{}{} + } + } + fromList := make([]string, 0, len(fromIDSet)) + for id := range fromIDSet { + fromList = append(fromList, id) + } + callerNodes := g.GetNodesByIDs(fromList) + for _, s := range stubs { + e := s.edge callerRepo := "" - if from := g.GetNode(e.From); from != nil { + if from := callerNodes[e.From]; from != nil { callerRepo = from.RepoPrefix } - handlerID, origin, conf := idx.lookup(kind, name, callerRepo) + handlerID, origin, conf := idx.lookup(s.kind, s.name, callerRepo) want := handlerID if want == "" { - want = temporalStubPlaceholder(kind, name) + want = temporalStubPlaceholder(s.kind, s.name) } if e.To == want { if handlerID != "" { @@ -131,7 +152,10 @@ func ResolveTemporalCalls(g *graph.Graph) int { e.ConfidenceLabel = "" delete(e.Meta, "temporal_resolution") } - g.ReindexEdge(e, oldTo) + reindexBatch = append(reindexBatch, graph.EdgeReindex{Edge: e, OldTo: oldTo}) + } + if len(reindexBatch) > 0 { + g.ReindexEdges(reindexBatch) } return resolved } @@ -177,12 +201,24 @@ func (idx *temporalIndex) lookup(kind, name, callerRepo string) (id, origin stri // `@WorkflowInterface` annotations (propagated to interface // implementors), and (b) returns a name index the stub-call resolver // consults. -func buildTemporalIndex(g *graph.Graph) *temporalIndex { +func buildTemporalIndex(g graph.Store) *temporalIndex { idx := &temporalIndex{byKindName: map[string][]*graph.Node{}} // Phase 1 — Go side. Walk `temporal.register` edges and stamp the - // registered function's node. - for _, e := range g.AllEdges() { + // registered function's node. The "via" tag lives on EdgeCalls + // edges, so narrow with EdgesByKind before the Meta filter. + // + // Collect every register edge first so we can batch-fetch every + // caller node and resolve every Go target name in one pair of + // round-trips, instead of N AllNodes scans + N GetNode calls. + type goRegister struct { + edge *graph.Edge + kind, name string + } + var goRegisters []goRegister + registerCallerIDs := map[string]struct{}{} + registerNames := map[string]struct{}{} + for e := range g.EdgesByKind(graph.EdgeCalls) { if e == nil || e.Meta == nil { continue } @@ -194,48 +230,85 @@ func buildTemporalIndex(g *graph.Graph) *temporalIndex { if kind == "" || name == "" { continue } - caller := g.GetNode(e.From) + goRegisters = append(goRegisters, goRegister{edge: e, kind: kind, name: name}) + if e.From != "" { + registerCallerIDs[e.From] = struct{}{} + } + registerNames[name] = struct{}{} + } + callerList := make([]string, 0, len(registerCallerIDs)) + for id := range registerCallerIDs { + callerList = append(callerList, id) + } + registerCallers := g.GetNodesByIDs(callerList) + nameList := make([]string, 0, len(registerNames)) + for n := range registerNames { + nameList = append(nameList, n) + } + candidatesByName := g.FindNodesByNames(nameList) + + for _, r := range goRegisters { + caller := registerCallers[r.edge.From] if caller == nil { continue } - target := findGoTemporalTarget(g, caller, name) + target := pickGoTemporalTarget(candidatesByName[r.name], caller) if target == nil { continue } - stampTemporalRole(target, kind, name) - idx.byKindName[kind+"::"+name] = append(idx.byKindName[kind+"::"+name], target) + stampTemporalRole(g, target, r.kind, r.name) + idx.byKindName[r.kind+"::"+r.name] = append(idx.byKindName[r.kind+"::"+r.name], target) } // Phase 2 — Java side. Walk `EdgeAnnotated` edges to find - // temporal-tagged interfaces and methods. - type javaIfaceTag struct { - ifaceID string - role string // "activity_interface" / "workflow_interface" + // temporal-tagged interfaces and methods. As with Phase 1, collect + // every annotation edge and batch the From-side GetNode calls. + type javaAnno struct { + fromID string + ifaceRole, methodRole string } - var javaIfaces []javaIfaceTag - for _, e := range g.AllEdges() { - if e == nil || e.Kind != graph.EdgeAnnotated { + var javaAnnos []javaAnno + annoFromIDs := map[string]struct{}{} + for e := range g.EdgesByKind(graph.EdgeAnnotated) { + if e == nil { continue } role, methodRole := temporalRoleForJavaAnnotation(e.To) if role == "" && methodRole == "" { continue } - from := g.GetNode(e.From) + javaAnnos = append(javaAnnos, javaAnno{fromID: e.From, ifaceRole: role, methodRole: methodRole}) + if e.From != "" { + annoFromIDs[e.From] = struct{}{} + } + } + annoFromList := make([]string, 0, len(annoFromIDs)) + for id := range annoFromIDs { + annoFromList = append(annoFromList, id) + } + annoFromNodes := g.GetNodesByIDs(annoFromList) + + type javaIfaceTag struct { + ifaceID string + role string // "activity_interface" / "workflow_interface" + } + var javaIfaces []javaIfaceTag + for _, a := range javaAnnos { + from := annoFromNodes[a.fromID] if from == nil { continue } // Method-level annotation: stamp directly. - if methodRole != "" && (from.Kind == graph.KindMethod || from.Kind == graph.KindFunction) { - stampTemporalRole(from, methodRole, from.Name) - idx.byKindName[normaliseTemporalKind(methodRole)+"::"+from.Name] = append( - idx.byKindName[normaliseTemporalKind(methodRole)+"::"+from.Name], from) + if a.methodRole != "" && (from.Kind == graph.KindMethod || from.Kind == graph.KindFunction) { + stampTemporalRole(g, from, a.methodRole, from.Name) + idx.byKindName[normaliseTemporalKind(a.methodRole)+"::"+from.Name] = append( + idx.byKindName[normaliseTemporalKind(a.methodRole)+"::"+from.Name], from) continue } // Interface-level annotation: queue for the propagation pass. - if role != "" && from.Kind == graph.KindInterface { - stampTemporalRole(from, role, from.Name) - javaIfaces = append(javaIfaces, javaIfaceTag{ifaceID: from.ID, role: role}) + if a.ifaceRole != "" && from.Kind == graph.KindInterface { + stampTemporalRole(g, from, a.ifaceRole, from.Name) + javaIfaces = append(javaIfaces, javaIfaceTag{ifaceID: from.ID, role: a.ifaceRole}) } } @@ -243,14 +316,57 @@ func buildTemporalIndex(g *graph.Graph) *temporalIndex { // methods (flat nodes living in the same file, within the // interface's line range) and stamp them. Then walk EdgeImplements // from each implementor and tag its same-named methods. + // + // Build a single Java method index up front via NodesByKind, then + // project it into the two views the propagation needs: + // - methodsByFile: file path → []*method (used for interface + // methods, which the Java extractor emits as flat + // :: nodes whose StartLine sits inside the + // interface's line range). + // - methodsByReceiver: receiver class name → []*method (used for + // impl-class methods, which carry Meta["receiver"]). + // One pass beats AllNodes() per interface. + javaMethodsByFile, javaMethodsByReceiver := buildJavaMethodViews(g, len(javaIfaces)) + + // Prefetch the interface nodes + the implementing-type nodes for + // the entire iface set so the propagation loop never issues an + // inline GetNode. + ifaceIDs := make([]string, 0, len(javaIfaces)) + for _, t := range javaIfaces { + ifaceIDs = append(ifaceIDs, t.ifaceID) + } + ifaceNodes := g.GetNodesByIDs(ifaceIDs) + implTypeIDSet := map[string]struct{}{} + implIDsByIface := map[string][]string{} + for _, t := range javaIfaces { + for _, ie := range g.GetInEdges(t.ifaceID) { + if ie == nil || ie.Kind != graph.EdgeImplements { + continue + } + implIDsByIface[t.ifaceID] = append(implIDsByIface[t.ifaceID], ie.From) + if ie.From != "" { + implTypeIDSet[ie.From] = struct{}{} + } + } + } + implTypeIDList := make([]string, 0, len(implTypeIDSet)) + for id := range implTypeIDSet { + implTypeIDList = append(implTypeIDList, id) + } + implTypeNodes := g.GetNodesByIDs(implTypeIDList) + for _, t := range javaIfaces { methodRole := "activity" if t.role == "workflow_interface" { methodRole = "workflow" } - ifaceMethods := collectJavaInterfaceMethods(g, t.ifaceID) + iface := ifaceNodes[t.ifaceID] + if iface == nil { + continue + } + ifaceMethods := collectJavaInterfaceMethodsFromIndex(iface, javaMethodsByFile) for _, m := range ifaceMethods { - stampTemporalRole(m, methodRole, m.Name) + stampTemporalRole(g, m, methodRole, m.Name) idx.byKindName[methodRole+"::"+m.Name] = append(idx.byKindName[methodRole+"::"+m.Name], m) } // Propagate to implementing classes' methods. @@ -258,19 +374,16 @@ func buildTemporalIndex(g *graph.Graph) *temporalIndex { for _, m := range ifaceMethods { implMethodNames[m.Name] = struct{}{} } - for _, ie := range g.GetInEdges(t.ifaceID) { - if ie == nil || ie.Kind != graph.EdgeImplements { - continue - } - implType := g.GetNode(ie.From) + for _, implTypeID := range implIDsByIface[t.ifaceID] { + implType := implTypeNodes[implTypeID] if implType == nil { continue } - for _, m := range methodsOfJavaType(g, implType) { + for _, m := range methodsOfJavaTypeFromIndex(implType, javaMethodsByReceiver) { if _, ok := implMethodNames[m.Name]; !ok { continue } - stampTemporalRole(m, methodRole, m.Name) + stampTemporalRole(g, m, methodRole, m.Name) idx.byKindName[methodRole+"::"+m.Name] = append(idx.byKindName[methodRole+"::"+m.Name], m) } } @@ -319,7 +432,7 @@ func normaliseTemporalKind(role string) string { // a previously-stamped node is re-stamped with a different role the // new role wins (the resolver runs as a full recompute, so this lets // the latest registration take precedence). -func stampTemporalRole(n *graph.Node, role, name string) { +func stampTemporalRole(g graph.Store, n *graph.Node, role, name string) { if n == nil || role == "" { return } @@ -330,22 +443,36 @@ func stampTemporalRole(n *graph.Node, role, name string) { if name != "" { n.Meta["temporal_name"] = name } + // Round-trip the stamp back through the store. On the in-memory + // backend n is canonical so this is an idempotent re-insert; on disk + // backends (Ladybug) n is a per-call GetNode/AllNodes reconstruction, + // so without the write-back temporal_role/temporal_name would be + // discarded the moment this pass returns. ResolveTemporalCalls runs + // from RunGlobalGraphPasses, which can execute after the bulk-load + // buffer is flushed, so the in-place mutation is not otherwise + // captured. Matches reach / coverage / blame / releases / churn. + g.AddNode(n) } -// findGoTemporalTarget locates the Go function or method that a -// `worker.Register*(F)` call refers to. The register call lives at -// `caller` (typically `main` or a worker setup function); the function -// `F` is either declared in the same file or imported. The search -// order is: +// pickGoTemporalTarget selects the Go function or method that a +// `worker.Register*(F)` call refers to from a name-matched candidate +// set. The register call lives at `caller`; the function `F` is +// either declared in the same file or imported. The search order is: // // 1. Same-file function whose name matches. // 2. Same-repo function whose name matches. // 3. Unique workspace-wide function whose name matches. // -// Returns nil when no unambiguous match exists. -func findGoTemporalTarget(g *graph.Graph, caller *graph.Node, name string) *graph.Node { +// Returns nil when no unambiguous match exists. The candidate list +// MUST be pre-filtered to Name == registered name (FindNodesByNames +// already does that); this helper applies the Go-kind and language +// gates plus the locality tie-break. +func pickGoTemporalTarget(candidates []*graph.Node, caller *graph.Node) *graph.Node { + if caller == nil { + return nil + } var sameFile, sameRepo, all []*graph.Node - for _, n := range g.AllNodes() { + for _, n := range candidates { if n == nil { continue } @@ -355,9 +482,6 @@ func findGoTemporalTarget(g *graph.Graph, caller *graph.Node, name string) *grap if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue } - if n.Name != name { - continue - } all = append(all, n) if caller.RepoPrefix != "" && n.RepoPrefix == caller.RepoPrefix { sameRepo = append(sameRepo, n) @@ -378,28 +502,47 @@ func findGoTemporalTarget(g *graph.Graph, caller *graph.Node, name string) *grap return nil } -// collectJavaInterfaceMethods returns the interface's method nodes. -// The Java extractor emits interface methods as flat -// `::` nodes (no class-membership edge), -// distinguished from class methods by the absence of a "receiver" -// Meta. We narrow to the interface's source-line range so multiple -// interfaces in one file don't bleed into each other. -func collectJavaInterfaceMethods(g *graph.Graph, ifaceID string) []*graph.Node { - iface := g.GetNode(ifaceID) - if iface == nil { - return nil +// buildJavaMethodViews materialises two indexes over every Java +// method node in the graph: methodsByFile groups nodes whose Meta has +// NO "receiver" (interface methods, per the Java extractor's +// convention); methodsByReceiver groups nodes whose Meta carries a +// non-empty receiver. One NodesByKind scan replaces the N AllNodes() +// passes the old collectJavaInterfaceMethods + methodsOfJavaType +// helpers ran inside the per-interface propagation loop. +// +// ifaceCount == 0 is a fast no-op; with no tagged interfaces the +// indexes are unused so we skip the scan. +func buildJavaMethodViews(g graph.Store, ifaceCount int) (map[string][]*graph.Node, map[string][]*graph.Node) { + if ifaceCount == 0 { + return nil, nil } - var out []*graph.Node - for _, n := range g.AllNodes() { - if n == nil || n.Kind != graph.KindMethod || n.Language != "java" { + methodsByFile := map[string][]*graph.Node{} + methodsByReceiver := map[string][]*graph.Node{} + for n := range g.NodesByKind(graph.KindMethod) { + if n == nil || n.Language != "java" { continue } - if n.FilePath != iface.FilePath { - continue - } - if _, hasReceiver := n.Meta["receiver"]; hasReceiver { - continue + recv, _ := n.Meta["receiver"].(string) + if recv == "" { + methodsByFile[n.FilePath] = append(methodsByFile[n.FilePath], n) + } else { + methodsByReceiver[recv] = append(methodsByReceiver[recv], n) } + } + return methodsByFile, methodsByReceiver +} + +// collectJavaInterfaceMethodsFromIndex returns the interface's method +// nodes — flat KindMethod nodes in the interface's file whose +// StartLine sits inside the interface's line range. Consumes the +// methodsByFile view built by buildJavaMethodViews so the scan is +// O(methods in this file) rather than O(every node). +func collectJavaInterfaceMethodsFromIndex(iface *graph.Node, methodsByFile map[string][]*graph.Node) []*graph.Node { + if iface == nil { + return nil + } + var out []*graph.Node + for _, n := range methodsByFile[iface.FilePath] { if n.StartLine < iface.StartLine || (iface.EndLine > 0 && n.StartLine > iface.EndLine) { continue } @@ -408,27 +551,28 @@ func collectJavaInterfaceMethods(g *graph.Graph, ifaceID string) []*graph.Node { return out } -// methodsOfJavaType returns the method nodes of a Java class — i.e. -// every KindMethod node whose Meta["receiver"] matches the type name. -// The Java extractor uses the receiver field for class membership. -func methodsOfJavaType(g *graph.Graph, t *graph.Node) []*graph.Node { +// methodsOfJavaTypeFromIndex returns the method nodes whose +// Meta["receiver"] matches the type's name (or the receiver-suffix +// shape on the class node's ID). Consumes the methodsByReceiver view +// built by buildJavaMethodViews so the scan is O(methods of this +// receiver) rather than O(every node). +func methodsOfJavaTypeFromIndex(t *graph.Node, methodsByReceiver map[string][]*graph.Node) []*graph.Node { if t == nil { return nil } - var out []*graph.Node - for _, n := range g.AllNodes() { - if n == nil || n.Kind != graph.KindMethod || n.Language != "java" { + out := methodsByReceiver[t.Name] + // Honour the legacy id-suffix tie-break: a class node's id is + // `::`; a method whose receiver matches that + // trailing component is still a member even when the receiver + // Meta carries a fully-qualified name. + for recv, candidates := range methodsByReceiver { + if recv == t.Name { continue } - recv, _ := n.Meta["receiver"].(string) - if recv == "" { + if !strings.HasSuffix(t.ID, "::"+recv) { continue } - // Java method node receiver is the class name; the class node's - // ID shape is `::` so match by suffix. - if recv == t.Name || strings.HasSuffix(t.ID, "::"+recv) { - out = append(out, n) - } + out = append(out, candidates...) } return out } diff --git a/internal/resolver/temporal_calls_test.go b/internal/resolver/temporal_calls_test.go index 7e2c4a9c..82c7922d 100644 --- a/internal/resolver/temporal_calls_test.go +++ b/internal/resolver/temporal_calls_test.go @@ -14,7 +14,7 @@ import ( // either a Go register-call edge or a Java @ActivityInterface + // EdgeImplements chain that names the activity. type temporalTestGraph struct { - g *graph.Graph + g graph.Store } func newTemporalTestGraph() *temporalTestGraph { return &temporalTestGraph{g: graph.New()} } diff --git a/internal/search/hybrid.go b/internal/search/hybrid.go index 13171e4b..61f63899 100644 --- a/internal/search/hybrid.go +++ b/internal/search/hybrid.go @@ -70,7 +70,7 @@ func (h *HybridBackend) Remove(id string) { // for natural-language queries (where semantic similarity catches // synonymous wording). func (h *HybridBackend) Search(query string, limit int) []SearchResult { - textResults, vecIDs := h.searchChannels(query, limit) + textResults, vecIDs, _ := h.searchChannels(query, limit) if len(vecIDs) == 0 { if len(textResults) > limit { return textResults[:limit] @@ -89,17 +89,93 @@ func (h *HybridBackend) Search(query string, limit int) []SearchResult { // contribute as a separate Signal instead of being collapsed into a // single RRF score upstream of the rerank. func (h *HybridBackend) SearchChannels(query string, limit int) (textResults []SearchResult, vectorIDs []string) { + textResults, vectorIDs, _ = h.searchChannels(query, limit) + return textResults, vectorIDs +} + +// ChannelTimings carries per-phase wall-clock numbers from one +// SearchChannelsTimed call. Zero fields = phase didn't run (e.g. +// VectorSearchMS=0 when the vector index is empty). +type ChannelTimings struct { + TextMS int64 + EmbedMS int64 + VectorSearchMS int64 +} + +// VectorChannelOnly returns the vector-channel IDs (embedder + ANN +// search) WITHOUT re-running the text BM25 path. Used by the engine +// when the text channel has already been satisfied via the bundle +// path — the bundle returns Nodes + edges + scores already, so +// re-running text Search would double-pay the FTS cost. Returns +// nil and a zero ChannelTimings when the vector index is empty. +func (h *HybridBackend) VectorChannelOnly(query string, limit int) ([]string, ChannelTimings) { + var stats ChannelTimings + if h == nil || h.vector == nil || h.vector.Count() == 0 { + return nil, stats + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + embedStart := time.Now() + queryVec, err := h.embedder.Embed(ctx, query) + stats.EmbedMS = time.Since(embedStart).Milliseconds() + if err != nil || queryVec == nil { + return nil, stats + } + fetch := limit * 2 + if h.vector.HasChunks() { + fetch = limit * 8 + } + vecStart := time.Now() + rawVecIDs := h.vector.Search(queryVec, fetch) + stats.VectorSearchMS = time.Since(vecStart).Milliseconds() + return h.dechunkVectorIDs(rawVecIDs, limit*2), stats +} + +// SearchChannelsTimed is SearchChannels with a per-phase timing +// breakdown so callers can prove which sub-step (text BM25 vs +// vector embed vs vector ANN) actually cost wall-clock time. +// Used by the MCP search_symbols handler's debug-log +// instrumentation; production callers that don't care just use +// SearchChannels. +func (h *HybridBackend) SearchChannelsTimed(query string, limit int) ([]SearchResult, []string, ChannelTimings) { return h.searchChannels(query, limit) } -func (h *HybridBackend) searchChannels(query string, limit int) ([]SearchResult, []string) { +// SearchSymbolBundles forwards to the text backend's bundle path when +// it implements SymbolBundleSearcherBackend. The vector channel does +// not participate — its IDs ride out through SearchChannels/Timed as +// before and the engine merges them with the bundle set. Returns nil +// when the text backend has no bundle support (no-op for the +// fallback path). +// +// HybridBackend wires both channels together in production, so the +// engine's bundle-detection step type-asserts on the outer +// HybridBackend through Swappable; this is what makes the bundle +// path available when the daemon's search is the BM25 + vector +// stack instead of a bare SymbolSearcherBackend. +func (h *HybridBackend) SearchSymbolBundles(query string, limit int) []SymbolBundle { + if h == nil || h.text == nil { + return nil + } + if bs, ok := h.text.(SymbolBundleSearcherBackend); ok { + return bs.SearchSymbolBundles(query, limit) + } + return nil +} + +func (h *HybridBackend) searchChannels(query string, limit int) ([]SearchResult, []string, ChannelTimings) { + var stats ChannelTimings + tStart := time.Now() textResults := h.text.Search(query, limit*2) + stats.TextMS = time.Since(tStart).Milliseconds() var vecIDs []string if h.vector.Count() > 0 { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() + embedStart := time.Now() queryVec, err := h.embedder.Embed(ctx, query) + stats.EmbedMS = time.Since(embedStart).Milliseconds() if err == nil && queryVec != nil { // When symbols are sub-chunked, one symbol owns several // vectors, so a fixed top-k under-counts distinct symbols. @@ -108,10 +184,13 @@ func (h *HybridBackend) searchChannels(query string, limit int) ([]SearchResult, if h.vector.HasChunks() { fetch = limit * 8 } - vecIDs = h.dechunkVectorIDs(h.vector.Search(queryVec, fetch), limit*2) + vecStart := time.Now() + rawVecIDs := h.vector.Search(queryVec, fetch) + stats.VectorSearchMS = time.Since(vecStart).Milliseconds() + vecIDs = h.dechunkVectorIDs(rawVecIDs, limit*2) } } - return textResults, vecIDs + return textResults, vecIDs, stats } // dechunkVectorIDs maps raw vector-search hits — which may be synthetic diff --git a/internal/search/rerank/context.go b/internal/search/rerank/context.go index 74426148..349fd168 100644 --- a/internal/search/rerank/context.go +++ b/internal/search/rerank/context.go @@ -121,6 +121,131 @@ type Context struct { // runs once per file rather than once per candidate. Bounded by // the candidate set's file count. pathPenaltyCache map[string]float64 + + // outEdgeCache / inEdgeCache hold the per-candidate edge slices + // fetched in one batched round-trip from Graph at prepare() time. + // FanInSignal / FanOutSignal / MinHashSignal read from these + // instead of calling Graph.GetIn/OutEdges per-candidate, which on + // the Ladybug backend collapses ~6N per-search cgo round-trips + // (~150 calls × 14ms ≈ 2 s) into 2. Empty when Graph is nil. + // Callers must use the inEdges / outEdges accessors so signals + // stay graph-agnostic. + outEdgeCache map[string][]*graph.Edge + inEdgeCache map[string][]*graph.Edge + + // preparedCands is the candidate slice identity prepare() was last + // called against. Pipeline.Rerank skips re-prepare when the same + // slice header is seen back-to-back so callers that pre-call + // Prepare for per-phase timing do not pay for it twice. The check + // is identity-only (same slice, same length) — any mutation that + // reallocates resets it. + preparedCands []*Candidate + + // cachePreSeeded is the caller's promise (via SeedEdgeCaches with + // preSeeded=true) that outEdgeCache / inEdgeCache already cover + // the candidate set the next Prepare call will see. When set, + // prepare() skips the batched edge fetch entirely — the bundle + // path's edges are authoritative and a second fetch is pure + // overhead. Reset by the caller (typically the engine, after each + // Search) to keep the flag from leaking across reranks. + cachePreSeeded bool +} + +// Prepare populates the internal scratch fields used by every signal +// once per Rerank call. Exposed so callers that want to time prepare +// separately (the search hot path) can call it explicitly; in that +// case the subsequent Rerank call detects the prepared state and +// skips the duplicate work. Safe to call multiple times against the +// same slice — it's a full reset on each call. +func (c *Context) Prepare(cands []*Candidate) { c.prepare(cands) } + +// SeedEdgeCaches installs pre-fetched in/out edge maps the caller +// already gathered (today: from the SymbolBundleSearcherBackend hot +// path). The maps are merged into the context — IDs already in the +// cache keep their existing entry, new IDs append. The accompanying +// flag tells prepare() the caches are authoritative for the +// candidate set so it can skip its own batched edge fetch on the +// next Prepare call. +// +// IDs missing from the caller's bundle (vector-channel hits, fallback +// substring matches) still get fetched the slow per-candidate way +// through the outEdges / inEdges accessors when a signal asks for +// them — the seed is a best-effort fast path, not a contract that +// every candidate's edges are present. Callers MUST set +// cachePreSeeded only when the seed covers the expected candidate set +// (i.e. when the bundle backend returned a result for every BM25 +// hit in the merged candidate slice). +func (c *Context) SeedEdgeCaches(inEdges, outEdges map[string][]*graph.Edge, preSeeded bool) { + if c.outEdgeCache == nil { + c.outEdgeCache = make(map[string][]*graph.Edge, len(outEdges)) + } + for id, es := range outEdges { + if _, dup := c.outEdgeCache[id]; dup { + continue + } + c.outEdgeCache[id] = es + } + if c.inEdgeCache == nil { + c.inEdgeCache = make(map[string][]*graph.Edge, len(inEdges)) + } + for id, es := range inEdges { + if _, dup := c.inEdgeCache[id]; dup { + continue + } + c.inEdgeCache[id] = es + } + if preSeeded { + c.cachePreSeeded = true + } +} + +// CachePreSeeded reports whether the caller has signaled (via +// SeedEdgeCaches with preSeeded=true) that the edge caches cover the +// candidate set the next Prepare call will see. Exposed so the +// MCP handler can report a cache-hit-rate / cache-pre-seeded boolean +// in its debug log without grepping internal state. +func (c *Context) CachePreSeeded() bool { return c.cachePreSeeded } + +// InheritEdgeCacheFrom shares the source context's edge caches + +// cachePreSeeded flag onto c. Used by the engine to give per-call +// inner reranks access to the handler-built bundle cache without +// inheriting the handler's session-aware signals (locality, combo, +// frecency, feedback). Cheap pointer-copy of the map references; the +// inner rerank's prepare() reads through them and any backfills it +// triggers land in the SHARED map so subsequent calls benefit. Pass +// nil to clear. +func (c *Context) InheritEdgeCacheFrom(src *Context) { + if c == nil || src == nil { + return + } + c.outEdgeCache = src.outEdgeCache + c.inEdgeCache = src.inEdgeCache + c.cachePreSeeded = src.cachePreSeeded +} + +// EdgeCacheHitRate reports the fraction of nodeIDs that have an entry +// in the in OR out edge cache. 0.0 when the caches are empty; 1.0 when +// every input id has a cache entry on both sides. Used by the +// MCP handler to surface "did the bundle path actually catch?" on +// the search_symbols debug log without exposing internal state. +func (c *Context) EdgeCacheHitRate(ids []string) float64 { + if len(ids) == 0 { + return 0 + } + hits := 0 + for _, id := range ids { + // An id counts as a hit if BOTH the in-edge cache and the + // out-edge cache have an entry for it — that's the contract + // the bundle pre-seed promises. A half-seeded id (only one + // side cached) is a near-miss the prepare() pass would still + // have to satisfy by fetching the missing side. + _, hasOut := c.outEdgeCache[id] + _, hasIn := c.inEdgeCache[id] + if hasOut && hasIn { + hits++ + } + } + return float64(hits) / float64(len(ids)) } // now returns the active timestamp (test-injectable when Now != 0). @@ -133,7 +258,23 @@ func (c *Context) now() int64 { // prepare populates the internal scratch fields once per Rerank call. // Idempotent — safe to call again after mutating the candidate slice. +// +// Edge fetches happen in two batched round-trips (one inbound, one +// outbound) collected from every candidate's ID up front. On the +// Ladybug backend each per-candidate GetInEdges / GetOutEdges call +// costs ~14ms cgo; batching collapses ~150 round-trips per Rerank +// into 2. +// +// Bundle pre-seed fast path: when the caller has set cachePreSeeded +// (via SeedEdgeCaches with preSeeded=true), prepare keeps the existing +// caches in place and skips the batched edge fetch entirely. The +// fanInMax / fanOutMax stats are computed from the already-cached +// maps — same numbers, no cgo. This is the load-bearing skip the +// SymbolBundleSearcherBackend path depends on: the bundle's edges +// were already gathered server-side; a second round-trip here would +// pure-overhead the win. func (c *Context) prepare(cands []*Candidate) { + c.preparedCands = cands c.communityCount = make(map[string]int, len(cands)) c.maxCommunityCount = 0 c.candidateIDs = make(map[string]struct{}, len(cands)) @@ -144,12 +285,23 @@ func (c *Context) prepare(cands []*Candidate) { c.fileScoreSum = make(map[string]float64, len(cands)) c.maxFileScoreSum = 0 c.pathPenaltyCache = make(map[string]float64, len(cands)) + // Preserve the seeded edge caches when the caller signaled + // cachePreSeeded; the legacy reset path below the candidate walk + // only runs when the caches are NOT authoritative. + if !c.cachePreSeeded { + c.outEdgeCache = nil + c.inEdgeCache = nil + } + // First pass: collect candidate IDs (the input to the batched edge + // fetch) and populate the non-edge scratch fields. + ids := make([]string, 0, len(cands)) for _, cand := range cands { if cand == nil || cand.Node == nil { continue } c.candidateIDs[cand.Node.ID] = struct{}{} + ids = append(ids, cand.Node.ID) if c.CommunityOf != nil { com := c.CommunityOf(cand.Node.ID) @@ -161,17 +313,6 @@ func (c *Context) prepare(cands []*Candidate) { } } - if c.Graph != nil { - fi := len(c.Graph.GetInEdges(cand.Node.ID)) - fo := len(c.Graph.GetOutEdges(cand.Node.ID)) - if fi > c.fanInMax { - c.fanInMax = fi - } - if fo > c.fanOutMax { - c.fanOutMax = fo - } - } - ch := c.churnFor(cand.Node) if ch > c.churnMax { c.churnMax = ch @@ -192,6 +333,102 @@ func (c *Context) prepare(cands []*Candidate) { } } } + + // Second pass: one batched in-edge + one out-edge round-trip + // against Graph, scoped to the IDs that are NOT yet cached. + // When cachePreSeeded covers every candidate (the bundle hot + // path's typical shape), the missing slice is empty and the + // round-trips are skipped entirely — pure cache-served fan-in / + // fan-out. When the bundle only covers some IDs (vector or + // fallback hits get appended without bundle edges), we fetch + // only the uncovered tail and merge into the existing cache. + // Skipped when Graph is nil — fan signals contribute 0. + if c.Graph != nil && len(ids) > 0 { + missingOut := missingEdgeIDs(ids, c.outEdgeCache) + missingIn := missingEdgeIDs(ids, c.inEdgeCache) + // Backfill — when the cache already covers everything, both + // missing slices are empty and no cgo round-trip fires. + if len(missingOut) > 0 { + fetched := c.Graph.GetOutEdgesByNodeIDs(missingOut) + if c.outEdgeCache == nil { + c.outEdgeCache = make(map[string][]*graph.Edge, len(fetched)) + } + for id, es := range fetched { + c.outEdgeCache[id] = es + } + } + if len(missingIn) > 0 { + fetched := c.Graph.GetInEdgesByNodeIDs(missingIn) + if c.inEdgeCache == nil { + c.inEdgeCache = make(map[string][]*graph.Edge, len(fetched)) + } + for id, es := range fetched { + c.inEdgeCache[id] = es + } + } + } + for _, id := range ids { + if fi := len(c.inEdgeCache[id]); fi > c.fanInMax { + c.fanInMax = fi + } + if fo := len(c.outEdgeCache[id]); fo > c.fanOutMax { + c.fanOutMax = fo + } + } +} + +// missingEdgeIDs returns the subset of ids whose edge slice is NOT +// already in cache. Used by prepare's backfill: when the bundle path +// pre-seeded most candidates but not all (vector / fallback hits get +// appended without bundle edges), only the uncovered ids cross the +// engine boundary. An empty result means the cache is complete — the +// fetch round-trip can be skipped entirely. +func missingEdgeIDs(ids []string, cache map[string][]*graph.Edge) []string { + if cache == nil { + // No pre-seed at all — caller has to fetch the full set; return + // the input unchanged so the existing batched fetch path runs. + return ids + } + missing := make([]string, 0, len(ids)) + for _, id := range ids { + if _, ok := cache[id]; !ok { + missing = append(missing, id) + } + } + return missing +} + +// outEdges returns the prepared outgoing-edge slice for nodeID. Reads +// from the prepare()-populated cache when available; falls back to a +// direct Graph.GetOutEdges call when prepare did not cache the node +// (a signal calling outside the candidate set, or Graph was nil at +// prepare time but a later mutation set it). Signals must use this +// accessor instead of calling Graph directly so the batched-fetch +// invariant holds. +func (c *Context) outEdges(nodeID string) []*graph.Edge { + if c.outEdgeCache != nil { + if edges, ok := c.outEdgeCache[nodeID]; ok { + return edges + } + } + if c.Graph == nil { + return nil + } + return c.Graph.GetOutEdges(nodeID) +} + +// inEdges is the inbound sibling of outEdges. See that doc-comment +// for the contract. +func (c *Context) inEdges(nodeID string) []*graph.Edge { + if c.inEdgeCache != nil { + if edges, ok := c.inEdgeCache[nodeID]; ok { + return edges + } + } + if c.Graph == nil { + return nil + } + return c.Graph.GetInEdges(nodeID) } // churnFor consults the ChurnOf hook, then Node.Meta["churn"], then diff --git a/internal/search/rerank/pipeline.go b/internal/search/rerank/pipeline.go index 07dd335c..2094deab 100644 --- a/internal/search/rerank/pipeline.go +++ b/internal/search/rerank/pipeline.go @@ -98,7 +98,13 @@ func (p *Pipeline) Rerank(query string, cands []*Candidate, ctx *Context) []*Can if ctx.QueryClass == QueryClassUnknown { ctx.QueryClass = ClassifyQuery(query) } - ctx.prepare(cands) + // Skip prepare when the caller already invoked Context.Prepare + // for per-phase timing on this exact slice — avoids paying the + // batched edge fetch twice on the search hot path. Identity check + // is intentional: any mutation that reallocates resets it. + if !sameSliceHeader(ctx.preparedCands, cands) { + ctx.prepare(cands) + } for _, c := range cands { if c.Signals == nil { @@ -143,6 +149,17 @@ func (p *Pipeline) Rerank(query string, cands []*Candidate, ctx *Context) []*Can return cands } +// sameSliceHeader reports whether a and b alias the same underlying +// candidate slice (same backing array, same length). Used by Rerank to +// detect "the caller already invoked Prepare on this exact slice" and +// skip the duplicate prepare pass. +func sameSliceHeader(a, b []*Candidate) bool { + if len(a) == 0 || len(b) == 0 || len(a) != len(b) { + return false + } + return &a[0] == &b[0] +} + // Nodes is a convenience that unwraps a result slice into the // underlying graph nodes in score order. func Nodes(cands []*Candidate) []*graph.Node { diff --git a/internal/search/rerank/retriever.go b/internal/search/rerank/retriever.go index afb12b20..a8d3ca2d 100644 --- a/internal/search/rerank/retriever.go +++ b/internal/search/rerank/retriever.go @@ -26,7 +26,7 @@ type Retriever interface { // The caller passes the graph (so retrievers can do graph // walks without owning a reference). ctx is honoured for // cancellation — long-running retrievers must respect it. - Retrieve(ctx context.Context, g *graph.Graph, query string, limit int) ([]*Candidate, error) + Retrieve(ctx context.Context, g graph.Store, query string, limit int) ([]*Candidate, error) } // GraphCompletion is a Retriever that uses an upstream Retriever for @@ -46,7 +46,7 @@ type Retriever interface { type GraphCompletion struct { // Seeder produces the initial candidate set the 1-hop expansion // will fan out from. Required. - Seeder func(ctx context.Context, g *graph.Graph, query string, limit int) ([]*Candidate, error) + Seeder func(ctx context.Context, g graph.Store, query string, limit int) ([]*Candidate, error) // MaxSeedExpansion caps the number of new candidates produced // per seed. Defaults to 8 — large enough to surface typical @@ -69,7 +69,7 @@ func (gc *GraphCompletion) Name() string { return "graph_completion" } // merged: the seed copy wins and keeps its rank fields. New nodes // added by expansion have TextRank=-1 / VectorRank=-1 so the // downstream rerank knows they came from graph expansion. -func (gc *GraphCompletion) Retrieve(ctx context.Context, g *graph.Graph, query string, limit int) ([]*Candidate, error) { +func (gc *GraphCompletion) Retrieve(ctx context.Context, g graph.Store, query string, limit int) ([]*Candidate, error) { if gc.Seeder == nil { return nil, errNilSeeder } @@ -91,6 +91,7 @@ func (gc *GraphCompletion) Retrieve(ctx context.Context, g *graph.Graph, query s out := make([]*Candidate, 0, len(seeds)*2) seen := make(map[string]*Candidate, len(seeds)*2) + seedIDs := make([]string, 0, len(seeds)) for _, c := range seeds { if c == nil || c.Node == nil { continue @@ -100,14 +101,38 @@ func (gc *GraphCompletion) Retrieve(ctx context.Context, g *graph.Graph, query s } seen[c.Node.ID] = c out = append(out, c) + seedIDs = append(seedIDs, c.Node.ID) } - for _, seed := range seeds { - if seed == nil || seed.Node == nil { - continue + // One batched out-edge round-trip across every seed instead of + // one cgo call per seed. On Ladybug this drops ~30 round-trips + // into 1 for a typical search_symbols completion pass. + outEdges := g.GetOutEdgesByNodeIDs(seedIDs) + + // Collect every distinct target id, then materialise the target + // nodes in one batched GetNodesByIDs call — same shape, same win. + toIDs := make([]string, 0, len(outEdges)*4) + toSeen := make(map[string]struct{}, len(outEdges)*4) + for _, seedID := range seedIDs { + for _, e := range outEdges[seedID] { + if !keepAll && !allowed[e.Kind] { + continue + } + if _, dup := seen[e.To]; dup { + continue + } + if _, dup := toSeen[e.To]; dup { + continue + } + toSeen[e.To] = struct{}{} + toIDs = append(toIDs, e.To) } + } + toNodes := g.GetNodesByIDs(toIDs) + + for _, seedID := range seedIDs { added := 0 - for _, e := range g.GetOutEdges(seed.Node.ID) { + for _, e := range outEdges[seedID] { if !keepAll && !allowed[e.Kind] { continue } @@ -117,7 +142,7 @@ func (gc *GraphCompletion) Retrieve(ctx context.Context, g *graph.Graph, query s if _, dup := seen[e.To]; dup { continue } - toNode := g.GetNode(e.To) + toNode := toNodes[e.To] if toNode == nil { continue } diff --git a/internal/search/rerank/retriever_test.go b/internal/search/rerank/retriever_test.go index 38ce449f..e4d9107d 100644 --- a/internal/search/rerank/retriever_test.go +++ b/internal/search/rerank/retriever_test.go @@ -24,7 +24,7 @@ func newRetrieverGraph(t *testing.T) *graph.Graph { return g } -func seedHub(_ context.Context, g *graph.Graph, _ string, _ int) ([]*Candidate, error) { +func seedHub(_ context.Context, g graph.Store, _ string, _ int) ([]*Candidate, error) { n := g.GetNode("h") if n == nil { return nil, nil @@ -102,7 +102,7 @@ func TestGraphCompletion_NilSeederErrors(t *testing.T) { func TestGraphCompletion_SeederErrorPropagates(t *testing.T) { g := newRetrieverGraph(t) gc := &GraphCompletion{ - Seeder: func(context.Context, *graph.Graph, string, int) ([]*Candidate, error) { + Seeder: func(context.Context, graph.Store, string, int) ([]*Candidate, error) { return nil, errors.New("seeder failed") }, } @@ -114,7 +114,7 @@ func TestGraphCompletion_SeederErrorPropagates(t *testing.T) { func TestGraphCompletion_DedupesSeedFromExpansion(t *testing.T) { g := newRetrieverGraph(t) // Two seeds, the second is reachable from the first. - multiSeed := func(_ context.Context, gr *graph.Graph, _ string, _ int) ([]*Candidate, error) { + multiSeed := func(_ context.Context, gr graph.Store, _ string, _ int) ([]*Candidate, error) { return []*Candidate{ {Node: gr.GetNode("h"), TextRank: 0}, {Node: gr.GetNode("a"), TextRank: 1}, // also reachable from h @@ -136,7 +136,7 @@ func TestGraphCompletion_DedupesSeedFromExpansion(t *testing.T) { func TestGraphCompletion_NilSeedsIgnored(t *testing.T) { g := newRetrieverGraph(t) gc := &GraphCompletion{ - Seeder: func(context.Context, *graph.Graph, string, int) ([]*Candidate, error) { + Seeder: func(context.Context, graph.Store, string, int) ([]*Candidate, error) { return []*Candidate{nil, {Node: nil}, {Node: g.GetNode("h")}}, nil }, } diff --git a/internal/search/rerank/signals_graph.go b/internal/search/rerank/signals_graph.go index 2f19e0c9..33c33dd8 100644 --- a/internal/search/rerank/signals_graph.go +++ b/internal/search/rerank/signals_graph.go @@ -13,7 +13,7 @@ func (FanInSignal) Contribute(_ string, c *Candidate, ctx *Context) float64 { if ctx.Graph == nil { return 0 } - count := len(ctx.Graph.GetInEdges(c.Node.ID)) + count := len(ctx.inEdges(c.Node.ID)) return normLog(count, ctx.fanInMax) } @@ -29,7 +29,7 @@ func (FanOutSignal) Contribute(_ string, c *Candidate, ctx *Context) float64 { if ctx.Graph == nil { return 0 } - count := len(ctx.Graph.GetOutEdges(c.Node.ID)) + count := len(ctx.outEdges(c.Node.ID)) return normLog(count, ctx.fanOutMax) } @@ -47,7 +47,7 @@ func (MinHashSignal) Contribute(_ string, c *Candidate, ctx *Context) float64 { return 0 } var total, n float64 - for _, e := range ctx.Graph.GetOutEdges(c.Node.ID) { + for _, e := range ctx.outEdges(c.Node.ID) { if e.Kind != graph.EdgeSimilarTo { continue } @@ -63,7 +63,7 @@ func (MinHashSignal) Contribute(_ string, c *Candidate, ctx *Context) float64 { } // Symmetric edge — also walk incoming (snapshots that omit // outgoing copies of similar_to don't lose recall). - for _, e := range ctx.Graph.GetInEdges(c.Node.ID) { + for _, e := range ctx.inEdges(c.Node.ID) { if e.Kind != graph.EdgeSimilarTo { continue } diff --git a/internal/search/swappable.go b/internal/search/swappable.go index fa24aaf2..d386c4c0 100644 --- a/internal/search/swappable.go +++ b/internal/search/swappable.go @@ -81,6 +81,59 @@ func (s *Swappable) SearchChannels(query string, limit int) (textResults []Searc return s.inner.Search(query, limit), nil } +// SearchChannelsTimed delegates to a backend that supports the +// per-phase timing breakdown (today only HybridBackend). Falls back +// to SearchChannels — and a zero-valued ChannelTimings — when the +// inner backend doesn't know how to split phases. +func (s *Swappable) SearchChannelsTimed(query string, limit int) ([]SearchResult, []string, ChannelTimings) { + s.mu.RLock() + defer s.mu.RUnlock() + type timer interface { + SearchChannelsTimed(query string, limit int) ([]SearchResult, []string, ChannelTimings) + } + if cst, ok := s.inner.(timer); ok { + return cst.SearchChannelsTimed(query, limit) + } + if cs, ok := s.inner.(ChannelSearcher); ok { + text, vec := cs.SearchChannels(query, limit) + return text, vec, ChannelTimings{} + } + return s.inner.Search(query, limit), nil, ChannelTimings{} +} + +// SearchSymbolBundles forwards to the inner backend when it implements +// SymbolBundleSearcherBackend (production wiring: a +// SymbolSearcherBackend whose store is the Ladybug Store, or a +// HybridBackend whose text backend is the same). Returns nil when the +// inner backend doesn't expose bundles — the engine treats nil as +// "no bundle support" and falls back to the per-call Search + +// GetNodesByIDs + GetIn/OutEdgesByNodeIDs path. +func (s *Swappable) SearchSymbolBundles(query string, limit int) []SymbolBundle { + s.mu.RLock() + defer s.mu.RUnlock() + if bs, ok := s.inner.(SymbolBundleSearcherBackend); ok { + return bs.SearchSymbolBundles(query, limit) + } + return nil +} + +// VectorChannelOnly forwards to the inner backend when it implements +// the vector-only channel pull (today: HybridBackend). Lets the +// engine fetch the vector channel without re-running text BM25 — +// the bundle path already has the text hits. Returns (nil, zero +// timings) when the inner backend isn't vector-aware. +func (s *Swappable) VectorChannelOnly(query string, limit int) ([]string, ChannelTimings) { + s.mu.RLock() + defer s.mu.RUnlock() + type vco interface { + VectorChannelOnly(query string, limit int) ([]string, ChannelTimings) + } + if v, ok := s.inner.(vco); ok { + return v.VectorChannelOnly(query, limit) + } + return nil, ChannelTimings{} +} + func (s *Swappable) Count() int { s.mu.RLock() defer s.mu.RUnlock() diff --git a/internal/search/symbolsearcher_backend.go b/internal/search/symbolsearcher_backend.go new file mode 100644 index 00000000..d7212e3e --- /dev/null +++ b/internal/search/symbolsearcher_backend.go @@ -0,0 +1,160 @@ +package search + +import ( + "strings" + "sync/atomic" + + "github.com/zzet/gortex/internal/graph" +) + +// SymbolSearcherBackend adapts a graph.SymbolSearcher into the +// search.Backend the daemon's search-symbols path consumes. +// Engine.gatherBackendCandidates and the rerank pipeline don't need +// to know whether the backend is BM25 / Bleve / native FTS — they +// see a plain search.Backend and call Search on it. +// +// Production wiring: when the indexer detects that the backing +// graph.Store also implements graph.SymbolSearcher (today only +// store_ladybug), it constructs this adapter as the initial +// search.Backend wrapped by search.NewSwappable. The in-process +// Bleve / BM25 build path is then bypassed entirely. +// +// Add / Remove are no-ops on the adapter because the indexer +// already drives the SymbolSearcher writes directly: +// +// - cold-load: BulkUpsertSymbolFTS at shadow-drain commit (see +// internal/indexer.go IndexCtx defer) +// - incremental: UpsertSymbolFTS alongside the parallel +// idx.search.Add in the per-file path +// +// The adapter therefore only carries the read side. Callers that +// invoke Add / Remove still get the right behaviour because the +// indexer is the only entity that ever creates this adapter, and +// it doesn't rely on Add / Remove updating the FTS — those calls +// happen through the direct SymbolSearcher surface. +type SymbolSearcherBackend struct { + s graph.SymbolSearcher + + // count tracks the indexer's incremental Add / Remove deltas + // only — it does NOT report the actual size of the backend + // FTS index (which lives in the disk store and is queryable + // via the SymbolSearcher's own primitives). Used for the + // search.Backend.Count() contract by callers that just want a + // rough magnitude (no caller currently treats this as + // authoritative). + count atomic.Int64 +} + +// NewSymbolSearcherBackend wraps a SymbolSearcher in the +// search.Backend contract. The caller is responsible for keeping +// the underlying SymbolSearcher alive — Close on this adapter is +// a no-op and never touches the wrapped store. +func NewSymbolSearcherBackend(s graph.SymbolSearcher) *SymbolSearcherBackend { + return &SymbolSearcherBackend{s: s} +} + +// SymbolBundle re-exports graph.SymbolBundle so callers (the query +// engine, the rerank seed path) can construct + consume bundles +// without re-importing the graph package next to the search +// package import — symmetric with how SearchResult sits in +// search/. +type SymbolBundle = graph.SymbolBundle + +// SearchSymbolBundles is the bundled-search hot path: it forwards +// to the wrapped graph.SymbolBundleSearcher when the underlying +// store implements that capability, returning the matched node + +// score + in/out edges in one engine round-trip. When the store +// only implements SymbolSearcher (no Bundle support), this method +// returns nil — callers MUST check the result and fall back to the +// per-call Search → GetNodesByIDs → GetIn/OutEdgesByNodeIDs path. +// +// Exposed on SymbolSearcherBackend (the production search.Backend +// adapter used in production) so the engine can type-assert through +// the search.Backend chain via SymbolBundleSearcherBackend without +// touching the daemon's wiring. +func (b *SymbolSearcherBackend) SearchSymbolBundles(query string, limit int) []SymbolBundle { + if b == nil || b.s == nil || strings.TrimSpace(query) == "" { + return nil + } + bs, ok := b.s.(graph.SymbolBundleSearcher) + if !ok { + return nil + } + bundles, err := bs.SearchSymbolBundles(query, limit) + if err != nil { + return nil + } + return bundles +} + +// SymbolBundleSearcherBackend is the interface the engine type-asserts +// on a search.Backend to detect bundle support. Both +// *SymbolSearcherBackend and *HybridBackend implement this; Swappable +// forwards. +type SymbolBundleSearcherBackend interface { + SearchSymbolBundles(query string, limit int) []SymbolBundle +} + +// Search forwards to SymbolSearcher.SearchSymbols and translates +// the per-hit (NodeID, Score) into search.SearchResult so callers +// don't see the graph package at all. +// +// An error from the backend is downgraded to an empty result — the +// daemon's search_symbols path already tolerates an empty primary +// hit set (it falls through to the exact-name / substring tiers in +// query.Engine.gatherBackendCandidates), so returning an error +// surface here would force every caller to grow its own fallback. +func (b *SymbolSearcherBackend) Search(query string, limit int) []SearchResult { + if b == nil || b.s == nil || strings.TrimSpace(query) == "" { + return nil + } + hits, err := b.s.SearchSymbols(query, limit) + if err != nil || len(hits) == 0 { + return nil + } + out := make([]SearchResult, len(hits)) + for i, h := range hits { + out[i] = SearchResult{ID: h.NodeID, Score: h.Score} + } + return out +} + +// Add is a no-op — the indexer drives UpsertSymbolFTS on the wrapped +// SymbolSearcher directly. count is bumped so the Count() figure +// tracks the deltas-since-construction (best-effort, not +// authoritative — the disk index may be larger from a prior cold +// load). +func (b *SymbolSearcherBackend) Add(id string, _ ...string) { + if b == nil || id == "" { + return + } + b.count.Add(1) +} + +// Remove is a no-op for the same reason as Add — the per-call +// removal path (when one lands) routes through SymbolSearcher +// directly, not through the search.Backend contract. count is +// decremented so the Count() figure stays roughly consistent. +func (b *SymbolSearcherBackend) Remove(id string) { + if b == nil || id == "" { + return + } + b.count.Add(-1) +} + +// Count returns the running delta-since-construction. Used for +// observability / "is the index populated?" gates — never as a +// load-bearing decision input. The authoritative size lives in +// the disk FTS index, which is queryable via the +// SymbolSearcher's native primitives if needed. +func (b *SymbolSearcherBackend) Count() int { + if b == nil { + return 0 + } + return int(b.count.Load()) +} + +// Close is a no-op. The wrapped SymbolSearcher is owned by the +// graph.Store; closing it from the search adapter would race the +// indexer's own lifecycle. +func (b *SymbolSearcherBackend) Close() {} diff --git a/internal/search/vector.go b/internal/search/vector.go index 77ffc345..63ac02d5 100644 --- a/internal/search/vector.go +++ b/internal/search/vector.go @@ -9,6 +9,8 @@ import ( "sync" "github.com/coder/hnsw" + + "github.com/zzet/gortex/internal/graph" ) // vectorFrameMagic prefixes the framed VectorBackend.Save format: a @@ -18,7 +20,24 @@ import ( // map — so old snapshots keep working. var vectorFrameMagic = [4]byte{'G', 'V', 'X', '1'} +// VectorDelegate is the subset of graph.VectorSearcher the +// VectorBackend shim consults when it's been told to delegate +// instead of holding an in-process HNSW. Exported (with a +// graph.VectorHit return) so the indexer can install a delegate +// without writing a translation layer — search already depends on +// graph for SymbolHit, so the type sharing is free. +type VectorDelegate interface { + SimilarTo(vec []float32, limit int) ([]graph.VectorHit, error) +} + // VectorBackend stores and searches embedding vectors using HNSW index. +// +// When delegate is set (via SetDelegate), the in-process HNSW is +// bypassed entirely: Add becomes a no-op (the indexer drives the +// delegate's bulk-upsert directly), Search forwards to the +// delegate's SimilarTo. The dims and chunkMap stay live so callers +// that need them (HybridBackend.dechunkVectorIDs) keep working +// against the same VectorBackend surface. type VectorBackend struct { graph *hnsw.Graph[string] count int @@ -30,6 +49,16 @@ type VectorBackend struct { // returned twice and chunk IDs never leak to callers. chunkMap map[string]string mu sync.RWMutex + + // delegate is the optional engine-native vector searcher (today + // only graph.SymbolSearcher-implementing stores). Set means + // "don't build the in-process HNSW; route reads through here". + // The wrapped delegateCount tracks Add-call deltas so Count() + // reports a non-zero figure once the indexer has finished its + // bulk upsert — HybridBackend gates the vector channel on + // Count() > 0. + delegate VectorDelegate + delegateCount int } // NewVector creates a vector search backend for the given embedding dimensions. @@ -75,6 +104,16 @@ func (v *VectorBackend) HasChunks() bool { func (v *VectorBackend) Add(id string, vector []float32) { v.mu.Lock() defer v.mu.Unlock() + if v.delegate != nil { + // Delegated mode: the indexer pushes vectors to the + // engine-native HNSW via the graph.VectorSearcher + // interface directly. Add here is a no-op so the + // in-process hnsw.Graph never allocates memory for what + // the delegate already owns; count tracks deltas so + // Count()'s "is the index populated" gate fires. + v.delegateCount++ + return + } v.graph.Add(hnsw.Node[string]{ Key: id, Value: hnsw.Vector(vector), @@ -82,8 +121,37 @@ func (v *VectorBackend) Add(id string, vector []float32) { v.count++ } +// SetDelegate routes Search / Count through an engine-native vector +// searcher (today the Ladybug store's graph.VectorSearcher). After +// the call: +// - Add is a no-op (the indexer talks to the delegate directly via +// graph.VectorSearcher.BulkUpsertEmbeddings / UpsertEmbedding), +// - Search forwards to delegate.SimilarTo, +// - Count reflects the delegate-delta count (not the in-process +// graph), so HybridBackend.searchChannels's `v.Count() > 0` gate +// fires once the indexer has populated the backend. +func (v *VectorBackend) SetDelegate(d VectorDelegate) { + v.mu.Lock() + defer v.mu.Unlock() + v.delegate = d +} + // Search returns the k nearest neighbors to the query vector. func (v *VectorBackend) Search(query []float32, k int) []string { + v.mu.RLock() + d := v.delegate + v.mu.RUnlock() + if d != nil { + hits, err := d.SimilarTo(query, k) + if err != nil || len(hits) == 0 { + return nil + } + ids := make([]string, len(hits)) + for i, h := range hits { + ids[i] = h.NodeID + } + return ids + } v.mu.RLock() defer v.mu.RUnlock() if v.count == 0 { @@ -101,6 +169,9 @@ func (v *VectorBackend) Search(query []float32, k int) []string { func (v *VectorBackend) Count() int { v.mu.RLock() defer v.mu.RUnlock() + if v.delegate != nil { + return v.delegateCount + } return v.count } diff --git a/internal/semantic/enricher.go b/internal/semantic/enricher.go index aa5727b0..c463a84f 100644 --- a/internal/semantic/enricher.go +++ b/internal/semantic/enricher.go @@ -20,13 +20,13 @@ func ConfirmEdge(e *graph.Edge, provider string) { // RefuteEdge removes a false-positive edge from the graph. // Returns true if the edge was removed. -func RefuteEdge(g *graph.Graph, e *graph.Edge) bool { +func RefuteEdge(g graph.Store, e *graph.Edge) bool { return g.RemoveEdge(e.From, e.To, e.Kind) } // AddSemanticEdge adds a new edge discovered by semantic analysis. Origin is // tagged LSP-grade (see ConfirmEdge). -func AddSemanticEdge(g *graph.Graph, from, to string, kind graph.EdgeKind, filePath string, line int, provider string) *graph.Edge { +func AddSemanticEdge(g graph.Store, from, to string, kind graph.EdgeKind, filePath string, line int, provider string) *graph.Edge { e := &graph.Edge{ From: from, To: to, @@ -66,7 +66,7 @@ func EnrichNodeMeta(n *graph.Node, key string, value any, provider string) { } // FindMatchingEdge searches for an existing edge between two nodes of a given kind. -func FindMatchingEdge(g *graph.Graph, from, to string, kind graph.EdgeKind) *graph.Edge { +func FindMatchingEdge(g graph.Store, from, to string, kind graph.EdgeKind) *graph.Edge { edges := g.GetOutEdges(from) for _, e := range edges { if e.To == to && e.Kind == kind { @@ -77,7 +77,7 @@ func FindMatchingEdge(g *graph.Graph, from, to string, kind graph.EdgeKind) *gra } // FindEdgeByTarget searches for an edge from a node to a target with any kind. -func FindEdgeByTarget(g *graph.Graph, from, to string) *graph.Edge { +func FindEdgeByTarget(g graph.Store, from, to string) *graph.Edge { edges := g.GetOutEdges(from) for _, e := range edges { if e.To == to { @@ -88,7 +88,7 @@ func FindEdgeByTarget(g *graph.Graph, from, to string) *graph.Edge { } // NodesByLanguage returns all nodes in the graph that match the given language. -func NodesByLanguage(g *graph.Graph, language string) []*graph.Node { +func NodesByLanguage(g graph.Store, language string) []*graph.Node { var result []*graph.Node for _, n := range g.AllNodes() { if n.Language == language { @@ -99,7 +99,7 @@ func NodesByLanguage(g *graph.Graph, language string) []*graph.Node { } // EdgesByLanguage returns all edges whose source node matches the given language. -func EdgesByLanguage(g *graph.Graph, language string) []*graph.Edge { +func EdgesByLanguage(g graph.Store, language string) []*graph.Edge { var result []*graph.Edge for _, e := range g.AllEdges() { fromNode := g.GetNode(e.From) diff --git a/internal/semantic/goanalysis/externals.go b/internal/semantic/goanalysis/externals.go index cae6dd10..363ee34c 100644 --- a/internal/semantic/goanalysis/externals.go +++ b/internal/semantic/goanalysis/externals.go @@ -39,12 +39,18 @@ const modulePathStdlib = "stdlib" // Statistics counters surface back through ExternalsResult so the caller // can report nodes/edges added. type externalsAttribution struct { - g *graph.Graph + g graph.Store pkgByPath map[string]*packages.Package moduleByPath map[string]string extByObj map[types.Object]string provider string + // repoPrefix is the owning repo's prefix, used to namespace stub + // IDs (graph.StubID). Empty when the caller doesn't supply one + // — in that case stub IDs are emitted in the legacy un-prefixed + // form, which graph.IsStdlibStub / friends still recognise. + repoPrefix string + nodesAdded int edgesAdded int edgesUpgraded int @@ -57,7 +63,7 @@ type externalsAttribution struct { // roots. Walking pkg.Imports collects every dep — stdlib and module-cache // alike — so resolveSymbol can find the owning *packages.Package for an // arbitrary types.Object. -func newExternalsAttribution(g *graph.Graph, roots []*packages.Package, provider string) *externalsAttribution { +func newExternalsAttribution(g graph.Store, roots []*packages.Package, provider string) *externalsAttribution { pkgByPath := make(map[string]*packages.Package) var visit func(p *packages.Package) visit = func(p *packages.Package) { @@ -81,9 +87,34 @@ func newExternalsAttribution(g *graph.Graph, roots []*packages.Package, provider moduleByPath: make(map[string]string), extByObj: make(map[types.Object]string), provider: provider, + repoPrefix: deriveRepoPrefix(g, roots), } } +// deriveRepoPrefix peeks at the first source file across the +// enrichment roots and reads its RepoPrefix from the graph. +// All files belonging to a single semantic.Provider.Enrich call +// share one repo, so a single sample suffices. Returns "" when no +// matching file node is found — stubs then fall back to the +// legacy un-prefixed form, which graph.IsStdlibStub still accepts. +func deriveRepoPrefix(g graph.Store, roots []*packages.Package) string { + for _, r := range roots { + if r == nil { + continue + } + for _, f := range r.GoFiles { + if nodes := g.GetFileNodes(f); len(nodes) > 0 { + for _, n := range nodes { + if n != nil && n.RepoPrefix != "" { + return n.RepoPrefix + } + } + } + } + } + return "" +} + // resolveSymbol returns the graph node ID for an external go/types object, // creating it (and the owning KindModule node, if not already present) // on first sight. Returns "" when the object is unsuitable for @@ -199,7 +230,7 @@ func (e *externalsAttribution) claimAndUpgradeStub(callerID string, importPath s // claimByExactStub handles the canonical resolver-shaped targets. Pulled // out so the fuzzy pass can layer on top. func (e *externalsAttribution) claimByExactStub(callerID string, importPath string, obj types.Object, newTarget string) *graph.Edge { - candidates := stubEdgeTargets(importPath, obj) + candidates := stubEdgeTargets(e.repoPrefix, importPath, obj) for _, target := range candidates { edge := semantic.FindEdgeByTarget(e.g, callerID, target) if edge == nil { @@ -276,9 +307,9 @@ func wantedEdgeKind(obj types.Object) graph.EdgeKind { // strings the resolver writes for unresolved or external lookups. func isStubTarget(to string) bool { switch { - case strings.HasPrefix(to, "unresolved::"), + case graph.IsUnresolvedTarget(to), strings.HasPrefix(to, "external::"), - strings.HasPrefix(to, "stdlib::"), + graph.IsStdlibStub(to), strings.HasPrefix(to, "dep::"): return true } @@ -393,7 +424,12 @@ func (e *externalsAttribution) ensureModuleNode(pkg *packages.Package) string { // written for an external obj. Order matches resolver precedence: // stdlib::/dep:: are produced post-resolve, unresolved::extern:: is the // raw form when resolveExtern wasn't run. -func stubEdgeTargets(importPath string, obj types.Object) []string { +// +// repoPrefix namespaces the stdlib stub form per-repo so two repos +// pinned to different Go SDK versions don't collide on a single +// `stdlib::fmt::Errorf` node. An empty repoPrefix yields the legacy +// un-prefixed form, which the resolver still emits today. +func stubEdgeTargets(repoPrefix, importPath string, obj types.Object) []string { if obj == nil { return nil } @@ -402,7 +438,7 @@ func stubEdgeTargets(importPath string, obj types.Object) []string { return nil } return []string{ - "stdlib::" + importPath + "::" + name, + graph.StubID(repoPrefix, graph.StubKindStdlib, importPath, name), "dep::" + importPath + "::" + name, "unresolved::extern::" + importPath + "::" + name, } diff --git a/internal/semantic/goanalysis/provider.go b/internal/semantic/goanalysis/provider.go index 0cebcc1a..159e4a33 100644 --- a/internal/semantic/goanalysis/provider.go +++ b/internal/semantic/goanalysis/provider.go @@ -65,7 +65,7 @@ func (p *Provider) Available() bool { return true } -func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResult, error) { +func (p *Provider) Enrich(g graph.Store, repoRoot string) (*semantic.EnrichResult, error) { start := time.Now() absRoot, err := filepath.Abs(repoRoot) @@ -245,6 +245,12 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu result.EdgesAdded += p.addMissingImplements(g, pkgs, objToNode, absRoot) // Phase 4: Enrich node metadata with type info. + // EnrichNodeMeta mutates Node.Meta in place; on disk backends the + // node is a per-call GetNode reconstruction, so collect every stamped + // node and round-trip it through the store at the end (one AddBatch) + // or the semantic_type / return_type stamps are silently discarded on + // Ladybug. See semantic.EnrichNodeMeta. + var stampedNodes []*graph.Node for _, pkg := range pkgs { if pkg.TypesInfo == nil { continue @@ -262,10 +268,12 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu continue } + didStamp := false typeStr := types.TypeString(obj.Type(), nil) if typeStr != "" && typeStr != "invalid type" { semantic.EnrichNodeMeta(node, "semantic_type", typeStr, p.Name()) result.NodesEnriched++ + didStamp = true } // Add return type for functions. @@ -274,18 +282,25 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu if ok && sig.Results().Len() > 0 { retType := types.TypeString(sig.Results(), nil) semantic.EnrichNodeMeta(node, "return_type", retType, p.Name()) + didStamp = true } } + if didStamp { + stampedNodes = append(stampedNodes, node) + } _ = ident // used in range } } + if len(stampedNodes) > 0 { + g.AddBatch(stampedNodes, nil) + } result.DurationMs = time.Since(start).Milliseconds() return result, nil } -func (p *Provider) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*semantic.EnrichResult, error) { +func (p *Provider) EnrichFile(g graph.Store, repoRoot, filePath string) (*semantic.EnrichResult, error) { // go/types can do incremental loading per package, but for simplicity // we re-enrich the whole graph. The manager's debounce prevents thrashing. return nil, nil @@ -528,7 +543,7 @@ func (p *Provider) loadPackages(dir string) ([]*packages.Package, *token.FileSet } // enrichImplements confirms existing EdgeImplements edges using go/types. -func (p *Provider) enrichImplements(g *graph.Graph, pkgs []*packages.Package, objToNode map[types.Object]string) int { +func (p *Provider) enrichImplements(g graph.Store, pkgs []*packages.Package, objToNode map[types.Object]string) int { confirmed := 0 // Collect all interfaces from the loaded packages. @@ -565,7 +580,7 @@ func (p *Provider) enrichImplements(g *graph.Graph, pkgs []*packages.Package, ob } // addMissingImplements discovers interface implementations that tree-sitter missed. -func (p *Provider) addMissingImplements(g *graph.Graph, pkgs []*packages.Package, objToNode map[types.Object]string, absRoot string) int { +func (p *Provider) addMissingImplements(g graph.Store, pkgs []*packages.Package, objToNode map[types.Object]string, absRoot string) int { added := 0 // Collect interfaces and concrete types. @@ -619,7 +634,7 @@ func (p *Provider) addMissingImplements(g *graph.Graph, pkgs []*packages.Package } // findContainingFunc finds the Gortex function/method node that contains the given position. -func findContainingFunc(g *graph.Graph, pkgs []*packages.Package, fset *token.FileSet, absRoot string, pos token.Position) *graph.Node { +func findContainingFunc(g graph.Store, pkgs []*packages.Package, fset *token.FileSet, absRoot string, pos token.Position) *graph.Node { relPath := relativePath(pos.Filename, absRoot) if relPath == "" { return nil diff --git a/internal/semantic/lsp/provider.go b/internal/semantic/lsp/provider.go index e6b868fb..ded691c3 100644 --- a/internal/semantic/lsp/provider.go +++ b/internal/semantic/lsp/provider.go @@ -177,7 +177,7 @@ func (p *Provider) Close() error { return nil } -func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResult, error) { +func (p *Provider) Enrich(g graph.Store, repoRoot string) (*semantic.EnrichResult, error) { start := time.Now() absRoot, err := filepath.Abs(repoRoot) @@ -268,6 +268,11 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu // Query hover info for nodes to enrich metadata. enrichedNodes := make(map[string]bool) + // EnrichNodeMeta mutates Node.Meta in place; on disk backends n is a + // per-call AllNodes reconstruction, so collect stamped nodes and + // round-trip them through the store at the end or the semantic_type + // stamp is discarded on Ladybug. See semantic.EnrichNodeMeta. + var stampedNodes []*graph.Node for _, n := range g.AllNodes() { if n.Kind == graph.KindFile || n.Kind == graph.KindImport { continue @@ -300,6 +305,7 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu typeInfo := extractTypeFromHover(hoverResult.Contents.Value) if typeInfo != "" { semantic.EnrichNodeMeta(n, "semantic_type", typeInfo, p.Name()) + stampedNodes = append(stampedNodes, n) if !enrichedNodes[n.ID] { result.NodesEnriched++ result.SymbolsCovered++ @@ -307,6 +313,9 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu } } } + if len(stampedNodes) > 0 { + g.AddBatch(stampedNodes, nil) + } // Query implementations for interface nodes. for _, n := range g.AllNodes() { @@ -406,7 +415,7 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu return result, nil } -func (p *Provider) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*semantic.EnrichResult, error) { +func (p *Provider) EnrichFile(g graph.Store, repoRoot, filePath string) (*semantic.EnrichResult, error) { // LSP supports incremental updates, but for simplicity we skip it. // The full Enrich pass handles this. return nil, nil @@ -1157,7 +1166,7 @@ func (p *Provider) Source(repoRoot, relPath string) []byte { // matching ast_inferred / text_matched EdgeCalls to lsp_resolved, or // add a fresh EdgeCalls when the AST extractor missed the link // (cross-file calls in languages without compile-unit info). -func (p *Provider) enrichCallHierarchy(g *graph.Graph, absRoot string, result *semantic.EnrichResult) { +func (p *Provider) enrichCallHierarchy(g graph.Store, absRoot string, result *semantic.EnrichResult) { for _, n := range g.AllNodes() { if n.Kind != graph.KindFunction && n.Kind != graph.KindMethod { continue @@ -1191,7 +1200,7 @@ func (p *Provider) enrichCallHierarchy(g *graph.Graph, absRoot string, result *s // asOutgoing=true means "this node calls other"; false means "other // calls this node" (incoming-calls direction). Existing edges get // promoted to lsp_resolved; missing edges get added. -func (p *Provider) recordHierarchyCall(g *graph.Graph, absRoot string, n *graph.Node, other CallHierarchyItem, asOutgoing bool, result *semantic.EnrichResult) { +func (p *Provider) recordHierarchyCall(g graph.Store, absRoot string, n *graph.Node, other CallHierarchyItem, asOutgoing bool, result *semantic.EnrichResult) { otherPath := uriToPath(other.URI, absRoot) if otherPath == "" { return @@ -1232,7 +1241,7 @@ func (p *Provider) recordHierarchyCall(g *graph.Graph, absRoot string, n *graph. // T → super when the super is an interface kind. // - subtypes(T) = the children of T. Emits EdgeImplements child // → T when T is an interface; EdgeExtends otherwise. -func (p *Provider) enrichTypeHierarchy(g *graph.Graph, absRoot string, result *semantic.EnrichResult) { +func (p *Provider) enrichTypeHierarchy(g graph.Store, absRoot string, result *semantic.EnrichResult) { for _, n := range g.AllNodes() { if n.Kind != graph.KindType && n.Kind != graph.KindInterface { continue @@ -1267,7 +1276,7 @@ func (p *Provider) enrichTypeHierarchy(g *graph.Graph, absRoot string, result *s // whose name matches a method on the parent — closing the // method-level half of the type hierarchy (Joern calls these // CONTAINS + OVERRIDES). -func (p *Provider) linkTypeHierarchy(g *graph.Graph, absRoot string, cur *graph.Node, other TypeHierarchyItem, asSupertype bool, result *semantic.EnrichResult) { +func (p *Provider) linkTypeHierarchy(g graph.Store, absRoot string, cur *graph.Node, other TypeHierarchyItem, asSupertype bool, result *semantic.EnrichResult) { otherPath := uriToPath(other.URI, absRoot) if otherPath == "" { return @@ -1313,7 +1322,7 @@ func (p *Provider) linkTypeHierarchy(g *graph.Graph, absRoot string, cur *graph. // origin lets the caller stamp the edges with lsp_dispatch (LSP- // confirmed parent), ast_resolved (AST-confirmed parent in the same // compilation unit), or ast_inferred (parent is a heuristic match). -func addOverrideEdges(g *graph.Graph, child, parent *graph.Node, provider, origin string, result *semantic.EnrichResult) { +func addOverrideEdges(g graph.Store, child, parent *graph.Node, provider, origin string, result *semantic.EnrichResult) { if child == nil || parent == nil || child.ID == parent.ID { return } diff --git a/internal/semantic/lsp/resolver_helper_integration_test.go b/internal/semantic/lsp/resolver_helper_integration_test.go new file mode 100644 index 00000000..5e327a4d --- /dev/null +++ b/internal/semantic/lsp/resolver_helper_integration_test.go @@ -0,0 +1,115 @@ +package lsp + +import ( + "os" + "os/exec" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/zap" +) + +// TestResolverHelper_RealTsserver_DefinitionAcrossFiles spins up a +// real typescript-language-server against a tiny on-disk TS fixture +// and asserts the helper resolves a cross-file method call to the +// correct declaration. Skips when typescript-language-server isn't +// on PATH (CI / dev machines without npm install). +// +// This is the load-bearing N5 integration check: the unit tests in +// resolver_registry_test.go cover dispatch logic with a scripted +// stub; this test verifies the underlying LSP-protocol wiring +// (initialize → didOpen → textDocument/definition → response) lands +// on a real graph file path. +func TestResolverHelper_RealTsserver_DefinitionAcrossFiles(t *testing.T) { + if _, err := exec.LookPath("typescript-language-server"); err != nil { + t.Skip("typescript-language-server not on PATH — skip integration test (run `npm i -g typescript-language-server typescript` to enable)") + } + + workspace := t.TempDir() + mustWrite(t, filepath.Join(workspace, "tsconfig.json"), `{"compilerOptions":{"target":"ES2020","module":"commonjs","strict":false}}`) + // Use a method on a class to avoid the import-binding ambiguity: + // tsserver's textDocument/definition on a method invocation + // reliably returns the method declaration, even with TS's + // declaration-merging. + mustWrite(t, filepath.Join(workspace, "lib.ts"), `export class Worker { + doWork(x: number): number { + return x + 1; + } +} +`) + mustWrite(t, filepath.Join(workspace, "caller.ts"), `import { Worker } from "./lib"; + +export function callIt(): number { + const w = new Worker(); + return w.doWork(42); +} +`) + + spec := SpecByName("typescript-language-server") + require.NotNil(t, spec, "TS spec must be in registry") + + provider := NewProviderFromSpec(spec, zap.NewNop()) + helper := NewResolverHelper(provider, workspace, 10*time.Second, zap.NewNop()) + defer func() { _ = helper.Close() }() + + // Warm tsserver up by asking once and discarding the result — + // the workspace project graph loads asynchronously and the first + // definition request often races the workspace warmup. A retry + // loop tolerates 1-2 cold attempts. + var ( + defPath string + defLine int + ok bool + ) + deadline := time.Now().Add(8 * time.Second) + for { + defPath, defLine, ok = helper.Definition("caller.ts", 5, "doWork") + if ok && defPath == "lib.ts" { + break + } + if time.Now().After(deadline) { + break + } + time.Sleep(250 * time.Millisecond) + } + + require.True(t, ok, "tsserver should eventually resolve doWork across files") + assert.Equal(t, "lib.ts", defPath, "definition lives in lib.ts") + // lib.ts: line 1 = `export class Worker {`, line 2 = ` doWork(...) {` + assert.Equal(t, 2, defLine) +} + +// TestResolverHelper_RealTsserver_NoMatchReturnsFalse — when the +// identifier on the requested line doesn't resolve to anything +// (typo, missing import), the helper returns ok=false rather than +// inventing a location. +func TestResolverHelper_RealTsserver_NoMatchReturnsFalse(t *testing.T) { + if _, err := exec.LookPath("typescript-language-server"); err != nil { + t.Skip("typescript-language-server not on PATH") + } + + workspace := t.TempDir() + mustWrite(t, filepath.Join(workspace, "tsconfig.json"), `{"compilerOptions":{"target":"ES2020","module":"commonjs","strict":false}}`) + mustWrite(t, filepath.Join(workspace, "foo.ts"), `// no identifiers worth resolving here +const a = 1; +`) + + spec := SpecByName("typescript-language-server") + provider := NewProviderFromSpec(spec, zap.NewNop()) + helper := NewResolverHelper(provider, workspace, 5*time.Second, zap.NewNop()) + defer func() { _ = helper.Close() }() + + // "ghostFunction" doesn't appear on line 2 — tsserver should + // return an empty location set, the helper should report + // ok=false, the resolver falls through to heuristics. + _, _, ok := helper.Definition("foo.ts", 2, "ghostFunction") + assert.False(t, ok) +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + require.NoError(t, os.WriteFile(path, []byte(content), 0644)) +} diff --git a/internal/semantic/manager.go b/internal/semantic/manager.go index b12e8432..e251e15b 100644 --- a/internal/semantic/manager.go +++ b/internal/semantic/manager.go @@ -101,7 +101,7 @@ func (m *Manager) LSPRouter() LSPRouter { // EnrichAll runs all available providers against the graph. // For each language, only the highest-priority available provider runs. -func (m *Manager) EnrichAll(g *graph.Graph, roots map[string]string) ([]*EnrichResult, error) { +func (m *Manager) EnrichAll(g graph.Store, roots map[string]string) ([]*EnrichResult, error) { if !m.config.Enabled { return nil, nil } @@ -202,7 +202,7 @@ func (m *Manager) configPriorityFor(name string) (int, bool) { // repo root and appends the results. Extracted so EnrichAll can share // the logging + lastResults bookkeeping between eager and Router-backed // providers. -func (m *Manager) runEnrichForProvider(g *graph.Graph, roots map[string]string, lang string, provider Provider, results []*EnrichResult) []*EnrichResult { +func (m *Manager) runEnrichForProvider(g graph.Store, roots map[string]string, lang string, provider Provider, results []*EnrichResult) []*EnrichResult { for repoName, repoRoot := range roots { start := time.Now() m.logger.Info("semantic enrichment starting", @@ -245,7 +245,7 @@ func (m *Manager) runEnrichForProvider(g *graph.Graph, roots map[string]string, } // EnrichFile runs incremental enrichment for a single file change. -func (m *Manager) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*EnrichResult, error) { +func (m *Manager) EnrichFile(g graph.Store, repoRoot, filePath string) (*EnrichResult, error) { if !m.config.Enabled || !m.config.EnrichOnWatch { return nil, nil } diff --git a/internal/semantic/manager_test.go b/internal/semantic/manager_test.go index 3a9cd906..26609c3e 100644 --- a/internal/semantic/manager_test.go +++ b/internal/semantic/manager_test.go @@ -15,7 +15,7 @@ type mockProvider struct { name string languages []string available bool - enrichFunc func(g *graph.Graph, root string) (*EnrichResult, error) + enrichFunc func(g graph.Store, root string) (*EnrichResult, error) closed bool } @@ -24,7 +24,7 @@ func (m *mockProvider) Languages() []string { return m.languages } func (m *mockProvider) Available() bool { return m.available } func (m *mockProvider) Close() error { m.closed = true; return nil } -func (m *mockProvider) Enrich(g *graph.Graph, repoRoot string) (*EnrichResult, error) { +func (m *mockProvider) Enrich(g graph.Store, repoRoot string) (*EnrichResult, error) { if m.enrichFunc != nil { return m.enrichFunc(g, repoRoot) } @@ -37,7 +37,7 @@ func (m *mockProvider) Enrich(g *graph.Graph, repoRoot string) (*EnrichResult, e }, nil } -func (m *mockProvider) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*EnrichResult, error) { +func (m *mockProvider) EnrichFile(g graph.Store, repoRoot, filePath string) (*EnrichResult, error) { return nil, nil } @@ -87,7 +87,7 @@ func TestManager_PrioritySelection(t *testing.T) { name: "high-priority", languages: []string{"go"}, available: true, - enrichFunc: func(g *graph.Graph, root string) (*EnrichResult, error) { + enrichFunc: func(g graph.Store, root string) (*EnrichResult, error) { highCalled = true return &EnrichResult{Provider: "high-priority", Language: "go"}, nil }, @@ -96,7 +96,7 @@ func TestManager_PrioritySelection(t *testing.T) { name: "low-priority", languages: []string{"go"}, available: true, - enrichFunc: func(g *graph.Graph, root string) (*EnrichResult, error) { + enrichFunc: func(g graph.Store, root string) (*EnrichResult, error) { lowCalled = true return &EnrichResult{Provider: "low-priority", Language: "go"}, nil }, diff --git a/internal/semantic/matcher.go b/internal/semantic/matcher.go index f5a677e5..6d15c723 100644 --- a/internal/semantic/matcher.go +++ b/internal/semantic/matcher.go @@ -48,7 +48,7 @@ func (m *SymbolMap) Size() int { // MatchNodeByFileLine finds a Gortex node by file path and line number. // This is the primary matching strategy for SCIP and LSP results. // It finds the innermost (smallest range) non-file node containing the line. -func MatchNodeByFileLine(g *graph.Graph, filePath string, line int) *graph.Node { +func MatchNodeByFileLine(g graph.Store, filePath string, line int) *graph.Node { nodes := g.GetFileNodes(filePath) // First: find the innermost node containing this line (smallest range). @@ -89,12 +89,12 @@ func MatchNodeByFileLine(g *graph.Graph, filePath string, line int) *graph.Node } // MatchNodeByQualName finds a Gortex node by qualified name. -func MatchNodeByQualName(g *graph.Graph, qualName string) *graph.Node { +func MatchNodeByQualName(g graph.Store, qualName string) *graph.Node { return g.GetNodeByQualName(qualName) } // MatchNodeByNameInFile finds a Gortex node by name within a specific file. -func MatchNodeByNameInFile(g *graph.Graph, name, filePath string) *graph.Node { +func MatchNodeByNameInFile(g graph.Store, name, filePath string) *graph.Node { nodes := g.GetFileNodes(filePath) for _, n := range nodes { if n.Name == name { diff --git a/internal/semantic/provider.go b/internal/semantic/provider.go index 44bca818..20ff262f 100644 --- a/internal/semantic/provider.go +++ b/internal/semantic/provider.go @@ -20,12 +20,12 @@ type Provider interface { // Enrich performs a full enrichment pass over the graph for the given repo root. // It upgrades edge confidence, adds missing edges, and fills Node.Meta fields. // Called after tree-sitter indexing + resolver pass completes. - Enrich(g *graph.Graph, repoRoot string) (*EnrichResult, error) + Enrich(g graph.Store, repoRoot string) (*EnrichResult, error) // EnrichFile performs a targeted enrichment for a single file and its // immediate dependents. Used in watch mode for incremental updates. // Returns nil result if incremental enrichment is not supported. - EnrichFile(g *graph.Graph, repoRoot string, filePath string) (*EnrichResult, error) + EnrichFile(g graph.Store, repoRoot string, filePath string) (*EnrichResult, error) // Close releases any resources held by the provider (daemon processes, // temp files, connections). diff --git a/internal/semantic/scip/provider.go b/internal/semantic/scip/provider.go index 16c628c4..7877b4a1 100644 --- a/internal/semantic/scip/provider.go +++ b/internal/semantic/scip/provider.go @@ -61,7 +61,7 @@ func (p *Provider) Available() bool { return err == nil } -func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResult, error) { +func (p *Provider) Enrich(g graph.Store, repoRoot string) (*semantic.EnrichResult, error) { start := time.Now() // Run the SCIP indexer. @@ -86,7 +86,7 @@ func (p *Provider) Enrich(g *graph.Graph, repoRoot string) (*semantic.EnrichResu return result, nil } -func (p *Provider) EnrichFile(g *graph.Graph, repoRoot, filePath string) (*semantic.EnrichResult, error) { +func (p *Provider) EnrichFile(g graph.Store, repoRoot, filePath string) (*semantic.EnrichResult, error) { // SCIP doesn't support incremental indexing well — re-run full enrichment. // For large repos, this should be gated by the watch debounce. return nil, nil @@ -142,7 +142,7 @@ func (p *Provider) runIndexer(repoRoot string) (string, error) { } // enrichFromIndex maps SCIP data to the Gortex graph. -func (p *Provider) enrichFromIndex(g *graph.Graph, index *SCIPIndex, repoRoot string) *semantic.EnrichResult { +func (p *Provider) enrichFromIndex(g graph.Store, index *SCIPIndex, repoRoot string) *semantic.EnrichResult { result := &semantic.EnrichResult{} symMap := semantic.NewSymbolMap() @@ -272,6 +272,11 @@ func (p *Provider) enrichFromIndex(g *graph.Graph, index *SCIPIndex, repoRoot st } // Phase 4: Enrich node metadata from symbol documentation. + // Collect stamped nodes and round-trip them through the store at the + // end — EnrichNodeMeta mutates Node.Meta in place, which does not + // persist on disk backends (GetNode returns a per-call copy). See + // semantic.EnrichNodeMeta. + var stampedNodes []*graph.Node for _, doc := range index.Documents { for _, sym := range doc.Symbols { nodeID, ok := symMap.GortexID(sym.Symbol) @@ -289,16 +294,20 @@ func (p *Provider) enrichFromIndex(g *graph.Graph, index *SCIPIndex, repoRoot st if typeInfo != "" { semantic.EnrichNodeMeta(node, "semantic_type", typeInfo, p.Name()) result.NodesEnriched++ + stampedNodes = append(stampedNodes, node) } } } } + if len(stampedNodes) > 0 { + g.AddBatch(stampedNodes, nil) + } return result } // findContainingNode finds the innermost Gortex node that contains the given line. -func findContainingNode(g *graph.Graph, filePath string, line int) *graph.Node { +func findContainingNode(g graph.Store, filePath string, line int) *graph.Node { nodes := g.GetFileNodes(filePath) var best *graph.Node bestSize := int(^uint(0) >> 1) diff --git a/internal/server/dashboard.go b/internal/server/dashboard.go index e10b09fa..77db06f0 100644 --- a/internal/server/dashboard.go +++ b/internal/server/dashboard.go @@ -175,7 +175,7 @@ func splitOwner(prefix string) (owner, name string) { return "", prefix } -func reposFromGraph(g *graph.Graph) []repoEntry { +func reposFromGraph(g graph.Store) []repoEntry { stats := g.RepoStats() out := make([]repoEntry, 0, len(stats)) for prefix, s := range stats { @@ -1326,7 +1326,7 @@ func (h *Handler) handleCaveats(w http.ResponseWriter, r *http.Request) { // graph. Entries with an unresolvable symbol (e.g. cycle placeholders // or stale IDs from a prior index) are left untouched so the caller can // detect the gap instead of rendering zeros that look like real data. -func enrichCaveats(g *graph.Graph, caveats []caveatEntry) { +func enrichCaveats(g graph.Store, caveats []caveatEntry) { if g == nil { return } diff --git a/internal/server/handler.go b/internal/server/handler.go index 67fdd3c0..ae61faea 100644 --- a/internal/server/handler.go +++ b/internal/server/handler.go @@ -50,7 +50,7 @@ import ( // SetConfigManager / SetEventHub after construction. type Handler struct { mcpServer *mcpserver.MCPServer - graph *graph.Graph + graph graph.Store version string logger *zap.Logger mux *http.ServeMux @@ -65,7 +65,7 @@ type Handler struct { } // NewHandler creates an HTTP handler that dispatches to MCP tools. -func NewHandler(mcpServer *mcpserver.MCPServer, g *graph.Graph, version string, logger *zap.Logger) *Handler { +func NewHandler(mcpServer *mcpserver.MCPServer, g graph.Store, version string, logger *zap.Logger) *Handler { h := &Handler{ mcpServer: mcpServer, graph: g, @@ -84,7 +84,7 @@ func NewHandler(mcpServer *mcpserver.MCPServer, g *graph.Graph, version string, func (h *Handler) Mux() *http.ServeMux { return h.mux } // Graph returns the graph instance for sub-handlers that need direct access. -func (h *Handler) Graph() *graph.Graph { return h.graph } +func (h *Handler) Graph() graph.Store { return h.graph } // SetEventHub wires the watch-mode event hub so /v1/events can stream // graph-change events to subscribers, and starts the activity-buffer diff --git a/internal/skills/build.go b/internal/skills/build.go index 966132eb..8284d2d2 100644 --- a/internal/skills/build.go +++ b/internal/skills/build.go @@ -19,7 +19,7 @@ type BuildOpts struct { // Returns (nil, "") when no community meets the MinSize threshold — // callers treat both outputs as opaque payloads and pass them through // to adapters via agents.Env. -func Build(g *graph.Graph, opts BuildOpts) ([]GeneratedSkill, string) { +func Build(g graph.Store, opts BuildOpts) ([]GeneratedSkill, string) { if g == nil { return nil, "" } diff --git a/internal/skills/generator.go b/internal/skills/generator.go index ef69be3c..0e4cf6df 100644 --- a/internal/skills/generator.go +++ b/internal/skills/generator.go @@ -16,7 +16,7 @@ import ( type Generator struct { communities *analysis.CommunityResult processes *analysis.ProcessResult - graph *graph.Graph + graph graph.Store minSize int maxSkills int } @@ -30,7 +30,7 @@ type GeneratedSkill struct { } // New creates a skill generator. -func New(communities *analysis.CommunityResult, processes *analysis.ProcessResult, g *graph.Graph) *Generator { +func New(communities *analysis.CommunityResult, processes *analysis.ProcessResult, g graph.Store) *Generator { return &Generator{ communities: communities, processes: processes, diff --git a/internal/sql/registry.go b/internal/sql/registry.go index 085d0b31..41aaaa5d 100644 --- a/internal/sql/registry.go +++ b/internal/sql/registry.go @@ -44,7 +44,7 @@ type RebuildStats struct { // Returns counts for telemetry; rebuilt edges idempotently replace // any existing edges with the same edgeKey, so a second call after // the first reports tablesCreated=0, emittersLinked=0. -func RebuildTablesFromStringRegistry(g *graph.Graph) RebuildStats { +func RebuildTablesFromStringRegistry(g graph.Store) RebuildStats { if g == nil { return RebuildStats{} } diff --git a/internal/thirdparty/go-ladybug/LICENSE b/internal/thirdparty/go-ladybug/LICENSE new file mode 100644 index 00000000..3939a23a --- /dev/null +++ b/internal/thirdparty/go-ladybug/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022-2025 Kùzu Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/internal/thirdparty/go-ladybug/README.md b/internal/thirdparty/go-ladybug/README.md new file mode 100644 index 00000000..bb88bc03 --- /dev/null +++ b/internal/thirdparty/go-ladybug/README.md @@ -0,0 +1,53 @@ +# go-ladybug +[![Go Reference](https://pkg.go.dev/badge/github.com/LadybugDB/go-ladybug.svg)](https://pkg.go.dev/github.com/LadybugDB/go-ladybug) +[![CI](https://github.com/LadybugDB/go-ladybug/actions/workflows/go.yml/badge.svg)](https://github.com/LadybugDB/go-ladybug/actions/workflows/go.yml) +[![Go Report Card](https://goreportcard.com/badge/github.com/LadybugDB/go-ladybug)](https://goreportcard.com/report/github.com/LadybugDB/go-ladybug) +[![License](https://img.shields.io/github/license/lbugdb/go-ladybug)](LICENSE) + +Official Go language binding for [LadybugDB](https://github.com/LadybugDB/ladybug). Ladybug is an embeddable property graph database management system built for query speed and scalability. For more information, please visit the [Ladybug GitHub repository](https://github.com/LadybugDB/ladybug) or the [LadybugDB website](https://ladybugdb.com). + +## Installation + +```bash +go get github.com/LadybugDB/go-ladybug +``` + +## Get started +An example project is available in the [example](example) directory. + +To run the example project, you can use the following command: + +```bash +cd example +go run main.go +``` + +## Docs +The full documentation is available at [pkg.go.dev](https://pkg.go.dev/github.com/LadybugDB/go-ladybug). + +## Tests +To run the tests, you can use the following command: + +```bash +go test -v +``` + +## Windows Support +For Cgo to properly work on Windows, MSYS2 with `UCRT64` environment is required. You can follow the instructions below to set it up: +1. Install MSYS2 from [here](https://www.msys2.org/). +2. Install Microsoft Visual C++ 2015-2022 Redistributable (x64) from [here](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170). +3. Install the required packages by running the following command in the MSYS2 terminal: + ```bash + pacman -S mingw-w64-ucrt-x86_64-go mingw-w64-ucrt-x86_64-gcc + ``` +4. Add the path to `lbug_shared.dll` to your `PATH` environment variable. You can do this by running the following command in the MSYS2 terminal: + ```bash + export PATH="$(pwd)/lib/dynamic/windows:$PATH" + ``` + This is required to run the test cases and examples. If you are deploying your application, you can also copy the `lbug_shared.dll` file to the same directory as your executable or to a directory that is already in the `PATH`. + +For an example of how to properly set up the environment, you can also refer to our CI configuration file [here](.github/workflows/go.yml). + +## Contributing +We welcome contributions to go-ladybug. By contributing to go-ladybug, you agree that your contributions will be licensed under the [MIT License](LICENSE). Please read the [contributing guide](CONTRIBUTING.md) for more information. + diff --git a/internal/thirdparty/go-ladybug/cgo_shared.go b/internal/thirdparty/go-ladybug/cgo_shared.go new file mode 100644 index 00000000..0da860c0 --- /dev/null +++ b/internal/thirdparty/go-ladybug/cgo_shared.go @@ -0,0 +1,62 @@ +package lbug + +//go:generate bash ../../../scripts/fetch-lbug.sh + +/* +// liblbug is fetched by scripts/fetch-lbug.sh (not committed). +// +// linux + darwin: STATIC — liblbug.a is linked in (only the archive +// lives in lib/static/-/, so `-llbug` resolves to it) for a +// self-contained binary with no runtime lib to ship. The C++ runtime is +// linked too: libc++ on darwin (system, always present); libstdc++ + +// libgcc statically on linux so the binary doesn't need them at runtime. +// +// windows: DYNAMIC — lbug's windows release is MSVC-built (its C++ +// runtime is MSVCP140/VCRUNTIME140), which cannot be statically linked +// into a mingw binary. The .exe links directly against lbug_shared.dll +// (mingw ld reads the DLL's clean C ABI export table via -l:, so +// no import lib / gendef is needed) and ships the DLL — plus the VC++ +// runtime — alongside the .exe at runtime. +// FTS extensions + dlopen: liblbug loads its FTS (and other) extensions +// via dlopen at runtime, and those extension .so/.dylibs resolve liblbug's +// C++ symbols (e.g. typeinfo for lbug::catalog::IndexAuxInfo) FROM THE HOST +// PROCESS. When liblbug is a shared lib those symbols are globally visible; +// static-linked, two things must be true at link time: +// +// 1. the symbol must be PRESENT in the binary. Most of the symbols the +// extension needs are C++ RTTI (typeinfo/vtable) emitted as weak +// COMDAT data in liblbug.a. gortex's plain-C API calls never trigger +// RTTI, so nothing in the link references them, so demand-driven +// archive selection DROPS those object files entirely. -rdynamic +// cannot export a symbol that was never linked in. --whole-archive +// around -llbug forces every liblbug object (and thus every weak +// typeinfo/vtable) into the binary, exactly as a shared liblbug would +// expose them. --no-whole-archive turns it back off before the system +// libs so we don't try to whole-archive libstdc++/libm/etc. +// 2. the symbol must be EXPORTED in the dynamic symbol table so the +// dlopen'd extension can bind to it: -rdynamic (clang -> -export_dynamic, +// gcc -> --export-dynamic). +// +// darwin doesn't need --whole-archive: ld64 pulls the typeinfo objects in +// on its own, so -rdynamic alone suffices there. +// +// --whole-archive is NOT on cgo's #cgo LDFLAGS allowlist, so the linux +// build paths export CGO_LDFLAGS_ALLOW='-Wl,--(no-)?whole-archive' (Makefile +// / CI test job / release goreleaser env). Without it the linux build fails +// with "invalid flag in #cgo LDFLAGS". -rdynamic IS on the allowlist. +#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/lib/static/darwin-amd64 -llbug -lc++ -rdynamic +#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/lib/static/darwin-arm64 -llbug -lc++ -rdynamic +// libstdc++ is wrapped in -Wl,-Bstatic/-Bdynamic (NOT -static-libstdc++): +// cgo may link the final binary with the C driver (gcc), which never +// auto-appends libstdc++, so -static-libstdc++ could be a no-op and the +// explicit -lstdc++ would resolve to libstdc++.so.6 at runtime — +// defeating the self-contained goal. -Bstatic forces the .a. libm/dl/ +// pthread stay dynamic (system libs always present); libgcc is statically +// linked via -static-libgcc. --export-dynamic exposes liblbug's symbols +// for the dlopen'd FTS extension (see darwin note above). +#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/lib/static/linux-amd64 -Wl,--whole-archive -llbug -Wl,--no-whole-archive -Wl,-Bstatic -lstdc++ -Wl,-Bdynamic -lm -ldl -lpthread -static-libgcc -rdynamic +#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/lib/static/linux-arm64 -Wl,--whole-archive -llbug -Wl,--no-whole-archive -Wl,-Bstatic -lstdc++ -Wl,-Bdynamic -lm -ldl -lpthread -static-libgcc -rdynamic +#cgo windows LDFLAGS: -L${SRCDIR}/lib/dynamic/windows -l:lbug_shared.dll +#include "lbug.h" +*/ +import "C" diff --git a/internal/thirdparty/go-ladybug/connection.go b/internal/thirdparty/go-ladybug/connection.go new file mode 100644 index 00000000..266c9f9c --- /dev/null +++ b/internal/thirdparty/go-ladybug/connection.go @@ -0,0 +1,147 @@ +package lbug + +// #include "lbug.h" +// #include +import "C" + +import ( + "fmt" + "runtime" + "unsafe" +) + +// Connection represents a connection to a Lbug database. +type Connection struct { + cConnection C.lbug_connection + database *Database + isClosed bool +} + +// OpenConnection opens a connection to the specified database. +func OpenConnection(database *Database) (*Connection, error) { + conn := &Connection{} + conn.database = database + runtime.SetFinalizer(conn, func(conn *Connection) { + conn.Close() + }) + status := C.lbug_connection_init(&database.cDatabase, &conn.cConnection) + if status != C.LbugSuccess { + return conn, fmt.Errorf("failed to open connection with status %d", status) + } + return conn, nil +} + +// Close closes the Connection. Calling this method is optional. +// The Connection will be closed automatically when it is garbage collected. +func (conn *Connection) Close() { + if conn.isClosed { + return + } + C.lbug_connection_destroy(&conn.cConnection) + conn.isClosed = true +} + +// GetMaxNumThreads returns the maximum number of threads that can be used for +// executing a query in parallel. +func (conn *Connection) GetMaxNumThreads() uint64 { + numThreads := C.uint64_t(0) + C.lbug_connection_get_max_num_thread_for_exec(&conn.cConnection, &numThreads) + return uint64(numThreads) +} + +// SetMaxNumThreads sets the maximum number of threads that can be used for +// executing a query in parallel. +func (conn *Connection) SetMaxNumThreads(numThreads uint64) { + C.lbug_connection_set_max_num_thread_for_exec(&conn.cConnection, C.uint64_t(numThreads)) +} + +// Interrupt interrupts the execution of the current query on the connection. +func (conn *Connection) Interrupt() { + C.lbug_connection_interrupt(&conn.cConnection) +} + +// SetTimeout sets the timeout for the queries executed on the connection. +// The timeout is specified in milliseconds. A value of 0 means no timeout. +// If a query takes longer than the specified timeout, it will be interrupted. +func (conn *Connection) SetTimeout(timeout uint64) { + C.lbug_connection_set_query_timeout(&conn.cConnection, C.uint64_t(timeout)) +} + +// Query executes the specified query string and returns the result. +func (conn *Connection) Query(query string) (*QueryResult, error) { + cQuery := C.CString(query) + defer C.free(unsafe.Pointer(cQuery)) + queryResult := &QueryResult{} + queryResult.connection = conn + runtime.SetFinalizer(queryResult, func(queryResult *QueryResult) { + queryResult.Close() + }) + status := C.lbug_connection_query(&conn.cConnection, cQuery, &queryResult.cQueryResult) + if status != C.LbugSuccess || !C.lbug_query_result_is_success(&queryResult.cQueryResult) { + cErrMsg := C.lbug_query_result_get_error_message(&queryResult.cQueryResult) + defer C.lbug_destroy_string(cErrMsg) + return queryResult, fmt.Errorf(C.GoString(cErrMsg)) + } + return queryResult, nil +} + +// Execute executes the specified prepared statement with the specified arguments and returns the result. +// The arguments are a map of parameter names to values. +func (conn *Connection) Execute(preparedStatement *PreparedStatement, args map[string]any) (*QueryResult, error) { + queryResult := &QueryResult{} + queryResult.connection = conn + for key, value := range args { + err := conn.bindParameter(preparedStatement, key, value) + if err != nil { + return queryResult, err + } + } + runtime.SetFinalizer(queryResult, func(queryResult *QueryResult) { + queryResult.Close() + }) + status := C.lbug_connection_execute(&conn.cConnection, &preparedStatement.cPreparedStatement, &queryResult.cQueryResult) + if status != C.LbugSuccess || !C.lbug_query_result_is_success(&queryResult.cQueryResult) { + cErrMsg := C.lbug_query_result_get_error_message(&queryResult.cQueryResult) + defer C.lbug_destroy_string(cErrMsg) + return queryResult, fmt.Errorf(C.GoString(cErrMsg)) + } + return queryResult, nil +} + +// BindParameter binds a parameter to the prepared statement. +func (conn *Connection) bindParameter(preparedStatement *PreparedStatement, key string, value any) error { + cKey := C.CString(key) + defer C.free(unsafe.Pointer(cKey)) + var status C.lbug_state + var cValue *C.lbug_value + var valueConversionError error + cValue, valueConversionError = goValueToLbugValue(value) + if valueConversionError != nil { + return fmt.Errorf("failed to convert Go value to Lbug value: %v", valueConversionError) + } + defer C.lbug_value_destroy(cValue) + status = C.lbug_prepared_statement_bind_value(&preparedStatement.cPreparedStatement, cKey, cValue) + if status != C.LbugSuccess { + return fmt.Errorf("failed to bind value with status %d", status) + } + return nil +} + +// Prepare returns a prepared statement for the specified query string. +// The prepared statement can be used to execute the query with parameters. +func (conn *Connection) Prepare(query string) (*PreparedStatement, error) { + cQuery := C.CString(query) + defer C.free(unsafe.Pointer(cQuery)) + preparedStatement := &PreparedStatement{} + preparedStatement.connection = conn + runtime.SetFinalizer(preparedStatement, func(preparedStatement *PreparedStatement) { + preparedStatement.Close() + }) + status := C.lbug_connection_prepare(&conn.cConnection, cQuery, &preparedStatement.cPreparedStatement) + if status != C.LbugSuccess || !C.lbug_prepared_statement_is_success(&preparedStatement.cPreparedStatement) { + cErrMsg := C.lbug_prepared_statement_get_error_message(&preparedStatement.cPreparedStatement) + defer C.lbug_destroy_string(cErrMsg) + return preparedStatement, fmt.Errorf(C.GoString(cErrMsg)) + } + return preparedStatement, nil +} diff --git a/internal/thirdparty/go-ladybug/database.go b/internal/thirdparty/go-ladybug/database.go new file mode 100644 index 00000000..b719b495 --- /dev/null +++ b/internal/thirdparty/go-ladybug/database.go @@ -0,0 +1,92 @@ +// Package lbug provides a Go interface to Lbug graph database management system. +// The package is a wrapper around the C API of Lbug. +package lbug + +// #include "lbug.h" +// #include +import "C" +import ( + "fmt" + "runtime" + "unsafe" +) + +// SystemConfig represents the configuration of Lbug database system. +// BufferPoolSize is the size of the buffer pool in bytes. +// MaxNumThreads is the maximum number of threads that can be used by the database system. +// EnableCompression is a boolean flag to enable or disable compression. +// ReadOnly is a boolean flag to open the database in read-only mode. +// MaxDbSize is the maximum size of the database in bytes. +type SystemConfig struct { + BufferPoolSize uint64 + MaxNumThreads uint64 + EnableCompression bool + ReadOnly bool + MaxDbSize uint64 +} + +// DefaultSystemConfig returns the default system configuration. +// The default system configuration is as follows: +// BufferPoolSize: 80% of the total system memory. +// MaxNumThreads: Number of CPU cores. +// EnableCompression: true. +// ReadOnly: false. +// MaxDbSize: 0 (unlimited). +func DefaultSystemConfig() SystemConfig { + cSystemConfig := C.lbug_default_system_config() + return SystemConfig{ + BufferPoolSize: uint64(cSystemConfig.buffer_pool_size), + MaxNumThreads: uint64(cSystemConfig.max_num_threads), + EnableCompression: bool(cSystemConfig.enable_compression), + ReadOnly: bool(cSystemConfig.read_only), + MaxDbSize: uint64(cSystemConfig.max_db_size), + } +} + +// toC converts the SystemConfig Go struct to the C struct. +func (config SystemConfig) toC() C.lbug_system_config { + cSystemConfig := C.lbug_default_system_config() + cSystemConfig.buffer_pool_size = C.uint64_t(config.BufferPoolSize) + cSystemConfig.max_num_threads = C.uint64_t(config.MaxNumThreads) + cSystemConfig.enable_compression = C.bool(config.EnableCompression) + cSystemConfig.read_only = C.bool(config.ReadOnly) + cSystemConfig.max_db_size = C.uint64_t(config.MaxDbSize) + return cSystemConfig +} + +// Database represents a Lbug database instance. +type Database struct { + cDatabase C.lbug_database + isClosed bool +} + +// OpenDatabase opens a Lbug database at the given path with the given system configuration. +func OpenDatabase(path string, systemConfig SystemConfig) (*Database, error) { + db := &Database{} + runtime.SetFinalizer(db, func(db *Database) { + db.Close() + }) + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + cSystemConfig := systemConfig.toC() + status := C.lbug_database_init(cPath, cSystemConfig, &db.cDatabase) + if status != C.LbugSuccess { + return db, fmt.Errorf("failed to open database with status %d", status) + } + return db, nil +} + +// OpenInMemoryDatabase opens a Lbug database in in-memory mode with the given system configuration. +func OpenInMemoryDatabase(systemConfig SystemConfig) (*Database, error) { + return OpenDatabase(":memory:", systemConfig) +} + +// Close closes the database. Calling this method is optional. +// The database will be closed automatically when it is garbage collected. +func (db *Database) Close() { + if db.isClosed { + return + } + C.lbug_database_destroy(&db.cDatabase) + db.isClosed = true +} diff --git a/internal/thirdparty/go-ladybug/driver.go b/internal/thirdparty/go-ladybug/driver.go new file mode 100644 index 00000000..c8c24e25 --- /dev/null +++ b/internal/thirdparty/go-ladybug/driver.go @@ -0,0 +1,371 @@ +package lbug + +import ( + "context" + "database/sql" + "database/sql/driver" + "fmt" + "io" + "net/url" + "strconv" + "sync" +) + +func init() { + var _ driver.Result = new(resultSet) + var _ driver.Rows = new(rowSet) + var _ SQLConnection = new(connection) + var _ SQLStatement = new(statement) + var _ SQLConnector = new(connector) + var _ driver.DriverContext = new(sqlDriver) + sql.Register(Name, &sqlDriver{cc: map[string]driver.Connector{}}) +} + +const Name = "lbug" + +type Finalizer interface { + Close() +} + +type SQLStatement interface { + driver.Stmt + driver.StmtExecContext + driver.StmtQueryContext +} + +type SQLConnection interface { + driver.Conn + driver.Pinger + driver.ConnPrepareContext + driver.QueryerContext + driver.ExecerContext +} + +type SQLConnector interface { + driver.Connector + io.Closer +} + +type sqlDriver struct { + sync.RWMutex + cc map[string]driver.Connector +} + +// OpenConnector lbug://path?poolSize=1024&threads=1024&dbSize=1024&compression=1&readOnly=1 +func (that *sqlDriver) OpenConnector(dsn string) (driver.Connector, error) { + u, err := url.Parse(dsn) + if nil != err { + return nil, err + } + q := u.Query() + systemConfig := DefaultSystemConfig() + if err = parse(q.Get("poolSize"), func(v uint64) { + systemConfig.BufferPoolSize = v + }); nil != err { + return nil, err + } + if err = parse(q.Get("threads"), func(v uint64) { + systemConfig.MaxNumThreads = v + }); nil != err { + return nil, err + } + if err = parse(q.Get("dbSize"), func(v uint64) { + systemConfig.MaxDbSize = v + }); nil != err { + return nil, err + } + if err = parse(q.Get("compression"), func(v uint64) { + systemConfig.EnableCompression = v == uint64(1) + }); nil != err { + return nil, err + } + if err = parse(q.Get("readOnly"), func(v uint64) { + systemConfig.ReadOnly = v == uint64(1) + }); nil != err { + return nil, err + } + db, err := OpenDatabase(u.Path, systemConfig) + if nil != err { + release(db) + return nil, err + } + return &connector{ + d: that, + dsn: dsn, + db: db, + }, nil +} + +func (that *sqlDriver) Open(dsn string) (driver.Conn, error) { + if cc := func() driver.Connector { + that.RLock() + defer that.RUnlock() + + return that.cc[dsn] + }(); nil != cc { + return cc.Connect(nextContext()) + } + that.Lock() + defer that.Unlock() + + cc, err := that.OpenConnector(dsn) + if nil != err { + return nil, err + } + that.cc[dsn] = cc + return cc.Connect(nextContext()) +} + +type connector struct { + dsn string + d driver.Driver + db *Database +} + +func (that *connector) Close() error { + that.db.Close() + return nil +} + +func (that *connector) Driver() driver.Driver { + return that.d +} + +func (that *connector) Connect(ctx context.Context) (driver.Conn, error) { + conn, err := OpenConnection(that.db) + if nil != err { + release(conn) + return nil, err + } + return &connection{ + conn: conn, + }, nil +} + +type connection struct { + conn *Connection +} + +func (that *connection) Ping(ctx context.Context) error { + return nil +} + +func (that *connection) QueryContext(ctx context.Context, query string, args []driver.NamedValue) (driver.Rows, error) { + stmt, err := that.prepareContext(ctx, query) + if nil != err { + return nil, err + } + defer closeQuiet(stmt) + return stmt.QueryContext(ctx, args) +} + +func (that *connection) ExecContext(ctx context.Context, query string, args []driver.NamedValue) (driver.Result, error) { + stmt, err := that.prepareContext(ctx, query) + if nil != err { + return nil, err + } + defer closeQuiet(stmt) + return stmt.ExecContext(ctx, args) +} + +func (that *connection) PrepareContext(ctx context.Context, query string) (driver.Stmt, error) { + return that.prepareContext(ctx, query) +} + +func (that *connection) Prepare(query string) (driver.Stmt, error) { + return that.prepareContext(nextContext(), query) +} + +func (that *connection) prepareContext(_ context.Context, query string) (SQLStatement, error) { + stmt, err := that.conn.Prepare(query) + if nil != err { + release(stmt) + return nil, err + } + return &statement{ + stmt: stmt, + conn: that.conn, + query: query, + num: -1, + }, nil +} + +func (that *connection) Close() error { + that.conn.Close() + return nil +} + +func (that *connection) Begin() (driver.Tx, error) { + return &transaction{ + conn: that, + }, nil +} + +type statement struct { + stmt *PreparedStatement + conn *Connection + query string + num int // -1 +} + +func (that *statement) Close() error { + that.stmt.Close() + return nil +} + +func (that *statement) NumInput() int { + return that.num +} + +func (that *statement) ExecContext(ctx context.Context, args []driver.NamedValue) (driver.Result, error) { + raw := make(map[string]any, len(args)) + for _, arg := range args { + raw[arg.Name] = arg.Value + } + rs, err := that.conn.Execute(that.stmt, raw) + if nil != err { + release(rs) + return nil, err + } + defer rs.Close() + + return &resultSet{ + lastInsertId: 0, + rowsAffected: int64(rs.GetNumberOfRows()), + }, nil +} + +func (that *statement) Exec(args []driver.Value) (driver.Result, error) { + list := make([]driver.NamedValue, len(args)) + for i, v := range args { + na, ok := v.(sql.NamedArg) + if !ok { + return nil, fmt.Errorf("only support named arguments") + } + list[i] = driver.NamedValue{ + Name: na.Name, + Ordinal: i + 1, + Value: na.Value, + } + } + return that.ExecContext(nextContext(), list) +} + +func (that *statement) QueryContext(ctx context.Context, args []driver.NamedValue) (driver.Rows, error) { + raw := make(map[string]any, len(args)) + for _, arg := range args { + raw[arg.Name] = arg.Value + } + rs, err := that.conn.Execute(that.stmt, raw) + if nil != err { + release(rs) + return nil, err + } + return &rowSet{rs: rs}, nil +} + +func (that *statement) Query(args []driver.Value) (driver.Rows, error) { + list := make([]driver.NamedValue, len(args)) + for i, v := range args { + na, ok := v.(sql.NamedArg) + if !ok { + return nil, fmt.Errorf("only support named arguments") + } + list[i] = driver.NamedValue{ + Name: na.Name, + Ordinal: i + 1, + Value: na.Value, + } + } + return that.QueryContext(nextContext(), list) +} + +// transaction is not support by now. +type transaction struct { + conn SQLConnection +} + +func (that *transaction) Commit() error { + return nil +} + +func (that *transaction) Rollback() error { + return nil +} + +type rowSet struct { + rs *QueryResult +} + +func (that *rowSet) Columns() []string { + return that.rs.GetColumnNames() +} + +func (that *rowSet) Close() error { + that.rs.Close() + return nil +} + +func (that *rowSet) Next(dest []driver.Value) error { + if !that.rs.HasNext() { + return io.EOF + } + row, err := that.rs.Next() + if nil != err { + release(row) + return err + } + defer row.Close() + + values, err := row.GetAsSlice() + if nil != err { + return err + } + for idx := range dest { + if len(values) <= idx { + break + } + dest[idx] = values[idx] + } + return nil +} + +type resultSet struct { + lastInsertId int64 + rowsAffected int64 +} + +func (that *resultSet) LastInsertId() (int64, error) { + return that.lastInsertId, nil +} + +func (that *resultSet) RowsAffected() (int64, error) { + return that.rowsAffected, nil +} + +// Release C resource +func release(f Finalizer) { + if nil != f { + f.Close() + } +} + +func nextContext() context.Context { + return context.Background() +} + +func closeQuiet(closer io.Closer) { + _ = closer.Close() +} + +func parse(v string, fn func(v uint64)) error { + if "" == v { + return nil + } + iv, err := strconv.ParseUint(v, 10, 64) + if nil != err { + return err + } + fn(iv) + return nil +} diff --git a/internal/thirdparty/go-ladybug/flat_tuple.go b/internal/thirdparty/go-ladybug/flat_tuple.go new file mode 100644 index 00000000..fdbfa44f --- /dev/null +++ b/internal/thirdparty/go-ladybug/flat_tuple.go @@ -0,0 +1,79 @@ +package lbug + +// #include "lbug.h" +// #include +import "C" +import "fmt" + +// FlatTuple represents a row in the result set of a query. +type FlatTuple struct { + cFlatTuple C.lbug_flat_tuple + queryResult *QueryResult + isClosed bool +} + +// Close closes the FlatTuple. Calling this method is optional. +// The FlatTuple will be closed automatically when it is garbage collected. +func (tuple *FlatTuple) Close() { + if tuple.isClosed { + return + } + C.lbug_flat_tuple_destroy(&tuple.cFlatTuple) + tuple.isClosed = true +} + +// GetAsString returns the string representation of the FlatTuple. +// The string representation contains the values of the tuple separated by vertical bars. +func (tuple *FlatTuple) GetAsString() string { + cString := C.lbug_flat_tuple_to_string(&tuple.cFlatTuple) + defer C.lbug_destroy_string(cString) + return C.GoString(cString) +} + +// GetAsSlice returns the values of the FlatTuple as a slice. +// The order of the values in the slice is the same as the order of the columns +// in the query result. +func (tuple *FlatTuple) GetAsSlice() ([]any, error) { + length := uint64(tuple.queryResult.GetNumberOfColumns()) + values := make([]any, 0, length) + var errors []error + for i := uint64(0); i < length; i++ { + value, err := tuple.GetValue(i) + if err != nil { + errors = append(errors, err) + } + values = append(values, value) + } + if len(errors) > 0 { + return values, fmt.Errorf("failed to get values: %v", errors) + } + return values, nil +} + +// GetAsMap returns the values of the FlatTuple as a map. +// The keys of the map are the column names in the query result. +func (tuple *FlatTuple) GetAsMap() (map[string]any, error) { + columnNames := tuple.queryResult.GetColumnNames() + values, err := tuple.GetAsSlice() + if err != nil { + if len(columnNames) != len(values) { + return nil, err + } + } + m := make(map[string]any) + for i, columnName := range columnNames { + m[columnName] = values[i] + } + return m, err +} + +// GetValue returns the value at the given index in the FlatTuple. +func (tuple *FlatTuple) GetValue(index uint64) (any, error) { + var cValue C.lbug_value + status := C.lbug_flat_tuple_get_value(&tuple.cFlatTuple, C.uint64_t(index), &cValue) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get value with status: %d", status) + } + defer C.lbug_value_destroy(&cValue) + return lbugValueToGoValue(cValue) +} diff --git a/internal/thirdparty/go-ladybug/go.mod b/internal/thirdparty/go-ladybug/go.mod new file mode 100644 index 00000000..25fffd8b --- /dev/null +++ b/internal/thirdparty/go-ladybug/go.mod @@ -0,0 +1,8 @@ +module github.com/LadybugDB/go-ladybug + +go 1.20 + +require ( + github.com/google/uuid v1.6.0 + github.com/shopspring/decimal v1.4.0 +) diff --git a/internal/thirdparty/go-ladybug/go.sum b/internal/thirdparty/go-ladybug/go.sum new file mode 100644 index 00000000..6ddaae58 --- /dev/null +++ b/internal/thirdparty/go-ladybug/go.sum @@ -0,0 +1,4 @@ +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= +github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= diff --git a/internal/thirdparty/go-ladybug/lbug.h b/internal/thirdparty/go-ladybug/lbug.h new file mode 100644 index 00000000..2705b209 --- /dev/null +++ b/internal/thirdparty/go-ladybug/lbug.h @@ -0,0 +1,1634 @@ +#pragma once +#include +#include +#include +#ifdef _WIN32 +#include +#endif + +/* Export header from common/api.h */ +// Helpers +#if defined _WIN32 || defined __CYGWIN__ +#define LBUG_HELPER_DLL_IMPORT __declspec(dllimport) +#define LBUG_HELPER_DLL_EXPORT __declspec(dllexport) +#define LBUG_HELPER_DLL_LOCAL +#define LBUG_HELPER_DEPRECATED __declspec(deprecated) +#else +#define LBUG_HELPER_DLL_IMPORT __attribute__((visibility("default"))) +#define LBUG_HELPER_DLL_EXPORT __attribute__((visibility("default"))) +#define LBUG_HELPER_DLL_LOCAL __attribute__((visibility("hidden"))) +#define LBUG_HELPER_DEPRECATED __attribute__((__deprecated__)) +#endif + +#ifdef LBUG_STATIC_DEFINE +#define LBUG_API +#define LBUG_NO_EXPORT +#else +#ifndef LBUG_API +#ifdef LBUG_EXPORTS +/* We are building this library */ +#define LBUG_API LBUG_HELPER_DLL_EXPORT +#else +/* We are using this library */ +#define LBUG_API LBUG_HELPER_DLL_IMPORT +#endif +#endif + +#endif + +#ifndef LBUG_DEPRECATED +#define LBUG_DEPRECATED LBUG_HELPER_DEPRECATED +#endif + +#ifndef LBUG_DEPRECATED_EXPORT +#define LBUG_DEPRECATED_EXPORT LBUG_API LBUG_DEPRECATED +#endif +/* end export header */ + +// The Arrow C data interface. +// https://arrow.apache.org/docs/format/CDataInterface.html + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus +#define LBUG_C_API extern "C" LBUG_API +#else +#define LBUG_C_API LBUG_API +#endif + +/** + * @brief Stores runtime configuration for creating or opening a Database + */ +typedef struct { + // bufferPoolSize Max size of the buffer pool in bytes. + // The larger the buffer pool, the more data from the database files is kept in memory, + // reducing the amount of File I/O + uint64_t buffer_pool_size; + // The maximum number of threads to use during query execution + uint64_t max_num_threads; + // Whether or not to compress data on-disk for supported types + bool enable_compression; + // If true, open the database in read-only mode. No write transaction is allowed on the Database + // object. If false, open the database read-write. + bool read_only; + // The maximum size of the database in bytes. Note that this is introduced temporarily for now + // to get around with the default 8TB mmap address space limit under some environment. This + // will be removed once we implemente a better solution later. The value is default to 1 << 43 + // (8TB) under 64-bit environment and 1GB under 32-bit one (see `DEFAULT_VM_REGION_MAX_SIZE`). + uint64_t max_db_size; + // If true, the database will automatically checkpoint when the size of + // the WAL file exceeds the checkpoint threshold. + bool auto_checkpoint; + // The threshold of the WAL file size in bytes. When the size of the + // WAL file exceeds this threshold, the database will checkpoint if auto_checkpoint is true. + uint64_t checkpoint_threshold; + // If true, any WAL replay failure when loading the database will raise an error. + bool throw_on_wal_replay_failure; + // If true, checksums are enabled for WAL and storage pages. + bool enable_checksums; + // If true, multiple concurrent write transactions are allowed. + bool enable_multi_writes; + +#if defined(__APPLE__) + // The thread quality of service (QoS) for the worker threads. + // This works for Swift bindings on Apple platforms only. + uint32_t thread_qos; +#endif +} lbug_system_config; + +/** + * @brief lbug_database manages all database components. + */ +typedef struct { + void* _database; +} lbug_database; + +/** + * @brief lbug_connection is used to interact with a Database instance. Each connection is + * thread-safe. Multiple connections can connect to the same Database instance in a multi-threaded + * environment. + */ +typedef struct { + void* _connection; +} lbug_connection; + +/** + * @brief lbug_prepared_statement is a parameterized query which can avoid planning the same query + * for repeated execution. + */ +typedef struct { + void* _prepared_statement; + void* _bound_values; +} lbug_prepared_statement; + +/** + * @brief lbug_query_result stores the result of a query. + */ +typedef struct { + void* _query_result; + bool _is_owned_by_cpp; +} lbug_query_result; + +/** + * @brief lbug_flat_tuple stores a vector of values. + */ +typedef struct { + void* _flat_tuple; + bool _is_owned_by_cpp; +} lbug_flat_tuple; + +/** + * @brief lbug_logical_type is the lbug internal representation of data types. + */ +typedef struct { + void* _data_type; +} lbug_logical_type; + +/** + * @brief lbug_value is used to represent a value with any lbug internal dataType. + */ +typedef struct { + void* _value; + bool _is_owned_by_cpp; +} lbug_value; + +/** + * @brief lbug internal internal_id type which stores the table_id and offset of a node/rel. + */ +typedef struct { + uint64_t table_id; + uint64_t offset; +} lbug_internal_id_t; + +/** + * @brief lbug internal date type which stores the number of days since 1970-01-01 00:00:00 UTC. + */ +typedef struct { + // Days since 1970-01-01 00:00:00 UTC. + int32_t days; +} lbug_date_t; + +/** + * @brief lbug internal timestamp_ns type which stores the number of nanoseconds since 1970-01-01 + * 00:00:00 UTC. + */ +typedef struct { + // Nanoseconds since 1970-01-01 00:00:00 UTC. + int64_t value; +} lbug_timestamp_ns_t; + +/** + * @brief lbug internal timestamp_ms type which stores the number of milliseconds since 1970-01-01 + * 00:00:00 UTC. + */ +typedef struct { + // Milliseconds since 1970-01-01 00:00:00 UTC. + int64_t value; +} lbug_timestamp_ms_t; + +/** + * @brief lbug internal timestamp_sec_t type which stores the number of seconds since 1970-01-01 + * 00:00:00 UTC. + */ +typedef struct { + // Seconds since 1970-01-01 00:00:00 UTC. + int64_t value; +} lbug_timestamp_sec_t; + +/** + * @brief lbug internal timestamp_tz type which stores the number of microseconds since 1970-01-01 + * with timezone 00:00:00 UTC. + */ +typedef struct { + // Microseconds since 1970-01-01 00:00:00 UTC. + int64_t value; +} lbug_timestamp_tz_t; + +/** + * @brief lbug internal timestamp type which stores the number of microseconds since 1970-01-01 + * 00:00:00 UTC. + */ +typedef struct { + // Microseconds since 1970-01-01 00:00:00 UTC. + int64_t value; +} lbug_timestamp_t; + +/** + * @brief lbug internal interval type which stores the months, days and microseconds. + */ +typedef struct { + int32_t months; + int32_t days; + int64_t micros; +} lbug_interval_t; + +/** + * @brief lbug_query_summary stores the execution time, plan, compiling time and query options of a + * query. + */ +typedef struct { + void* _query_summary; +} lbug_query_summary; + +typedef struct { + uint64_t low; + int64_t high; +} lbug_int128_t; + +/** + * @brief enum class for lbug internal dataTypes. + */ +typedef enum { + LBUG_ANY = 0, + LBUG_NODE = 10, + LBUG_REL = 11, + LBUG_RECURSIVE_REL = 12, + // SERIAL is a special data type that is used to represent a sequence of INT64 values that are + // incremented by 1 starting from 0. + LBUG_SERIAL = 13, + // fixed size types + LBUG_BOOL = 22, + LBUG_INT64 = 23, + LBUG_INT32 = 24, + LBUG_INT16 = 25, + LBUG_INT8 = 26, + LBUG_UINT64 = 27, + LBUG_UINT32 = 28, + LBUG_UINT16 = 29, + LBUG_UINT8 = 30, + LBUG_INT128 = 31, + LBUG_DOUBLE = 32, + LBUG_FLOAT = 33, + LBUG_DATE = 34, + LBUG_TIMESTAMP = 35, + LBUG_TIMESTAMP_SEC = 36, + LBUG_TIMESTAMP_MS = 37, + LBUG_TIMESTAMP_NS = 38, + LBUG_TIMESTAMP_TZ = 39, + LBUG_INTERVAL = 40, + LBUG_DECIMAL = 41, + LBUG_INTERNAL_ID = 42, + // variable size types + LBUG_STRING = 50, + LBUG_BLOB = 51, + LBUG_LIST = 52, + LBUG_ARRAY = 53, + LBUG_STRUCT = 54, + LBUG_MAP = 55, + LBUG_UNION = 56, + LBUG_POINTER = 58, + LBUG_UUID = 59 +} lbug_data_type_id; + +/** + * @brief enum class for lbug function return state. + */ +typedef enum { LbugSuccess = 0, LbugError = 1 } lbug_state; + +// Database +/** + * @brief Allocates memory and creates a lbug database instance at database_path with + * bufferPoolSize=buffer_pool_size. Caller is responsible for calling lbug_database_destroy() to + * release the allocated memory. + * @param database_path The path to the database. + * @param system_config The runtime configuration for creating or opening the database. + * @param[out] out_database The output parameter that will hold the database instance. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_database_init(const char* database_path, + lbug_system_config system_config, lbug_database* out_database); +/** + * @brief Destroys the lbug database instance and frees the allocated memory. + * @param database The database instance to destroy. + */ +LBUG_C_API void lbug_database_destroy(lbug_database* database); + +LBUG_C_API lbug_system_config lbug_default_system_config(); + +// Connection +/** + * @brief Allocates memory and creates a connection to the database. Caller is responsible for + * calling lbug_connection_destroy() to release the allocated memory. + * @param database The database instance to connect to. + * @param[out] out_connection The output parameter that will hold the connection instance. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_connection_init(lbug_database* database, + lbug_connection* out_connection); +/** + * @brief Destroys the connection instance and frees the allocated memory. + * @param connection The connection instance to destroy. + */ +LBUG_C_API void lbug_connection_destroy(lbug_connection* connection); +/** + * @brief Sets the maximum number of threads to use for executing queries. + * @param connection The connection instance to set max number of threads for execution. + * @param num_threads The maximum number of threads to use for executing queries. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_connection_set_max_num_thread_for_exec(lbug_connection* connection, + uint64_t num_threads); + +/** + * @brief Returns the maximum number of threads of the connection to use for executing queries. + * @param connection The connection instance to return max number of threads for execution. + * @param[out] out_result The output parameter that will hold the maximum number of threads to use + * for executing queries. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_connection_get_max_num_thread_for_exec(lbug_connection* connection, + uint64_t* out_result); +/** + * @brief Executes the given query and returns the result. + * @param connection The connection instance to execute the query. + * @param query The query to execute. + * @param[out] out_query_result The output parameter that will hold the result of the query. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_connection_query(lbug_connection* connection, const char* query, + lbug_query_result* out_query_result); +/** + * @brief Prepares the given query and returns the prepared statement. + * @param connection The connection instance to prepare the query. + * @param query The query to prepare. + * @param[out] out_prepared_statement The output parameter that will hold the prepared statement. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_connection_prepare(lbug_connection* connection, const char* query, + lbug_prepared_statement* out_prepared_statement); +/** + * @brief Executes the prepared_statement using connection. + * @param connection The connection instance to execute the prepared_statement. + * @param prepared_statement The prepared statement to execute. + * @param[out] out_query_result The output parameter that will hold the result of the query. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_connection_execute(lbug_connection* connection, + lbug_prepared_statement* prepared_statement, lbug_query_result* out_query_result); +/** + * @brief Interrupts the current query execution in the connection. + * @param connection The connection instance to interrupt. + */ +LBUG_C_API void lbug_connection_interrupt(lbug_connection* connection); +/** + * @brief Sets query timeout value in milliseconds for the connection. + * @param connection The connection instance to set query timeout value. + * @param timeout_in_ms The timeout value in milliseconds. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_connection_set_query_timeout(lbug_connection* connection, + uint64_t timeout_in_ms); + +// PreparedStatement +/** + * @brief Destroys the prepared statement instance and frees the allocated memory. + * @param prepared_statement The prepared statement instance to destroy. + */ +LBUG_C_API void lbug_prepared_statement_destroy(lbug_prepared_statement* prepared_statement); +/** + * @return the query is prepared successfully or not. + */ +LBUG_C_API bool lbug_prepared_statement_is_success(lbug_prepared_statement* prepared_statement); +/** + * @brief Returns the error message if the prepared statement is not prepared successfully. + * The caller is responsible for freeing the returned string with `lbug_destroy_string`. + * @param prepared_statement The prepared statement instance. + * @return the error message if the statement is not prepared successfully or null + * if the statement is prepared successfully. + */ +LBUG_C_API char* lbug_prepared_statement_get_error_message( + lbug_prepared_statement* prepared_statement); +/** + * @brief Binds the given boolean value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The boolean value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_bool(lbug_prepared_statement* prepared_statement, + const char* param_name, bool value); +/** + * @brief Binds the given int64_t value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The int64_t value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_int64( + lbug_prepared_statement* prepared_statement, const char* param_name, int64_t value); +/** + * @brief Binds the given int32_t value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The int32_t value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_int32( + lbug_prepared_statement* prepared_statement, const char* param_name, int32_t value); +/** + * @brief Binds the given int16_t value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The int16_t value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_int16( + lbug_prepared_statement* prepared_statement, const char* param_name, int16_t value); +/** + * @brief Binds the given int8_t value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The int8_t value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_int8(lbug_prepared_statement* prepared_statement, + const char* param_name, int8_t value); +/** + * @brief Binds the given uint64_t value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The uint64_t value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_uint64( + lbug_prepared_statement* prepared_statement, const char* param_name, uint64_t value); +/** + * @brief Binds the given uint32_t value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The uint32_t value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_uint32( + lbug_prepared_statement* prepared_statement, const char* param_name, uint32_t value); +/** + * @brief Binds the given uint16_t value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The uint16_t value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_uint16( + lbug_prepared_statement* prepared_statement, const char* param_name, uint16_t value); +/** + * @brief Binds the given int8_t value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The int8_t value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_uint8( + lbug_prepared_statement* prepared_statement, const char* param_name, uint8_t value); + +/** + * @brief Binds the given double value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The double value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_double( + lbug_prepared_statement* prepared_statement, const char* param_name, double value); +/** + * @brief Binds the given float value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The float value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_float( + lbug_prepared_statement* prepared_statement, const char* param_name, float value); +/** + * @brief Binds the given date value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The date value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_date(lbug_prepared_statement* prepared_statement, + const char* param_name, lbug_date_t value); +/** + * @brief Binds the given timestamp_ns value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The timestamp_ns value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_timestamp_ns( + lbug_prepared_statement* prepared_statement, const char* param_name, lbug_timestamp_ns_t value); +/** + * @brief Binds the given timestamp_sec value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The timestamp_sec value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_timestamp_sec( + lbug_prepared_statement* prepared_statement, const char* param_name, + lbug_timestamp_sec_t value); +/** + * @brief Binds the given timestamp_tz value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The timestamp_tz value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_timestamp_tz( + lbug_prepared_statement* prepared_statement, const char* param_name, lbug_timestamp_tz_t value); +/** + * @brief Binds the given timestamp_ms value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The timestamp_ms value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_timestamp_ms( + lbug_prepared_statement* prepared_statement, const char* param_name, lbug_timestamp_ms_t value); +/** + * @brief Binds the given timestamp value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The timestamp value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_timestamp( + lbug_prepared_statement* prepared_statement, const char* param_name, lbug_timestamp_t value); +/** + * @brief Binds the given interval value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The interval value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_interval( + lbug_prepared_statement* prepared_statement, const char* param_name, lbug_interval_t value); +/** + * @brief Binds the given string value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The string value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_string( + lbug_prepared_statement* prepared_statement, const char* param_name, const char* value); +/** + * @brief Binds the given lbug value to the given parameter name in the prepared statement. + * @param prepared_statement The prepared statement instance to bind the value. + * @param param_name The parameter name to bind the value. + * @param value The lbug value to bind. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_prepared_statement_bind_value( + lbug_prepared_statement* prepared_statement, const char* param_name, lbug_value* value); + +// QueryResult +/** + * @brief Destroys the given query result instance. + * @param query_result The query result instance to destroy. + */ +LBUG_C_API void lbug_query_result_destroy(lbug_query_result* query_result); +/** + * @brief Returns true if the query is executed successful, false otherwise. + * @param query_result The query result instance to check. + */ +LBUG_C_API bool lbug_query_result_is_success(lbug_query_result* query_result); +/** + * @brief Returns the error message if the query is failed. + * The caller is responsible for freeing the returned string with `lbug_destroy_string`. + * @param query_result The query result instance to check and return error message. + * @return The error message if the query has failed, or null if the query is successful. + */ +LBUG_C_API char* lbug_query_result_get_error_message(lbug_query_result* query_result); +/** + * @brief Returns the number of columns in the query result. + * @param query_result The query result instance to return. + */ +LBUG_C_API uint64_t lbug_query_result_get_num_columns(lbug_query_result* query_result); +/** + * @brief Returns the column name at the given index. + * @param query_result The query result instance to return. + * @param index The index of the column to return name. + * @param[out] out_column_name The output parameter that will hold the column name. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_query_result_get_column_name(lbug_query_result* query_result, + uint64_t index, char** out_column_name); +/** + * @brief Returns the data type of the column at the given index. + * @param query_result The query result instance to return. + * @param index The index of the column to return data type. + * @param[out] out_column_data_type The output parameter that will hold the column data type. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_query_result_get_column_data_type(lbug_query_result* query_result, + uint64_t index, lbug_logical_type* out_column_data_type); +/** + * @brief Returns the number of tuples in the query result. + * @param query_result The query result instance to return. + */ +LBUG_C_API uint64_t lbug_query_result_get_num_tuples(lbug_query_result* query_result); +/** + * @brief Returns the query summary of the query result. + * @param query_result The query result instance to return. + * @param[out] out_query_summary The output parameter that will hold the query summary. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_query_result_get_query_summary(lbug_query_result* query_result, + lbug_query_summary* out_query_summary); +/** + * @brief Returns true if we have not consumed all tuples in the query result, false otherwise. + * @param query_result The query result instance to check. + */ +LBUG_C_API bool lbug_query_result_has_next(lbug_query_result* query_result); +/** + * @brief Returns the next tuple in the query result. Throws an exception if there is no more tuple. + * Note that to reduce resource allocation, all calls to lbug_query_result_get_next() reuse the same + * FlatTuple object. Since its contents will be overwritten, please complete processing a FlatTuple + * or make a copy of its data before calling lbug_query_result_get_next() again. + * @param query_result The query result instance to return. + * @param[out] out_flat_tuple The output parameter that will hold the next tuple. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_query_result_get_next(lbug_query_result* query_result, + lbug_flat_tuple* out_flat_tuple); +/** + * @brief Returns true if we have not consumed all query results, false otherwise. Use this function + * for loop results of multiple query statements + * @param query_result The query result instance to check. + */ +LBUG_C_API bool lbug_query_result_has_next_query_result(lbug_query_result* query_result); +/** + * @brief Returns the next query result. Use this function to loop multiple query statements' + * results. + * @param query_result The query result instance to return. + * @param[out] out_next_query_result The output parameter that will hold the next query result. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_query_result_get_next_query_result(lbug_query_result* query_result, + lbug_query_result* out_next_query_result); + +/** + * @brief Returns the query result as a string. + * @param query_result The query result instance to return. + * @return The query result as a string. + */ +LBUG_C_API char* lbug_query_result_to_string(lbug_query_result* query_result); +/** + * @brief Resets the iterator of the query result to the beginning of the query result. + * @param query_result The query result instance to reset iterator. + */ +LBUG_C_API void lbug_query_result_reset_iterator(lbug_query_result* query_result); + +/** + * @brief Returns the query result's schema as ArrowSchema. + * @param query_result The query result instance to return. + * @param[out] out_schema The output parameter that will hold the datatypes of the columns as an + * arrow schema. + * @return The state indicating the success or failure of the operation. + * + * It is the caller's responsibility to call the release function to release the underlying data + */ +LBUG_C_API lbug_state lbug_query_result_get_arrow_schema(lbug_query_result* query_result, + struct ArrowSchema* out_schema); + +/** + * @brief Returns the next chunk of the query result as ArrowArray. + * @param query_result The query result instance to return. + * @param chunk_size The number of tuples to return in the chunk. + * @param[out] out_arrow_array The output parameter that will hold the arrow array representation of + * the query result. The arrow array internally stores an arrow struct with fields for each of the + * columns. + * @return The state indicating the success or failure of the operation. + * + * It is the caller's responsibility to call the release function to release the underlying data + */ +LBUG_C_API lbug_state lbug_query_result_get_next_arrow_chunk(lbug_query_result* query_result, + int64_t chunk_size, struct ArrowArray* out_arrow_array); + +// FlatTuple +/** + * @brief Destroys the given flat tuple instance. + * @param flat_tuple The flat tuple instance to destroy. + */ +LBUG_C_API void lbug_flat_tuple_destroy(lbug_flat_tuple* flat_tuple); +/** + * @brief Returns the value at index of the flat tuple. + * @param flat_tuple The flat tuple instance to return. + * @param index The index of the value to return. + * @param[out] out_value The output parameter that will hold the value at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_flat_tuple_get_value(lbug_flat_tuple* flat_tuple, uint64_t index, + lbug_value* out_value); +/** + * @brief Converts the flat tuple to a string. + * @param flat_tuple The flat tuple instance to convert. + * @return The flat tuple as a string. + */ +LBUG_C_API char* lbug_flat_tuple_to_string(lbug_flat_tuple* flat_tuple); + +// DataType +// TODO(Chang): Refactor the datatype constructor to follow the cpp way of creating dataTypes. +/** + * @brief Creates a data type instance with the given id, childType and num_elements_in_array. + * Caller is responsible for destroying the returned data type instance. + * @param id The enum type id of the datatype to create. + * @param child_type The child type of the datatype to create(only used for nested dataTypes). + * @param num_elements_in_array The number of elements in the array(only used for ARRAY). + * @param[out] out_type The output parameter that will hold the data type instance. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API void lbug_data_type_create(lbug_data_type_id id, lbug_logical_type* child_type, + uint64_t num_elements_in_array, lbug_logical_type* out_type); +/** + * @brief Creates a new data type instance by cloning the given data type instance. + * @param data_type The data type instance to clone. + * @param[out] out_type The output parameter that will hold the cloned data type instance. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API void lbug_data_type_clone(lbug_logical_type* data_type, lbug_logical_type* out_type); +/** + * @brief Destroys the given data type instance. + * @param data_type The data type instance to destroy. + */ +LBUG_C_API void lbug_data_type_destroy(lbug_logical_type* data_type); +/** + * @brief Returns true if the given data type is equal to the other data type, false otherwise. + * @param data_type1 The first data type instance to compare. + * @param data_type2 The second data type instance to compare. + */ +LBUG_C_API bool lbug_data_type_equals(lbug_logical_type* data_type1, lbug_logical_type* data_type2); +/** + * @brief Returns the enum type id of the given data type. + * @param data_type The data type instance to return. + */ +LBUG_C_API lbug_data_type_id lbug_data_type_get_id(lbug_logical_type* data_type); +/** + * @brief Returns the child type of the given ARRAY or LIST data type. + * @param data_type The ARRAY or LIST data type instance. + * @param[out] out_result The output parameter that will hold the child type. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_data_type_get_child_type(lbug_logical_type* data_type, + lbug_logical_type* out_result); +/** + * @brief Returns the number of elements for array. + * @param data_type The data type instance to return. + * @param[out] out_result The output parameter that will hold the number of elements in the array. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_data_type_get_num_elements_in_array(lbug_logical_type* data_type, + uint64_t* out_result); + +// Value +/** + * @brief Creates a NULL value of ANY type. Caller is responsible for destroying the returned value. + */ +LBUG_C_API lbug_value* lbug_value_create_null(); +/** + * @brief Creates a value of the given data type. Caller is responsible for destroying the + * returned value. + * @param data_type The data type of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_null_with_data_type(lbug_logical_type* data_type); +/** + * @brief Returns true if the given value is NULL, false otherwise. + * @param value The value instance to check. + */ +LBUG_C_API bool lbug_value_is_null(lbug_value* value); +/** + * @brief Sets the given value to NULL or not. + * @param value The value instance to set. + * @param is_null True if sets the value to NULL, false otherwise. + */ +LBUG_C_API void lbug_value_set_null(lbug_value* value, bool is_null); +/** + * @brief Creates a value of the given data type with default non-NULL value. Caller is responsible + * for destroying the returned value. + * @param data_type The data type of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_default(lbug_logical_type* data_type); +/** + * @brief Creates a value with boolean type and the given bool value. Caller is responsible for + * destroying the returned value. + * @param val_ The bool value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_bool(bool val_); +/** + * @brief Creates a value with int8 type and the given int8 value. Caller is responsible for + * destroying the returned value. + * @param val_ The int8 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_int8(int8_t val_); +/** + * @brief Creates a value with int16 type and the given int16 value. Caller is responsible for + * destroying the returned value. + * @param val_ The int16 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_int16(int16_t val_); +/** + * @brief Creates a value with int32 type and the given int32 value. Caller is responsible for + * destroying the returned value. + * @param val_ The int32 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_int32(int32_t val_); +/** + * @brief Creates a value with int64 type and the given int64 value. Caller is responsible for + * destroying the returned value. + * @param val_ The int64 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_int64(int64_t val_); +/** + * @brief Creates a value with uint8 type and the given uint8 value. Caller is responsible for + * destroying the returned value. + * @param val_ The uint8 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_uint8(uint8_t val_); +/** + * @brief Creates a value with uint16 type and the given uint16 value. Caller is responsible for + * destroying the returned value. + * @param val_ The uint16 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_uint16(uint16_t val_); +/** + * @brief Creates a value with uint32 type and the given uint32 value. Caller is responsible for + * destroying the returned value. + * @param val_ The uint32 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_uint32(uint32_t val_); +/** + * @brief Creates a value with uint64 type and the given uint64 value. Caller is responsible for + * destroying the returned value. + * @param val_ The uint64 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_uint64(uint64_t val_); +/** + * @brief Creates a value with int128 type and the given int128 value. Caller is responsible for + * destroying the returned value. + * @param val_ The int128 value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_int128(lbug_int128_t val_); +/** + * @brief Creates a value with float type and the given float value. Caller is responsible for + * destroying the returned value. + * @param val_ The float value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_float(float val_); +/** + * @brief Creates a value with double type and the given double value. Caller is responsible for + * destroying the returned value. + * @param val_ The double value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_double(double val_); +/** + * @brief Creates a value with decimal type and the given string representation. + * Caller is responsible for destroying the returned value. + * @param val_ The decimal value to create. + * @param precision The decimal precision. + * @param scale The decimal scale. + */ +LBUG_C_API lbug_value* lbug_value_create_decimal(const char* val_, uint32_t precision, + uint32_t scale); +/** + * @brief Creates a value with internal_id type and the given internal_id value. Caller is + * responsible for destroying the returned value. + * @param val_ The internal_id value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_internal_id(lbug_internal_id_t val_); +/** + * @brief Creates a value with date type and the given date value. Caller is responsible for + * destroying the returned value. + * @param val_ The date value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_date(lbug_date_t val_); +/** + * @brief Creates a value with timestamp_ns type and the given timestamp value. Caller is + * responsible for destroying the returned value. + * @param val_ The timestamp_ns value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_timestamp_ns(lbug_timestamp_ns_t val_); +/** + * @brief Creates a value with timestamp_ms type and the given timestamp value. Caller is + * responsible for destroying the returned value. + * @param val_ The timestamp_ms value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_timestamp_ms(lbug_timestamp_ms_t val_); +/** + * @brief Creates a value with timestamp_sec type and the given timestamp value. Caller is + * responsible for destroying the returned value. + * @param val_ The timestamp_sec value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_timestamp_sec(lbug_timestamp_sec_t val_); +/** + * @brief Creates a value with timestamp_tz type and the given timestamp value. Caller is + * responsible for destroying the returned value. + * @param val_ The timestamp_tz value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_timestamp_tz(lbug_timestamp_tz_t val_); +/** + * @brief Creates a value with timestamp type and the given timestamp value. Caller is responsible + * for destroying the returned value. + * @param val_ The timestamp value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_timestamp(lbug_timestamp_t val_); +/** + * @brief Creates a value with interval type and the given interval value. Caller is responsible + * for destroying the returned value. + * @param val_ The interval value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_interval(lbug_interval_t val_); +/** + * @brief Creates a value with string type and the given string value. Caller is responsible for + * destroying the returned value. + * @param val_ The string value of the value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_string(const char* val_); +/** + * @brief Creates a value with UUID type and the given string representation. + * Caller is responsible for destroying the returned value. + * @param val_ The UUID string value to create. + */ +LBUG_C_API lbug_value* lbug_value_create_uuid(const char* val_); +/** + * @brief Creates a list value with the given number of elements and the given elements. + * The caller needs to make sure that all elements have the same type. + * The elements are copied into the list value, so destroying the elements after creating the list + * value is safe. + * Caller is responsible for destroying the returned value. + * @param num_elements The number of elements in the list. + * @param elements The elements of the list. + * @param[out] out_value The output parameter that will hold a pointer to the created list value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_create_list(uint64_t num_elements, lbug_value** elements, + lbug_value** out_value); +/** + * @brief Creates a struct value with the given number of fields and the given field names and + * values. The caller needs to make sure that all field names are unique. + * The field names and values are copied into the struct value, so destroying the field names and + * values after creating the struct value is safe. + * Caller is responsible for destroying the returned value. + * @param num_fields The number of fields in the struct. + * @param field_names The field names of the struct. + * @param field_values The field values of the struct. + * @param[out] out_value The output parameter that will hold a pointer to the created struct value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_create_struct(uint64_t num_fields, const char** field_names, + lbug_value** field_values, lbug_value** out_value); +/** + * @brief Creates a map value with the given number of fields and the given keys and values. The + * caller needs to make sure that all keys are unique, and all keys and values have the same type. + * The keys and values are copied into the map value, so destroying the keys and values after + * creating the map value is safe. + * Caller is responsible for destroying the returned value. + * @param num_fields The number of fields in the map. + * @param keys The keys of the map. + * @param values The values of the map. + * @param[out] out_value The output parameter that will hold a pointer to the created map value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_create_map(uint64_t num_fields, lbug_value** keys, + lbug_value** values, lbug_value** out_value); +/** + * @brief Creates a new value based on the given value. Caller is responsible for destroying the + * returned value. + * @param value The value to create from. + */ +LBUG_C_API lbug_value* lbug_value_clone(lbug_value* value); +/** + * @brief Copies the other value to the value. + * @param value The value to copy to. + * @param other The value to copy from. + */ +LBUG_C_API void lbug_value_copy(lbug_value* value, lbug_value* other); +/** + * @brief Destroys the value. + * @param value The value to destroy. + */ +LBUG_C_API void lbug_value_destroy(lbug_value* value); +/** + * @brief Returns the number of elements per list of the given value. The value must be of type + * ARRAY. + * @param value The ARRAY value to get list size. + * @param[out] out_result The output parameter that will hold the number of elements per list. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_list_size(lbug_value* value, uint64_t* out_result); +/** + * @brief Returns the element at index of the given value. The value must be of type LIST. + * @param value The LIST value to return. + * @param index The index of the element to return. + * @param[out] out_value The output parameter that will hold the element at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_list_element(lbug_value* value, uint64_t index, + lbug_value* out_value); +/** + * @brief Returns the number of fields of the given struct value. The value must be of type STRUCT. + * @param value The STRUCT value to get number of fields. + * @param[out] out_result The output parameter that will hold the number of fields. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_struct_num_fields(lbug_value* value, uint64_t* out_result); +/** + * @brief Returns the field name at index of the given struct value. The value must be of physical + * type STRUCT (STRUCT, NODE, REL, RECURSIVE_REL, UNION). + * @param value The STRUCT value to get field name. + * @param index The index of the field name to return. + * @param[out] out_result The output parameter that will hold the field name at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_struct_field_name(lbug_value* value, uint64_t index, + char** out_result); +/** + * @brief Returns the field index for the given field name in the given struct value. + * @param value The STRUCT value to inspect. + * @param field_name The field name to look up. + * @param[out] out_result The output parameter that will hold the field index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_struct_field_index(lbug_value* value, const char* field_name, + uint64_t* out_result); +/** + * @brief Returns the field value at index of the given struct value. The value must be of physical + * type STRUCT (STRUCT, NODE, REL, RECURSIVE_REL, UNION). + * @param value The STRUCT value to get field value. + * @param index The index of the field value to return. + * @param[out] out_value The output parameter that will hold the field value at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_struct_field_value(lbug_value* value, uint64_t index, + lbug_value* out_value); + +/** + * @brief Returns the size of the given map value. The value must be of type MAP. + * @param value The MAP value to get size. + * @param[out] out_result The output parameter that will hold the size of the map. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_map_size(lbug_value* value, uint64_t* out_result); +/** + * @brief Returns the key at index of the given map value. The value must be of physical + * type MAP. + * @param value The MAP value to get key. + * @param index The index of the field name to return. + * @param[out] out_key The output parameter that will hold the key at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_map_key(lbug_value* value, uint64_t index, + lbug_value* out_key); +/** + * @brief Returns the field value at index of the given map value. The value must be of physical + * type MAP. + * @param value The MAP value to get field value. + * @param index The index of the field value to return. + * @param[out] out_value The output parameter that will hold the field value at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_map_value(lbug_value* value, uint64_t index, + lbug_value* out_value); +/** + * @brief Returns the list of nodes for recursive rel value. The value must be of type + * RECURSIVE_REL. + * @param value The RECURSIVE_REL value to return. + * @param[out] out_value The output parameter that will hold the list of nodes. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_recursive_rel_node_list(lbug_value* value, + lbug_value* out_value); + +/** + * @brief Returns the list of rels for recursive rel value. The value must be of type RECURSIVE_REL. + * @param value The RECURSIVE_REL value to return. + * @param[out] out_value The output parameter that will hold the list of rels. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_recursive_rel_rel_list(lbug_value* value, + lbug_value* out_value); +/** + * @brief Returns internal type of the given value. + * @param value The value to return. + * @param[out] out_type The output parameter that will hold the internal type of the value. + */ +LBUG_C_API void lbug_value_get_data_type(lbug_value* value, lbug_logical_type* out_type); +/** + * @brief Returns the boolean value of the given value. The value must be of type BOOL. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the boolean value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_bool(lbug_value* value, bool* out_result); +/** + * @brief Returns the int8 value of the given value. The value must be of type INT8. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the int8 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_int8(lbug_value* value, int8_t* out_result); +/** + * @brief Returns the int16 value of the given value. The value must be of type INT16. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the int16 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_int16(lbug_value* value, int16_t* out_result); +/** + * @brief Returns the int32 value of the given value. The value must be of type INT32. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the int32 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_int32(lbug_value* value, int32_t* out_result); +/** + * @brief Returns the int64 value of the given value. The value must be of type INT64 or SERIAL. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the int64 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_int64(lbug_value* value, int64_t* out_result); +/** + * @brief Returns the uint8 value of the given value. The value must be of type UINT8. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the uint8 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_uint8(lbug_value* value, uint8_t* out_result); +/** + * @brief Returns the uint16 value of the given value. The value must be of type UINT16. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the uint16 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_uint16(lbug_value* value, uint16_t* out_result); +/** + * @brief Returns the uint32 value of the given value. The value must be of type UINT32. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the uint32 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_uint32(lbug_value* value, uint32_t* out_result); +/** + * @brief Returns the uint64 value of the given value. The value must be of type UINT64. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the uint64 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_uint64(lbug_value* value, uint64_t* out_result); +/** + * @brief Returns the int128 value of the given value. The value must be of type INT128. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the int128 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_int128(lbug_value* value, lbug_int128_t* out_result); +/** + * @brief convert a string to int128 value. + * @param str The string to convert. + * @param[out] out_result The output parameter that will hold the int128 value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_int128_t_from_string(const char* str, lbug_int128_t* out_result); +/** + * @brief convert int128 to corresponding string. + * @param val The int128 value to convert. + * @param[out] out_result The output parameter that will hold the string value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_int128_t_to_string(lbug_int128_t val, char** out_result); +/** + * @brief Returns the float value of the given value. The value must be of type FLOAT. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the float value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_float(lbug_value* value, float* out_result); +/** + * @brief Returns the double value of the given value. The value must be of type DOUBLE. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the double value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_double(lbug_value* value, double* out_result); +/** + * @brief Returns the internal id value of the given value. The value must be of type INTERNAL_ID. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the internal id value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_internal_id(lbug_value* value, lbug_internal_id_t* out_result); +/** + * @brief Returns the date value of the given value. The value must be of type DATE. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the date value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_date(lbug_value* value, lbug_date_t* out_result); +/** + * @brief Returns the timestamp value of the given value. The value must be of type TIMESTAMP. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the timestamp value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_timestamp(lbug_value* value, lbug_timestamp_t* out_result); +/** + * @brief Returns the timestamp_ns value of the given value. The value must be of type TIMESTAMP_NS. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the timestamp_ns value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_timestamp_ns(lbug_value* value, + lbug_timestamp_ns_t* out_result); +/** + * @brief Returns the timestamp_ms value of the given value. The value must be of type TIMESTAMP_MS. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the timestamp_ms value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_timestamp_ms(lbug_value* value, + lbug_timestamp_ms_t* out_result); +/** + * @brief Returns the timestamp_sec value of the given value. The value must be of type + * TIMESTAMP_SEC. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the timestamp_sec value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_timestamp_sec(lbug_value* value, + lbug_timestamp_sec_t* out_result); +/** + * @brief Returns the timestamp_tz value of the given value. The value must be of type TIMESTAMP_TZ. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the timestamp_tz value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_timestamp_tz(lbug_value* value, + lbug_timestamp_tz_t* out_result); +/** + * @brief Returns the interval value of the given value. The value must be of type INTERVAL. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the interval value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_interval(lbug_value* value, lbug_interval_t* out_result); +/** + * @brief Returns the decimal value of the given value as a string. The value must be of type + * DECIMAL. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the decimal value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_decimal_as_string(lbug_value* value, char** out_result); +/** + * @brief Returns the string value of the given value. The value must be of type STRING. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the string value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_string(lbug_value* value, char** out_result); +/** + * @brief Returns the blob value of the given value. The value must be of type BLOB. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the blob value. + * @param[out] out_length The output parameter that will hold the length of the blob. + * @return The state indicating the success or failure of the operation. + * @note The caller is responsible for freeing the returned memory using `lbug_destroy_blob`. + */ +LBUG_C_API lbug_state lbug_value_get_blob(lbug_value* value, uint8_t** out_result, + uint64_t* out_length); +/** + * @brief Returns the uuid value of the given value. + * to a string. The value must be of type UUID. + * @param value The value to return. + * @param[out] out_result The output parameter that will hold the uuid value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_value_get_uuid(lbug_value* value, char** out_result); +/** + * @brief Converts the given value to string. + * @param value The value to convert. + * @return The value as a string. + */ +LBUG_C_API char* lbug_value_to_string(lbug_value* value); +/** + * @brief Returns the internal id value of the given node value as a lbug value. + * @param node_val The node value to return. + * @param[out] out_value The output parameter that will hold the internal id value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_node_val_get_id_val(lbug_value* node_val, lbug_value* out_value); +/** + * @brief Returns the label value of the given node value as a label value. + * @param node_val The node value to return. + * @param[out] out_value The output parameter that will hold the label value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_node_val_get_label_val(lbug_value* node_val, lbug_value* out_value); +/** + * @brief Returns the number of properties of the given node value. + * @param node_val The node value to return. + * @param[out] out_value The output parameter that will hold the number of properties. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_node_val_get_property_size(lbug_value* node_val, uint64_t* out_value); +/** + * @brief Returns the property name of the given node value at the given index. + * @param node_val The node value to return. + * @param index The index of the property. + * @param[out] out_result The output parameter that will hold the property name at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_node_val_get_property_name_at(lbug_value* node_val, uint64_t index, + char** out_result); +/** + * @brief Returns the property value of the given node value at the given index. + * @param node_val The node value to return. + * @param index The index of the property. + * @param[out] out_value The output parameter that will hold the property value at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_node_val_get_property_value_at(lbug_value* node_val, uint64_t index, + lbug_value* out_value); +/** + * @brief Converts the given node value to string. + * @param node_val The node value to convert. + * @param[out] out_result The output parameter that will hold the node value as a string. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_node_val_to_string(lbug_value* node_val, char** out_result); +/** + * @brief Returns the internal id value of the rel value as a lbug value. + * @param rel_val The rel value to return. + * @param[out] out_value The output parameter that will hold the internal id value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_rel_val_get_id_val(lbug_value* rel_val, lbug_value* out_value); +/** + * @brief Returns the internal id value of the source node of the given rel value as a lbug value. + * @param rel_val The rel value to return. + * @param[out] out_value The output parameter that will hold the internal id value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_rel_val_get_src_id_val(lbug_value* rel_val, lbug_value* out_value); +/** + * @brief Returns the internal id value of the destination node of the given rel value as a lbug + * value. + * @param rel_val The rel value to return. + * @param[out] out_value The output parameter that will hold the internal id value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_rel_val_get_dst_id_val(lbug_value* rel_val, lbug_value* out_value); +/** + * @brief Returns the label value of the given rel value. + * @param rel_val The rel value to return. + * @param[out] out_value The output parameter that will hold the label value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_rel_val_get_label_val(lbug_value* rel_val, lbug_value* out_value); +/** + * @brief Returns the number of properties of the given rel value. + * @param rel_val The rel value to return. + * @param[out] out_value The output parameter that will hold the number of properties. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_rel_val_get_property_size(lbug_value* rel_val, uint64_t* out_value); +/** + * @brief Returns the property name of the given rel value at the given index. + * @param rel_val The rel value to return. + * @param index The index of the property. + * @param[out] out_result The output parameter that will hold the property name at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_rel_val_get_property_name_at(lbug_value* rel_val, uint64_t index, + char** out_result); +/** + * @brief Returns the property of the given rel value at the given index as lbug value. + * @param rel_val The rel value to return. + * @param index The index of the property. + * @param[out] out_value The output parameter that will hold the property value at index. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_rel_val_get_property_value_at(lbug_value* rel_val, uint64_t index, + lbug_value* out_value); +/** + * @brief Converts the given rel value to string. + * @param rel_val The rel value to convert. + * @param[out] out_result The output parameter that will hold the rel value as a string. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_rel_val_to_string(lbug_value* rel_val, char** out_result); +/** + * @brief Destroys any string created by the Lbug C API, including both the error message and the + * values returned by the API functions. This function is provided to avoid the inconsistency + * between the memory allocation and deallocation across different libraries and is preferred over + * using the standard C free function. + * @param str The string to destroy. + */ +LBUG_C_API void lbug_destroy_string(char* str); +/** + * @brief Destroys any blob created by the Lbug C API. This function is provided to avoid the + * inconsistency between the memory allocation and deallocation across different libraries and + * is preferred over using the standard C free function. + * @param blob The blob to destroy. + */ +LBUG_C_API void lbug_destroy_blob(uint8_t* blob); + +// QuerySummary +/** + * @brief Destroys the given query summary. + * @param query_summary The query summary to destroy. + */ +LBUG_C_API void lbug_query_summary_destroy(lbug_query_summary* query_summary); +/** + * @brief Returns the compilation time of the given query summary in milliseconds. + * @param query_summary The query summary to get compilation time. + */ +LBUG_C_API double lbug_query_summary_get_compiling_time(lbug_query_summary* query_summary); +/** + * @brief Returns the execution time of the given query summary in milliseconds. + * @param query_summary The query summary to get execution time. + */ +LBUG_C_API double lbug_query_summary_get_execution_time(lbug_query_summary* query_summary); + +// Utility functions +/** + * @brief Convert timestamp_ns to corresponding tm struct. + * @param timestamp The timestamp_ns value to convert. + * @param[out] out_result The output parameter that will hold the tm struct. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_ns_to_tm(lbug_timestamp_ns_t timestamp, struct tm* out_result); +/** + * @brief Convert timestamp_ms to corresponding tm struct. + * @param timestamp The timestamp_ms value to convert. + * @param[out] out_result The output parameter that will hold the tm struct. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_ms_to_tm(lbug_timestamp_ms_t timestamp, struct tm* out_result); +/** + * @brief Convert timestamp_sec to corresponding tm struct. + * @param timestamp The timestamp_sec value to convert. + * @param[out] out_result The output parameter that will hold the tm struct. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_sec_to_tm(lbug_timestamp_sec_t timestamp, + struct tm* out_result); +/** + * @brief Convert timestamp_tz to corresponding tm struct. + * @param timestamp The timestamp_tz value to convert. + * @param[out] out_result The output parameter that will hold the tm struct. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_tz_to_tm(lbug_timestamp_tz_t timestamp, struct tm* out_result); +/** + * @brief Convert timestamp to corresponding tm struct. + * @param timestamp The timestamp value to convert. + * @param[out] out_result The output parameter that will hold the tm struct. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_to_tm(lbug_timestamp_t timestamp, struct tm* out_result); +/** + * @brief Convert tm struct to timestamp_ns value. + * @param tm The tm struct to convert. + * @param[out] out_result The output parameter that will hold the timestamp_ns value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_ns_from_tm(struct tm tm, lbug_timestamp_ns_t* out_result); +/** + * @brief Convert tm struct to timestamp_ms value. + * @param tm The tm struct to convert. + * @param[out] out_result The output parameter that will hold the timestamp_ms value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_ms_from_tm(struct tm tm, lbug_timestamp_ms_t* out_result); +/** + * @brief Convert tm struct to timestamp_sec value. + * @param tm The tm struct to convert. + * @param[out] out_result The output parameter that will hold the timestamp_sec value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_sec_from_tm(struct tm tm, lbug_timestamp_sec_t* out_result); +/** + * @brief Convert tm struct to timestamp_tz value. + * @param tm The tm struct to convert. + * @param[out] out_result The output parameter that will hold the timestamp_tz value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_tz_from_tm(struct tm tm, lbug_timestamp_tz_t* out_result); +/** + * @brief Convert timestamp_ns to corresponding string. + * @param timestamp The timestamp_ns value to convert. + * @param[out] out_result The output parameter that will hold the string value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_timestamp_from_tm(struct tm tm, lbug_timestamp_t* out_result); +/** + * @brief Convert date to corresponding string. + * @param date The date value to convert. + * @param[out] out_result The output parameter that will hold the string value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_date_to_string(lbug_date_t date, char** out_result); +/** + * @brief Convert a string to date value. + * @param str The string to convert. + * @param[out] out_result The output parameter that will hold the date value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_date_from_string(const char* str, lbug_date_t* out_result); +/** + * @brief Convert date to corresponding tm struct. + * @param date The date value to convert. + * @param[out] out_result The output parameter that will hold the tm struct. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_date_to_tm(lbug_date_t date, struct tm* out_result); +/** + * @brief Convert tm struct to date value. + * @param tm The tm struct to convert. + * @param[out] out_result The output parameter that will hold the date value. + * @return The state indicating the success or failure of the operation. + */ +LBUG_C_API lbug_state lbug_date_from_tm(struct tm tm, lbug_date_t* out_result); +/** + * @brief Convert interval to corresponding difftime value in seconds. + * @param interval The interval value to convert. + * @param[out] out_result The output parameter that will hold the difftime value. + */ +LBUG_C_API void lbug_interval_to_difftime(lbug_interval_t interval, double* out_result); +/** + * @brief Convert difftime value in seconds to interval. + * @param difftime The difftime value to convert. + * @param[out] out_result The output parameter that will hold the interval value. + */ +LBUG_C_API void lbug_interval_from_difftime(double difftime, lbug_interval_t* out_result); + +// Version +/** + * @brief Returns the version of the Lbug library. + */ +LBUG_C_API char* lbug_get_version(); + +/** + * @brief Returns the storage version of the Lbug library. + */ +LBUG_C_API uint64_t lbug_get_storage_version(); + +// Error handling +/** + * @brief Returns the last error message set by the C API, consuming it (subsequent calls return + * nullptr until another error occurs). The caller is responsible for freeing the returned string + * using lbug_destroy_string(). Returns nullptr if no error has been recorded. + */ +LBUG_C_API char* lbug_get_last_error(); +#undef LBUG_C_API diff --git a/internal/thirdparty/go-ladybug/prepared_statement.go b/internal/thirdparty/go-ladybug/prepared_statement.go new file mode 100644 index 00000000..37748853 --- /dev/null +++ b/internal/thirdparty/go-ladybug/prepared_statement.go @@ -0,0 +1,24 @@ +package lbug + +// #include "lbug.h" +// #include +import "C" + +// PreparedStatement represents a prepared statement in Lbug, which can be +// used to execute a query with parameters. +// PreparedStatement is returned by the `Prepare` method of Connection. +type PreparedStatement struct { + cPreparedStatement C.lbug_prepared_statement + connection *Connection + isClosed bool +} + +// Close closes the PreparedStatement. Calling this method is optional. +// The PreparedStatement will be closed automatically when it is garbage collected. +func (stmt *PreparedStatement) Close() { + if stmt.isClosed { + return + } + C.lbug_prepared_statement_destroy(&stmt.cPreparedStatement) + stmt.isClosed = true +} diff --git a/internal/thirdparty/go-ladybug/query_result.go b/internal/thirdparty/go-ladybug/query_result.go new file mode 100644 index 00000000..2943c9a0 --- /dev/null +++ b/internal/thirdparty/go-ladybug/query_result.go @@ -0,0 +1,131 @@ +package lbug + +// #include "lbug.h" +// #include +import "C" + +import ( + "fmt" + "runtime" + "unsafe" +) + +// QueryResult represents the result of a query, which can be used to iterate +// over the result set. +// QueryResult is returned by the `Query` and `Execute` methods of Connection. +type QueryResult struct { + cQueryResult C.lbug_query_result + connection *Connection + isClosed bool + columnNames []string +} + +// ToString returns the string representation of the QueryResult. +// The string representation contains the column names and the tuples in the +// result set. +func (queryResult *QueryResult) ToString() string { + cString := C.lbug_query_result_to_string(&queryResult.cQueryResult) + str := C.GoString(cString) + C.free(unsafe.Pointer(cString)) + return str +} + +// Close closes the QueryResult. Calling this method is optional. +// The QueryResult will be closed automatically when it is garbage collected. +func (queryResult *QueryResult) Close() { + if queryResult.isClosed { + return + } + C.lbug_query_result_destroy(&queryResult.cQueryResult) + queryResult.isClosed = true +} + +// ResetIterator resets the iterator of the QueryResult. After calling this method, the `Next` +// method can be called to iterate over the result set from the beginning. +func (queryResult *QueryResult) ResetIterator() { + C.lbug_query_result_reset_iterator(&queryResult.cQueryResult) +} + +// GetColumnNames returns the column names of the QueryResult as a slice of strings. +func (queryResult *QueryResult) GetColumnNames() []string { + if queryResult.columnNames != nil { + return queryResult.columnNames + } + numColumns := int64(C.lbug_query_result_get_num_columns(&queryResult.cQueryResult)) + columns := make([]string, 0, numColumns) + for i := int64(0); i < numColumns; i++ { + var outColumn *C.char + C.lbug_query_result_get_column_name(&queryResult.cQueryResult, C.uint64_t(i), &outColumn) + defer C.lbug_destroy_string(outColumn) + columns = append(columns, C.GoString(outColumn)) + } + queryResult.columnNames = columns + return columns +} + +// GetNumberOfColumns returns the number of columns in the QueryResult. +func (queryResult *QueryResult) GetNumberOfColumns() uint64 { + return uint64(C.lbug_query_result_get_num_columns(&queryResult.cQueryResult)) +} + +// GetNumberOfRows returns the number of rows in the QueryResult. +func (queryResult *QueryResult) GetNumberOfRows() uint64 { + if queryResult.columnNames != nil { + return uint64(len(queryResult.columnNames)) + } + return uint64(C.lbug_query_result_get_num_tuples(&queryResult.cQueryResult)) +} + +// HasNext returns true if there is at least one more tuple in the result set. +func (queryResult *QueryResult) HasNext() bool { + return bool(C.lbug_query_result_has_next(&queryResult.cQueryResult)) +} + +// Next returns the next tuple in the result set. +func (queryResult *QueryResult) Next() (*FlatTuple, error) { + tuple := &FlatTuple{} + runtime.SetFinalizer(tuple, func(tuple *FlatTuple) { + tuple.Close() + }) + tuple.queryResult = queryResult + status := C.lbug_query_result_get_next(&queryResult.cQueryResult, &tuple.cFlatTuple) + if status != C.LbugSuccess { + return tuple, fmt.Errorf("failed to get next tuple with status %d", status) + } + return tuple, nil +} + +// HasNextQueryResult returns true not all the query results is consumed when +// multiple query statements are executed. +func (queryResult *QueryResult) HasNextQueryResult() bool { + return bool(C.lbug_query_result_has_next_query_result(&queryResult.cQueryResult)) +} + +// NextQueryResult returns the next query result when multiple query statements are executed. +func (queryResult *QueryResult) NextQueryResult() (*QueryResult, error) { + nextQueryResult := &QueryResult{} + runtime.SetFinalizer(nextQueryResult, func(nextQueryResult *QueryResult) { + nextQueryResult.Close() + }) + status := C.lbug_query_result_get_next_query_result(&queryResult.cQueryResult, &nextQueryResult.cQueryResult) + if status != C.LbugSuccess { + return nextQueryResult, fmt.Errorf("failed to get next query result with status %d", status) + } + return nextQueryResult, nil +} + +// GetCompilingTime returns the compiling time of the query in milliseconds. +func (queryResult *QueryResult) GetCompilingTime() float64 { + var cQuerySummary C.lbug_query_summary + C.lbug_query_result_get_query_summary(&queryResult.cQueryResult, &cQuerySummary) + defer C.lbug_query_summary_destroy(&cQuerySummary) + return float64(C.lbug_query_summary_get_compiling_time(&cQuerySummary)) +} + +// GetExecutionTime returns the execution time of the query in milliseconds. +func (queryResult *QueryResult) GetExecutionTime() float64 { + var cQuerySummary C.lbug_query_summary + C.lbug_query_result_get_query_summary(&queryResult.cQueryResult, &cQuerySummary) + defer C.lbug_query_summary_destroy(&cQuerySummary) + return float64(C.lbug_query_summary_get_execution_time(&cQuerySummary)) +} diff --git a/internal/thirdparty/go-ladybug/time_helper.go b/internal/thirdparty/go-ladybug/time_helper.go new file mode 100644 index 00000000..9578d729 --- /dev/null +++ b/internal/thirdparty/go-ladybug/time_helper.go @@ -0,0 +1,63 @@ +package lbug + +// #include "lbug.h" +// #include +import "C" + +import ( + "time" +) + +// unixEpoch returns the Unix epoch time. +func unixEpoch() time.Time { + return time.Unix(0, 0) +} + +// lbugDateToTime converts a lbug_date_t to a time.Time in UTC. +func lbugDateToTime(cLbugDate C.lbug_date_t) time.Time { + diff := time.Duration(cLbugDate.days) * 24 * time.Hour + return unixEpoch().UTC().Add(diff) +} + +// timeToLbugTimestamp converts a time.Time to a lbug_timestamp_t. +func timeToLbugTimestamp(inputTime time.Time) C.lbug_timestamp_t { + nanoseconds := inputTime.UnixNano() + microseconds := nanoseconds / 1000 + cLbugTime := C.lbug_timestamp_t{} + cLbugTime.value = C.int64_t(microseconds) + return cLbugTime +} + +// timeToLbugTimestampNs converts a time.Time to a lbug_timestamp_ns_t. +func timeToLbugTimestampNs(inputTime time.Time) C.lbug_timestamp_ns_t { + nanoseconds := inputTime.UnixNano() + cLbugTime := C.lbug_timestamp_ns_t{} + cLbugTime.value = C.int64_t(nanoseconds) + return cLbugTime +} + +// timeHasNanoseconds returns true if the time.Time has non-zero nanoseconds. +func timeHasNanoseconds(inputTime time.Time) bool { + return inputTime.Nanosecond() != 0 +} + +// durationToLbugInterval converts a time.Duration to a lbug_interval_t. +func durationToLbugInterval(inputDuration time.Duration) C.lbug_interval_t { + microseconds := inputDuration.Microseconds() + + cLbugInterval := C.lbug_interval_t{} + cLbugInterval.micros = C.int64_t(microseconds) + return cLbugInterval +} + +// lbugIntervalToDuration converts a lbug_interval_t to a time.Duration. +func lbugIntervalToDuration(cLbugInterval C.lbug_interval_t) time.Duration { + days := cLbugInterval.days + months := cLbugInterval.months + microseconds := cLbugInterval.micros + totalDays := int64(days) + int64(months)*30 + totalSeconds := totalDays * 24 * 60 * 60 + totalMicroseconds := totalSeconds*1000000 + int64(microseconds) + totalNanoseconds := totalMicroseconds * 1000 + return time.Duration(totalNanoseconds) +} diff --git a/internal/thirdparty/go-ladybug/value_helper.go b/internal/thirdparty/go-ladybug/value_helper.go new file mode 100644 index 00000000..1ec5ff07 --- /dev/null +++ b/internal/thirdparty/go-ladybug/value_helper.go @@ -0,0 +1,641 @@ +package lbug + +// #include "lbug.h" +// #include +// #include +import "C" + +import ( + "fmt" + "reflect" + "sort" + "time" + "unsafe" + + "math/big" + + "github.com/google/uuid" + "github.com/shopspring/decimal" +) + +// InternalID represents the internal ID of a node or relationship in Lbug. +type InternalID struct { + TableID uint64 + Offset uint64 +} + +// Node represents a node retrieved from Lbug. +// A node has an ID, a label, and properties. +type Node struct { + ID InternalID + Label string + Properties map[string]any +} + +// Relationship represents a relationship retrieved from Lbug. +// A relationship has a source ID, a destination ID, a label, and properties. +type Relationship struct { + ID InternalID + SourceID InternalID + DestinationID InternalID + Label string + Properties map[string]any +} + +// RecursiveRelationship represents a recursive relationship retrieved from a +// path query in Lbug. A recursive relationship has a list of nodes and a list +// of relationships. +type RecursiveRelationship struct { + Nodes []Node + Relationships []Relationship +} + +// MapItem represents a key-value pair in a map in Lbug. It is used for both +// the query parameters and the query result. +type MapItem struct { + Key any + Value any +} + +// lbugNodeValueToGoValue converts a lbug_value representing a node to a Node +// struct in Go. +func lbugNodeValueToGoValue(lbugValue C.lbug_value) (Node, error) { + node := Node{} + node.Properties = make(map[string]any) + idValue := C.lbug_value{} + C.lbug_node_val_get_id_val(&lbugValue, &idValue) + nodeId, _ := lbugValueToGoValue(idValue) + node.ID = nodeId.(InternalID) + C.lbug_value_destroy(&idValue) + labelValue := C.lbug_value{} + C.lbug_node_val_get_label_val(&lbugValue, &labelValue) + nodeLabel, _ := lbugValueToGoValue(labelValue) + node.Label = nodeLabel.(string) + C.lbug_value_destroy(&labelValue) + var propertySize C.uint64_t + C.lbug_node_val_get_property_size(&lbugValue, &propertySize) + var currentKey *C.char + var currentVal C.lbug_value + var errors []error + for i := C.uint64_t(0); i < propertySize; i++ { + C.lbug_node_val_get_property_name_at(&lbugValue, i, ¤tKey) + keyString := C.GoString(currentKey) + C.lbug_destroy_string(currentKey) + C.lbug_node_val_get_property_value_at(&lbugValue, i, ¤tVal) + value, err := lbugValueToGoValue(currentVal) + if err != nil { + errors = append(errors, err) + } + node.Properties[keyString] = value + C.lbug_value_destroy(¤tVal) + } + if len(errors) > 0 { + return node, fmt.Errorf("failed to get values: %v", errors) + } + return node, nil +} + +// lbugRelValueToGoValue converts a lbug_value representing a relationship to a +// Relationship struct in Go. +func lbugRelValueToGoValue(lbugValue C.lbug_value) (Relationship, error) { + relation := Relationship{} + relation.Properties = make(map[string]any) + idValue := C.lbug_value{} + C.lbug_rel_val_get_id_val(&lbugValue, &idValue) + id, _ := lbugValueToGoValue(idValue) + relation.ID = id.(InternalID) + C.lbug_value_destroy(&idValue) + C.lbug_rel_val_get_src_id_val(&lbugValue, &idValue) + src, _ := lbugValueToGoValue(idValue) + relation.SourceID = src.(InternalID) + C.lbug_value_destroy(&idValue) + C.lbug_rel_val_get_dst_id_val(&lbugValue, &idValue) + dst, _ := lbugValueToGoValue(idValue) + relation.DestinationID = dst.(InternalID) + C.lbug_value_destroy(&idValue) + labelValue := C.lbug_value{} + C.lbug_rel_val_get_label_val(&lbugValue, &labelValue) + label, _ := lbugValueToGoValue(labelValue) + relation.Label = label.(string) + C.lbug_value_destroy(&labelValue) + var propertySize C.uint64_t + C.lbug_rel_val_get_property_size(&lbugValue, &propertySize) + var currentKey *C.char + var currentVal C.lbug_value + var errors []error + for i := C.uint64_t(0); i < propertySize; i++ { + C.lbug_rel_val_get_property_name_at(&lbugValue, i, ¤tKey) + keyString := C.GoString(currentKey) + C.lbug_destroy_string(currentKey) + C.lbug_rel_val_get_property_value_at(&lbugValue, i, ¤tVal) + value, err := lbugValueToGoValue(currentVal) + if err != nil { + errors = append(errors, err) + } + relation.Properties[keyString] = value + C.lbug_value_destroy(¤tVal) + } + if len(errors) > 0 { + return relation, fmt.Errorf("failed to get values: %v", errors) + } + return relation, nil +} + +// lbugRecursiveRelValueToGoValue converts a lbug_value representing a recursive +// relationship to a RecursiveRelationship struct in Go. +func lbugRecursiveRelValueToGoValue(lbugValue C.lbug_value) (RecursiveRelationship, error) { + var nodesVal C.lbug_value + var relsVal C.lbug_value + C.lbug_value_get_recursive_rel_node_list(&lbugValue, &nodesVal) + C.lbug_value_get_recursive_rel_rel_list(&lbugValue, &relsVal) + defer C.lbug_value_destroy(&nodesVal) + defer C.lbug_value_destroy(&relsVal) + nodes, _ := lbugListValueToGoValue(nodesVal) + rels, _ := lbugListValueToGoValue(relsVal) + recursiveRel := RecursiveRelationship{} + recursiveRel.Nodes = make([]Node, len(nodes)) + for i, n := range nodes { + recursiveRel.Nodes[i] = n.(Node) + } + relationships := make([]Relationship, len(rels)) + for i, r := range rels { + relationships[i] = r.(Relationship) + } + recursiveRel.Relationships = relationships + return recursiveRel, nil +} + +// lbugListValueToGoValue converts a lbug_value representing a LIST or ARRAY to +// a slice of any in Go. +func lbugListValueToGoValue(lbugValue C.lbug_value) ([]any, error) { + var listSize C.uint64_t + cLogicalType := C.lbug_logical_type{} + defer C.lbug_data_type_destroy(&cLogicalType) + C.lbug_value_get_data_type(&lbugValue, &cLogicalType) + logicalTypeId := C.lbug_data_type_get_id(&cLogicalType) + if logicalTypeId == C.LBUG_ARRAY { + C.lbug_data_type_get_num_elements_in_array(&cLogicalType, &listSize) + } else { + C.lbug_value_get_list_size(&lbugValue, &listSize) + } + list := make([]any, 0, int(listSize)) + var currentVal C.lbug_value + var errors []error + for i := C.uint64_t(0); i < listSize; i++ { + C.lbug_value_get_list_element(&lbugValue, i, ¤tVal) + value, err := lbugValueToGoValue(currentVal) + if err != nil { + errors = append(errors, err) + } + list = append(list, value) + C.lbug_value_destroy(¤tVal) + } + if len(errors) > 0 { + return list, fmt.Errorf("failed to get values: %v", errors) + } + return list, nil +} + +// lbugStructValueToGoValue converts a lbug_value representing a STRUCT to a +// map of string to any in Go. +func lbugStructValueToGoValue(lbugValue C.lbug_value) (map[string]any, error) { + structure := make(map[string]any) + var propertySize C.uint64_t + C.lbug_value_get_struct_num_fields(&lbugValue, &propertySize) + var currentKey *C.char + var currentVal C.lbug_value + var errors []error + for i := C.uint64_t(0); i < propertySize; i++ { + C.lbug_value_get_struct_field_name(&lbugValue, i, ¤tKey) + keyString := C.GoString(currentKey) + C.lbug_destroy_string(currentKey) + C.lbug_value_get_struct_field_value(&lbugValue, i, ¤tVal) + value, err := lbugValueToGoValue(currentVal) + if err != nil { + errors = append(errors, err) + } + structure[keyString] = value + C.lbug_value_destroy(¤tVal) + } + if len(errors) > 0 { + return structure, fmt.Errorf("failed to get values: %v", errors) + } + return structure, nil +} + +// lbugMapValueToGoValue converts a lbug_value representing a MAP to a +// slice of MapItem in Go. +func lbugMapValueToGoValue(lbugValue C.lbug_value) ([]MapItem, error) { + var mapSize C.uint64_t + C.lbug_value_get_map_size(&lbugValue, &mapSize) + mapItems := make([]MapItem, 0, int(mapSize)) + var currentKey C.lbug_value + var currentValue C.lbug_value + var errors []error + for i := C.uint64_t(0); i < mapSize; i++ { + C.lbug_value_get_map_key(&lbugValue, i, ¤tKey) + C.lbug_value_get_map_value(&lbugValue, i, ¤tValue) + key, err := lbugValueToGoValue(currentKey) + if err != nil { + errors = append(errors, err) + } + value, err := lbugValueToGoValue(currentValue) + if err != nil { + errors = append(errors, err) + } + C.lbug_value_destroy(¤tKey) + C.lbug_value_destroy(¤tValue) + mapItems = append(mapItems, MapItem{Key: key, Value: value}) + } + if len(errors) > 0 { + return mapItems, fmt.Errorf("failed to get values: %v", errors) + } + return mapItems, nil +} + +// lbugValueToGoValue converts a lbug_value to a corresponding Go value. +func lbugValueToGoValue(lbugValue C.lbug_value) (any, error) { + if C.lbug_value_is_null(&lbugValue) { + return nil, nil + } + var logicalType C.lbug_logical_type + defer C.lbug_data_type_destroy(&logicalType) + C.lbug_value_get_data_type(&lbugValue, &logicalType) + logicalTypeId := C.lbug_data_type_get_id(&logicalType) + switch logicalTypeId { + case C.LBUG_BOOL: + var value C.bool + status := C.lbug_value_get_bool(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get bool value with status: %d", status) + } + return bool(value), nil + case C.LBUG_INT64, C.LBUG_SERIAL: + var value C.int64_t + status := C.lbug_value_get_int64(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get int64 value with status: %d", status) + } + return int64(value), nil + case C.LBUG_INT32: + var value C.int32_t + status := C.lbug_value_get_int32(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get int32 value with status: %d", status) + } + return int32(value), nil + case C.LBUG_INT16: + var value C.int16_t + status := C.lbug_value_get_int16(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get int16 value with status: %d", status) + } + return int16(value), nil + case C.LBUG_INT128: + var value C.lbug_int128_t + status := C.lbug_value_get_int128(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get int128 value with status: %d", status) + } + return int128ToBigInt(value) + case C.LBUG_INT8: + var value C.int8_t + status := C.lbug_value_get_int8(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get int8 value with status: %d", status) + } + return int8(value), nil + case C.LBUG_UUID: + var value *C.char + status := C.lbug_value_get_uuid(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get uuid value with status: %d", status) + } + defer C.lbug_destroy_string(value) + uuidString := C.GoString(value) + return uuid.Parse(uuidString) + case C.LBUG_UINT64: + var value C.uint64_t + status := C.lbug_value_get_uint64(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get uint64 value with status: %d", status) + } + return uint64(value), nil + case C.LBUG_UINT32: + var value C.uint32_t + status := C.lbug_value_get_uint32(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get uint32 value with status: %d", status) + } + return uint32(value), nil + case C.LBUG_UINT16: + var value C.uint16_t + status := C.lbug_value_get_uint16(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get uint16 value with status: %d", status) + } + return uint16(value), nil + case C.LBUG_UINT8: + var value C.uint8_t + status := C.lbug_value_get_uint8(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get uint8 value with status: %d", status) + } + return uint8(value), nil + case C.LBUG_DOUBLE: + var value C.double + status := C.lbug_value_get_double(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get double value with status: %d", status) + } + return float64(value), nil + case C.LBUG_FLOAT: + var value C.float + status := C.lbug_value_get_float(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get float value with status: %d", status) + } + return float32(value), nil + case C.LBUG_STRING: + var outString *C.char + status := C.lbug_value_get_string(&lbugValue, &outString) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get string value with status: %d", status) + } + defer C.lbug_destroy_string(outString) + return C.GoString(outString), nil + case C.LBUG_TIMESTAMP: + var value C.lbug_timestamp_t + status := C.lbug_value_get_timestamp(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get timestamp value with status: %d", status) + } + return time.Unix(0, int64(value.value)*1000), nil + case C.LBUG_TIMESTAMP_NS: + var value C.lbug_timestamp_ns_t + status := C.lbug_value_get_timestamp_ns(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get timestamp_ns value with status: %d", status) + } + return time.Unix(0, int64(value.value)), nil + case C.LBUG_TIMESTAMP_MS: + var value C.lbug_timestamp_ms_t + status := C.lbug_value_get_timestamp_ms(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get timestamp_ms value with status: %d", status) + } + return time.Unix(0, int64(value.value)*1000000), nil + case C.LBUG_TIMESTAMP_SEC: + var value C.lbug_timestamp_sec_t + status := C.lbug_value_get_timestamp_sec(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get timestamp_sec value with status: %d", status) + } + return time.Unix(int64(value.value), 0), nil + case C.LBUG_TIMESTAMP_TZ: + var value C.lbug_timestamp_tz_t + status := C.lbug_value_get_timestamp_tz(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get timestamp_tz value with status: %d", status) + } + return time.Unix(0, int64(value.value)*1000), nil + case C.LBUG_DATE: + var value C.lbug_date_t + status := C.lbug_value_get_date(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get date value with status: %d", status) + } + return lbugDateToTime(value), nil + case C.LBUG_INTERVAL: + var value C.lbug_interval_t + status := C.lbug_value_get_interval(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get interval value with status: %d", status) + } + return lbugIntervalToDuration(value), nil + case C.LBUG_INTERNAL_ID: + var value C.lbug_internal_id_t + status := C.lbug_value_get_internal_id(&lbugValue, &value) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get internal_id value with status: %d", status) + } + return InternalID{TableID: uint64(value.table_id), Offset: uint64(value.offset)}, nil + case C.LBUG_BLOB: + var value *C.uint8_t + var length C.uint64_t + status := C.lbug_value_get_blob(&lbugValue, &value, &length) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get blob value with status: %d", status) + } + defer C.lbug_destroy_blob(value) + blob := C.GoBytes(unsafe.Pointer(value), C.int(length)) + return blob, nil + case C.LBUG_NODE: + return lbugNodeValueToGoValue(lbugValue) + case C.LBUG_REL: + return lbugRelValueToGoValue(lbugValue) + case C.LBUG_RECURSIVE_REL: + return lbugRecursiveRelValueToGoValue(lbugValue) + case C.LBUG_LIST, C.LBUG_ARRAY: + return lbugListValueToGoValue(lbugValue) + case C.LBUG_STRUCT, C.LBUG_UNION: + return lbugStructValueToGoValue(lbugValue) + case C.LBUG_MAP: + return lbugMapValueToGoValue(lbugValue) + case C.LBUG_DECIMAL: + var outString *C.char + status := C.lbug_value_get_decimal_as_string(&lbugValue, &outString) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to get string value of decimal type with status: %d", status) + } + goString := C.GoString(outString) + C.lbug_destroy_string(outString) + goDecimal, casting_error := decimal.NewFromString(goString) + if casting_error != nil { + return nil, fmt.Errorf("failed to convert decimal value with error: %w", casting_error) + } + return goDecimal, casting_error + default: + valueString := C.lbug_value_to_string(&lbugValue) + defer C.lbug_destroy_string(valueString) + return C.GoString(valueString), fmt.Errorf("unsupported data type with type id: %d. the value is force-casted to string", logicalTypeId) + } +} + +// int128ToBigInt converts a lbug_int128_t to a big.Int in Go. +func int128ToBigInt(value C.lbug_int128_t) (*big.Int, error) { + var outString *C.char + status := C.lbug_int128_t_to_string(value, &outString) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to convert int128 to string with status: %d", status) + } + defer C.lbug_destroy_string(outString) + valueString := C.GoString(outString) + bigInt := new(big.Int) + _, success := bigInt.SetString(valueString, 10) + if !success { + return nil, fmt.Errorf("failed to convert string to big.Int") + } + return bigInt, nil +} + +// goMapToLbugStruct converts a map of string to any to a lbug_value representing +// a STRUCT. It returns an error if the map is empty. +func goMapToLbugStruct(value map[string]any) (*C.lbug_value, error) { + numFields := C.uint64_t(len(value)) + if numFields == 0 { + return nil, fmt.Errorf("failed to create STRUCT value because the map is empty") + } + fieldNames := make([]*C.char, 0, len(value)) + fieldValues := make([]*C.lbug_value, 0, len(value)) + // Sort the keys to ensure the order is consistent. + // This is useful for creating a LIST of STRUCTs because in Lbug, all the + // LIST elements must have the same type (i.e., the same order of fields). + sortedKeys := make([]string, 0, len(value)) + for k := range value { + sortedKeys = append(sortedKeys, k) + } + sort.Strings(sortedKeys) + for _, k := range sortedKeys { + cName := C.CString(k) + fieldNames = append(fieldNames, cName) + defer C.free(unsafe.Pointer(cName)) + lbugValue, error := goValueToLbugValue(value[k]) + if error != nil { + return nil, fmt.Errorf("failed to convert value in the map with error: %w", error) + } + fieldValues = append(fieldValues, lbugValue) + defer C.lbug_value_destroy(lbugValue) + } + + var lbugValue *C.lbug_value + status := C.lbug_value_create_struct(numFields, &fieldNames[0], &fieldValues[0], &lbugValue) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to create STRUCT value with status: %d", status) + } + return lbugValue, nil +} + +// goSliceOfMapItemsToLbugMap converts a slice of MapItem to a lbug_value +// representing a MAP. It returns an error if the slice is empty or if the keys +// in the slice are of different types or if the values in the slice are of +// different types. +func goSliceOfMapItemsToLbugMap(slice []MapItem) (*C.lbug_value, error) { + numItems := C.uint64_t(len(slice)) + if numItems == 0 { + return nil, fmt.Errorf("failed to create MAP value because the slice is empty") + } + keys := make([]*C.lbug_value, 0, len(slice)) + values := make([]*C.lbug_value, 0, len(slice)) + for _, item := range slice { + key, error := goValueToLbugValue(item.Key) + if error != nil { + return nil, fmt.Errorf("failed to convert key in the slice with error: %w", error) + } + keys = append(keys, key) + defer C.lbug_value_destroy(key) + value, error := goValueToLbugValue(item.Value) + if error != nil { + return nil, fmt.Errorf("failed to convert value in the slice with error: %w", error) + } + values = append(values, value) + defer C.lbug_value_destroy(value) + } + var lbugValue *C.lbug_value + status := C.lbug_value_create_map(numItems, &keys[0], &values[0], &lbugValue) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to create MAP value with status: %d. please make sure all the keys are of the same type and all the values are of the same type", status) + } + return lbugValue, nil +} + +// goSliceToLbugList converts a slice of any to a lbug_value representing a LIST. +// It returns an error if the slice is empty or if the values in the slice are of +// different types. +func goSliceToLbugList(slice []any) (*C.lbug_value, error) { + numItems := C.uint64_t(len(slice)) + if numItems == 0 { + return nil, fmt.Errorf("failed to create LIST value because the slice is empty") + } + values := make([]*C.lbug_value, 0, len(slice)) + for _, item := range slice { + value, error := goValueToLbugValue(item) + if error != nil { + return nil, fmt.Errorf("failed to convert value in the slice with error: %w", error) + } + values = append(values, value) + defer C.lbug_value_destroy(value) + } + var lbugValue *C.lbug_value + status := C.lbug_value_create_list(numItems, &values[0], &lbugValue) + if status != C.LbugSuccess { + return nil, fmt.Errorf("failed to create LIST value with status: %d. please make sure all the values are of the same type", status) + } + return lbugValue, nil +} + +// lbugValueToGoValue converts a Go value to a lbug_value. +func goValueToLbugValue(value any) (*C.lbug_value, error) { + if value == nil { + return C.lbug_value_create_null(), nil + } + var lbugValue *C.lbug_value + switch v := value.(type) { + case bool: + lbugValue = C.lbug_value_create_bool(C.bool(v)) + case int: + lbugValue = C.lbug_value_create_int64(C.int64_t(v)) + case int64: + lbugValue = C.lbug_value_create_int64(C.int64_t(v)) + case int32: + lbugValue = C.lbug_value_create_int32(C.int32_t(v)) + case int16: + lbugValue = C.lbug_value_create_int16(C.int16_t(v)) + case int8: + lbugValue = C.lbug_value_create_int8(C.int8_t(v)) + case uint: + lbugValue = C.lbug_value_create_uint64(C.uint64_t(v)) + case uint64: + lbugValue = C.lbug_value_create_uint64(C.uint64_t(v)) + case uint32: + lbugValue = C.lbug_value_create_uint32(C.uint32_t(v)) + case uint16: + lbugValue = C.lbug_value_create_uint16(C.uint16_t(v)) + case uint8: + lbugValue = C.lbug_value_create_uint8(C.uint8_t(v)) + case float64: + lbugValue = C.lbug_value_create_double(C.double(v)) + case float32: + lbugValue = C.lbug_value_create_float(C.float(v)) + case string: + cStr := C.CString(v) + lbugValue = C.lbug_value_create_string(cStr) + C.free(unsafe.Pointer(cStr)) + case time.Time: + if timeHasNanoseconds(v) { + lbugValue = C.lbug_value_create_timestamp_ns(timeToLbugTimestampNs(v)) + } else { + lbugValue = C.lbug_value_create_timestamp(timeToLbugTimestamp(v)) + } + case time.Duration: + interval := durationToLbugInterval(v) + lbugValue = C.lbug_value_create_interval(interval) + case map[string]any: + return goMapToLbugStruct(v) + case []MapItem: + return goSliceOfMapItemsToLbugMap(v) + case []any: + return goSliceToLbugList(v) + default: + if reflect.TypeOf(value).Kind() == reflect.Slice { + sliceValue := reflect.ValueOf(value) + slice := make([]any, sliceValue.Len()) + for i := 0; i < sliceValue.Len(); i++ { + slice[i] = sliceValue.Index(i).Interface() + } + return goSliceToLbugList(slice) + } + return nil, fmt.Errorf("unsupported type: %T", v) + } + return lbugValue, nil +} diff --git a/internal/wiki/generator.go b/internal/wiki/generator.go index ac461abc..15dfa959 100644 --- a/internal/wiki/generator.go +++ b/internal/wiki/generator.go @@ -24,7 +24,7 @@ type SemanticProviderStatus struct { // Inputs is the dependency bundle the Generator needs. All fields are // optional except Graph (without a graph there is nothing to render). type Inputs struct { - Graph *graph.Graph + Graph graph.Store Communities *analysis.CommunityResult Processes *analysis.ProcessResult Hotspots []analysis.HotspotEntry @@ -51,7 +51,7 @@ type Result struct { // derives the supporting lookup maps; Generate writes the markdown // pages and flushes the writer. type Generator struct { - graph *graph.Graph + graph graph.Store communities *analysis.CommunityResult processes *analysis.ProcessResult hotspots []analysis.HotspotEntry diff --git a/internal/wiki/mermaid.go b/internal/wiki/mermaid.go index 3fee41fe..c4246291 100644 --- a/internal/wiki/mermaid.go +++ b/internal/wiki/mermaid.go @@ -42,7 +42,7 @@ func mermaidEscape(s string) string { // the cross-community calls between them. Each node is a community; // edge weights are the number of calls flowing across the boundary. // Used both on the index page and as the wiki//_assets file. -func RenderCommunityGraph(g *graph.Graph, communities *analysis.CommunityResult, opts CommunityGraphOpts) string { +func RenderCommunityGraph(g graph.Store, communities *analysis.CommunityResult, opts CommunityGraphOpts) string { if communities == nil || len(communities.Communities) == 0 { return "graph LR\n empty[\"No communities detected\"]\n" } @@ -235,7 +235,7 @@ func stepLabel(id string, nodeByID map[string]*graph.Node) string { // RenderArchitecture emits a Mermaid flowchart showing communities // grouped by parent (when present) plus cross-community arrows. // Mirrors the architecture overview page. -func RenderArchitecture(g *graph.Graph, communities *analysis.CommunityResult, opts CommunityGraphOpts) string { +func RenderArchitecture(g graph.Store, communities *analysis.CommunityResult, opts CommunityGraphOpts) string { if communities == nil || len(communities.Communities) == 0 { return "graph TB\n empty[\"No communities detected\"]\n" } diff --git a/scripts/fetch-lbug.sh b/scripts/fetch-lbug.sh new file mode 100755 index 00000000..c11ed04a --- /dev/null +++ b/scripts/fetch-lbug.sh @@ -0,0 +1,151 @@ +#!/usr/bin/env bash +# Fetch the prebuilt liblbug for one or more target platforms and place +# it where cgo_shared.go expects it. The native libs are NOT committed +# (see .gitignore); this script is the single source of truth and is run +# by `make build`/`make test`, by CI, and by the release pipeline. +# +# Link model (see internal/thirdparty/go-ladybug/cgo_shared.go): +# - linux / darwin : STATIC -> lib/static/-/liblbug.a +# - windows : DYNAMIC -> lib/dynamic/windows/{lbug_shared.dll, +# liblbug_shared.dll.a} (mingw import lib +# generated from the MSVC-built DLL; the +# DLL ships next to gortex.exe at runtime) +# +# Usage: +# scripts/fetch-lbug.sh # host os/arch +# scripts/fetch-lbug.sh all # every release target +# scripts/fetch-lbug.sh linux arm64 # one explicit target +# +# Env: +# LBUG_VERSION liblbug release tag without the leading v (default below) +# LBUG_VARIANT linux static flavour: compat (default) | perf +set -euo pipefail + +LBUG_VERSION="${LBUG_VERSION:-0.17.0}" +LBUG_VARIANT="${LBUG_VARIANT:-compat}" +REPO="LadybugDB/ladybug" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +GO_LBUG_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/internal/thirdparty/go-ladybug" +LIB_STATIC="$GO_LBUG_DIR/lib/static" +LIB_DYNAMIC="$GO_LBUG_DIR/lib/dynamic" + +log() { printf '\033[36m[fetch-lbug]\033[0m %s\n' "$*" >&2; } +die() { printf '\033[31m[fetch-lbug] %s\033[0m\n' "$*" >&2; exit 1; } + +download() { + local url="$1" out="$2" + if command -v curl >/dev/null 2>&1; then + curl -fsSL -o "$out" "$url" + elif command -v wget >/dev/null 2>&1; then + wget -qO "$out" "$url" + else + die "need curl or wget" + fi +} + +extract() { + local file="$1" dir="$2" + mkdir -p "$dir" + case "$file" in + *.tar.gz|*.tgz) tar -xzf "$file" -C "$dir" ;; + *.zip) unzip -oq "$file" -d "$dir" ;; + *) die "unknown archive: $file" ;; + esac +} + +# place_header copies lbug.h next to the cgo binding if it isn't already +# there (it is committed, so this only helps a stripped checkout). +place_header() { + local src_root="$1" + if [ ! -f "$GO_LBUG_DIR/lbug.h" ]; then + local h; h="$(find "$src_root" -name lbug.h | head -1 || true)" + if [ -n "$h" ]; then cp "$h" "$GO_LBUG_DIR/lbug.h"; log "placed lbug.h"; fi + fi +} + +fetch_static() { + local os="$1" arch="$2" asset libarch destdir + case "$os-$arch" in + linux-amd64) libarch=x86_64; asset="liblbug-static-linux-x86_64-${LBUG_VARIANT}.tar.gz" ;; + linux-arm64) libarch=aarch64; asset="liblbug-static-linux-aarch64-${LBUG_VARIANT}.tar.gz" ;; + darwin-amd64) asset="liblbug-static-osx-x86_64.tar.gz" ;; + darwin-arm64) asset="liblbug-static-osx-arm64.tar.gz" ;; + *) die "no static asset for $os/$arch" ;; + esac + destdir="$LIB_STATIC/$os-$arch" + if [ -f "$destdir/liblbug.a" ] && [ -z "${LBUG_FORCE:-}" ]; then + log "$os/$arch already present (LBUG_FORCE=1 to refetch)"; return 0 + fi + local tmp; tmp="$(mktemp -d)" + log "$os/$arch (static): $asset @ v$LBUG_VERSION" + download "https://github.com/$REPO/releases/download/v$LBUG_VERSION/$asset" "$tmp/$asset" + extract "$tmp/$asset" "$tmp/x" + local a; a="$(find "$tmp/x" -name 'liblbug.a' | head -1 || true)" + [ -n "$a" ] || die "liblbug.a not found in $asset" + mkdir -p "$destdir" + # Only liblbug.a goes in the static dir so `-llbug` resolves to the + # archive (no .so/.dylib for the linker to prefer). + cp "$a" "$destdir/liblbug.a" + place_header "$tmp/x" + rm -rf "$tmp" + log " -> $destdir/liblbug.a" +} + +fetch_windows() { + local asset="liblbug-windows-x86_64.zip" destdir="$LIB_DYNAMIC/windows" + if [ -f "$destdir/lbug_shared.dll" ] && [ -z "${LBUG_FORCE:-}" ]; then + log "windows/amd64 already present (LBUG_FORCE=1 to refetch)"; return 0 + fi + local tmp; tmp="$(mktemp -d)" + log "windows/amd64 (dynamic): $asset @ v$LBUG_VERSION" + download "https://github.com/$REPO/releases/download/v$LBUG_VERSION/$asset" "$tmp/$asset" + extract "$tmp/$asset" "$tmp/x" + mkdir -p "$destdir" + local dll; dll="$(find "$tmp/x" -name 'lbug_shared.dll' | head -1 || true)" + [ -n "$dll" ] || die "lbug_shared.dll not found in $asset" + # The .exe links directly against the DLL (cgo: -l:lbug_shared.dll), + # so no import lib is needed. The DLL itself must ship next to the + # .exe at runtime (the release windows job bundles it + the VC++ + # runtime). + cp "$dll" "$destdir/lbug_shared.dll" + place_header "$tmp/x" + rm -rf "$tmp" + log " -> $destdir/lbug_shared.dll" +} + +fetch_one() { + local os="$1" arch="$2" + case "$os" in + windows) fetch_windows ;; + linux|darwin) fetch_static "$os" "$arch" ;; + *) die "unsupported os $os" ;; + esac +} + +# ---- target selection ----------------------------------------------------- +declare -a targets=() +case "${1:-}" in + all) + targets=("linux amd64" "linux arm64" "darwin amd64" "darwin arm64" "windows amd64") + ;; + ""|host) + os="$(uname -s)"; arch="$(uname -m)" + case "$os" in + Linux) os=linux ;; Darwin) os=darwin ;; + MINGW*|MSYS*|CYGWIN*) os=windows ;; + *) die "unknown host os $os" ;; + esac + case "$arch" in x86_64|amd64) arch=amd64 ;; arm64|aarch64) arch=arm64 ;; esac + targets=("$os $arch") + ;; + *) + targets=("$1 ${2:-amd64}") + ;; +esac + +for t in "${targets[@]}"; do + # shellcheck disable=SC2086 + fetch_one $t +done +log "liblbug v$LBUG_VERSION ready" diff --git a/scripts/install.ps1 b/scripts/install.ps1 index dfa0eee9..8ffc491c 100644 --- a/scripts/install.ps1 +++ b/scripts/install.ps1 @@ -4,7 +4,9 @@ .DESCRIPTION Downloads the signed Windows release archive, verifies its SHA-256 - checksum, installs the binary, and puts it on the user PATH. + checksum, installs gortex.exe together with the runtime DLLs it ships + with (lbug_shared.dll + the mingw and VC++ runtime), and puts the + install directory on the user PATH. Usage: irm https://get.gortex.dev/install.ps1 | iex @@ -127,8 +129,9 @@ function Main { } Write-Info 'extracting' - Expand-Archive -Path $zipPath -DestinationPath $tmp -Force - $extracted = Join-Path $tmp $BinName + $staging = Join-Path $tmp 'extract' + Expand-Archive -Path $zipPath -DestinationPath $staging -Force + $extracted = Join-Path $staging $BinName if (-not (Test-Path $extracted)) { Die "archive did not contain a $BinName binary" } @@ -140,8 +143,14 @@ function Main { Write-Info "backing up existing binary to $backup" Move-Item -Path $target -Destination $backup -Force } - Move-Item -Path $extracted -Destination $target -Force - Write-Ok "installed $target" + # Install the whole archive, not just the .exe: on Windows gortex + # links liblbug DYNAMICALLY and ships lbug_shared.dll plus the + # mingw and VC++ runtime DLLs in the zip. Windows resolves DLLs + # from the executable's own directory, so every file must land + # next to gortex.exe or it won't start. + Copy-Item -Path (Join-Path $staging '*') -Destination $installDir -Recurse -Force + $dllCount = (Get-ChildItem -Path $installDir -Filter *.dll -ErrorAction SilentlyContinue | Measure-Object).Count + Write-Ok "installed $target (+ $dllCount runtime DLLs)" if (-not $env:GORTEX_NO_PATH) { Add-ToUserPath $installDir