diff --git a/src/conductor/cli/app.py b/src/conductor/cli/app.py index 0a5b629f..758961b3 100644 --- a/src/conductor/cli/app.py +++ b/src/conductor/cli/app.py @@ -373,6 +373,19 @@ def run( help="Path to instruction file(s) to prepend to all agent prompts. Can be repeated.", ), ] = None, + print_loaded_instructions: Annotated[ + bool, + typer.Option( + "--print-loaded-instructions", + help=( + "Print the resolved list of workspace instruction files (with " + "their scope and reason for inclusion) to stderr before running " + "the workflow. Useful for debugging why an instruction file is " + "or isn't being picked up by --workspace-instructions. Has no " + "effect unless --workspace-instructions is also set." + ), + ), + ] = False, ) -> None: """Run a workflow from a YAML file. @@ -477,6 +490,7 @@ def run( metadata=cli_metadata, workspace_instructions=workspace_instructions, cli_instructions=raw_instructions, + print_loaded_instructions=print_loaded_instructions, ) if is_verbose(): console.print(f"[bold cyan]Dashboard:[/bold cyan] {launch.url}") @@ -506,6 +520,7 @@ def run( metadata=cli_metadata, workspace_instructions=workspace_instructions, cli_instructions=raw_instructions, + print_loaded_instructions=print_loaded_instructions, ) ) diff --git a/src/conductor/cli/bg_runner.py b/src/conductor/cli/bg_runner.py index ad748f25..ecacb46c 100644 --- a/src/conductor/cli/bg_runner.py +++ b/src/conductor/cli/bg_runner.py @@ -508,6 +508,7 @@ def launch_background( metadata: dict[str, str] | None = None, workspace_instructions: bool = False, cli_instructions: list[str] | None = None, + print_loaded_instructions: bool = False, ) -> BackgroundLaunch: """Fork a detached child process running the workflow with a web dashboard. @@ -527,6 +528,9 @@ def launch_background( metadata: Optional CLI metadata key=value pairs. workspace_instructions: Whether to auto-discover workspace instruction files. cli_instructions: Optional list of instruction file paths. + print_loaded_instructions: Whether to forward ``--print-loaded-instructions`` + to the background child. Output goes to the child's captured stderr + log, not to the parent's TTY. Returns: A ``BackgroundLaunch`` describing the launch (dashboard URL, @@ -582,6 +586,9 @@ def launch_background( for instr_path in cli_instructions: cmd.extend(["--instructions", instr_path]) + if print_loaded_instructions: + cmd.append("--print-loaded-instructions") + return _spawn_bg_child(cmd=cmd, web_port=web_port, pid_workflow_ref=workflow_path) diff --git a/src/conductor/cli/run.py b/src/conductor/cli/run.py index 380ed87e..315e6947 100644 --- a/src/conductor/cli/run.py +++ b/src/conductor/cli/run.py @@ -30,6 +30,7 @@ from conductor.providers.registry import ProviderRegistry if TYPE_CHECKING: + from conductor.config.instructions import DiscoveredInstruction from conductor.config.schema import ProviderSettings from conductor.events import WorkflowEvent @@ -1371,6 +1372,77 @@ async def _execute_with_stop_signal( raise ExecutionError(stop_message) +def _emit_loaded_instructions_debug(start_dir: Path | None, enabled: bool) -> None: + """Discover-and-print the workspace instructions list when the debug flag + is enabled and auto-discovery actually ran. + + Extracted from :func:`run_workflow_async` so the branch is directly + testable without spinning up the full workflow runner. ``start_dir`` must + be the same path that :func:`build_instructions_preamble` was given as + ``auto_discover_dir`` — sharing the path is what guarantees the printed + list cannot drift from what was actually loaded. + + Args: + start_dir: The auto-discovery start directory used by the loader, or + ``None`` when auto-discovery did not run. ``None`` short-circuits + so the debug print is a no-op (matching the contract that this + flag is "meaningful only when --workspace-instructions is set"). + enabled: Whether ``--print-loaded-instructions`` was passed on the + CLI. ``False`` short-circuits. + """ + if not enabled or start_dir is None: + return + from conductor.config.instructions import discover_workspace_instructions_detailed + + detailed = discover_workspace_instructions_detailed(start_dir) + _print_loaded_instructions(detailed) + + +def _print_loaded_instructions(detailed: list[DiscoveredInstruction]) -> None: + """Emit a human-readable summary of discovered workspace instruction files + to stderr. + + Format: + + .. code-block:: text + + [workspace-instructions] 4 file(s) loaded from CWD: + AGENTS.md + source=AGENTS.md reason=file-convention + .github/instructions/csharp-coding-standards.instructions.md + source=.github/instructions reason=scope-overlap applyTo='**/*.cs' + ... + + Goes to stderr (not stdout) so it doesn't pollute JSON output. Uses a + plain-print rather than the rich console so it's reliably available in + background/non-TTY launchers. + + ``DiscoveredInstruction`` is imported under ``TYPE_CHECKING`` only: this + module has ``from __future__ import annotations``, so the annotation is + never evaluated at import time, and the lazy import inside + :func:`_emit_loaded_instructions_debug` is what actually keeps the + discovery code path out of the module graph when the flag is unused. + """ + import sys + + from conductor.config.instructions import ALWAYS_ON_SCOPE + + if not detailed: + print("[workspace-instructions] 0 files discovered from CWD.", file=sys.stderr) + return + print( + f"[workspace-instructions] {len(detailed)} file(s) discovered from CWD:", + file=sys.stderr, + ) + for d in detailed: + print(f" {d.path}", file=sys.stderr) + scope_part = f" applyTo={d.scope!r}" if d.scope and d.scope != ALWAYS_ON_SCOPE else "" + print( + f" source={d.source} reason={d.reason}{scope_part}", + file=sys.stderr, + ) + + async def run_workflow_async( workflow_path: Path, inputs: dict[str, Any], @@ -1385,6 +1457,7 @@ async def run_workflow_async( metadata: dict[str, str] | None = None, workspace_instructions: bool = False, cli_instructions: list[str] | None = None, + print_loaded_instructions: bool = False, ) -> dict[str, Any]: """Execute a workflow asynchronously. @@ -1401,6 +1474,9 @@ async def run_workflow_async( metadata: Optional CLI metadata to merge on top of YAML-declared metadata. workspace_instructions: If True, auto-discover workspace instruction files. cli_instructions: Optional list of instruction file paths from CLI. + print_loaded_instructions: If True, print the resolved instruction file + list (with scope and inclusion reason) to stderr before running. + No-op unless ``workspace_instructions`` is also True. Returns: The workflow output as a dictionary. @@ -1503,8 +1579,16 @@ async def run_workflow_async( if workspace_instructions or cli_instructions or config.workflow.instructions: from conductor.config.instructions import build_instructions_preamble + # Compute the auto-discovery start dir once and share it between + # the loader and the --print-loaded-instructions debug dump. If + # these diverged (e.g. the cwd changed between the two calls, or + # one was passed a different path), the printed list would no + # longer reflect what was actually loaded — defeating the whole + # purpose of the debug flag. + start_dir = Path.cwd() if workspace_instructions else None + instructions_preamble = build_instructions_preamble( - auto_discover_dir=Path.cwd() if workspace_instructions else None, + auto_discover_dir=start_dir, yaml_instructions=config.workflow.instructions or None, cli_instruction_paths=cli_instructions, ) @@ -1514,6 +1598,12 @@ async def run_workflow_async( style="cyan", ) + # --print-loaded-instructions: dump the resolved discovery list to + # stderr for debugging. Only meaningful when auto-discovery ran, + # and must use the same start_dir as the loader above so the + # printed list cannot silently drift from what was loaded. + _emit_loaded_instructions_debug(start_dir, print_loaded_instructions) + # Convert MCP servers from workflow config to SDK format mcp_servers = await _build_mcp_servers(config) diff --git a/src/conductor/config/instructions.py b/src/conductor/config/instructions.py index 854d27a2..8e74e62b 100644 --- a/src/conductor/config/instructions.py +++ b/src/conductor/config/instructions.py @@ -27,6 +27,7 @@ import re from collections.abc import Callable, Iterator from dataclasses import dataclass +from enum import Enum, StrEnum, auto from pathlib import Path from ruamel.yaml import YAML @@ -62,9 +63,40 @@ class ConventionDirectory: Discovery: walk CWD → git root. At each level, look for the convention directory. If found, find files matching ``pattern`` (recursively when - ``recursive=True``), apply the optional ``include_file`` predicate, and - track surviving files keyed by their *relative path within the convention - directory*. Closest-wins per relative-path key — local to this convention. + ``recursive=True``), apply the optional ``include_file`` and + ``extract_scope`` filters, and track surviving files keyed by their + *relative path within the convention directory*. Closest-wins per + relative-path key — local to this convention. + + Filter chain (applied in order, short-circuit on first reject): + + 1. ``include_file(file)`` — coarse eligibility gate. Retained for + backward-compat and for conventions whose eligibility logic is + orthogonal to scope (e.g., "skip files without frontmatter"). + Convention authors writing new scoped conventions should typically + reach for ``extract_scope`` instead. + 2. ``extract_scope(file)`` — returns the file's scope glob, or ``None`` + to opt out of auto-discovery for this file. The conductor core + interprets the return value: + + * ``None`` → opt out (file is *not* loaded by auto-discovery, + matching today's behavior for ``.github/instructions/`` files with + absent or non-string ``applyTo``). + * ``ALWAYS_ON_SCOPE`` (the string ``"**"``) → always include, + regardless of CWD. Convention authors should use the constant + rather than the literal so the contract is greppable. + * Any other string → treat as a path glob and run a bidirectional + overlap test against the user's CWD (relative to the convention + directory's owner directory). The file is included if the glob's + matched subtree overlaps with the CWD subtree in *either* + direction. Multi-glob values are supported (separators ``;`` and + ``,``); the file is included if *any* sub-glob overlaps. + + The overlap test is owned by conductor core (single source of truth), + not by individual convention authors — so adding a second scoped + convention later (e.g., Cursor's ``.cursor/rules/*.mdc`` with a + ``globs:`` frontmatter field) only requires implementing its own + ``extract_scope`` callable, not re-implementing the overlap semantic. Symlink policy: directory traversal does NOT follow symlinked directories (``os.walk(followlinks=False)``) to avoid loops and out-of-tree expansion. @@ -75,12 +107,62 @@ class ConventionDirectory: path: str pattern: str include_file: Callable[[Path], bool] | None = None + extract_scope: Callable[[Path], str | None] | None = None recursive: bool = True Convention = ConventionFile | ConventionDirectory +# Convention-author-facing sentinel for "include this file regardless of CWD". +# When ``ConventionDirectory.extract_scope`` returns this exact string, the +# overlap test is short-circuited and the file is always loaded. Defined as a +# module constant so convention authors can grep for usages instead of +# scattering string literals. +ALWAYS_ON_SCOPE = "**" + + +class DiscoveryReason(StrEnum): + """Stable, machine-readable reason a file was auto-discovered. + + A ``StrEnum`` (mirroring ``RegistryType`` in ``registry/config.py``) so the + f-string output and the string equality checks keep working unchanged, + while a typo like ``"scope_overlap"`` can no longer construct a record. + """ + + FILE_CONVENTION = "file-convention" + ALWAYS_ON = "always-on" + SCOPE_OVERLAP = "scope-overlap" + + +@dataclass(frozen=True) +class DiscoveredInstruction: + """A workspace instruction file discovered by ``--workspace-instructions``, + annotated with its filtering provenance. + + Used by :func:`discover_workspace_instructions_detailed` so that consumers + (e.g., ``--print-loaded-instructions``) can report *why* each file was + included without re-parsing the filesystem. + """ + + path: Path + # The convention's path identifier (e.g. ``"AGENTS.md"``, + # ``".github/instructions"``), useful for grouping in output. + source: str + # The scope value that was extracted from the file, if any. ``None`` for + # conventions with no scope concept (single-file conventions; directory + # conventions without ``extract_scope``). + scope: str | None + # Stable, machine-readable reason for inclusion. See :class:`DiscoveryReason`: + # + # * ``FILE_CONVENTION`` — single-file convention, no scope concept. + # * ``ALWAYS_ON`` — scoped convention but file's scope is + # :data:`ALWAYS_ON_SCOPE` (or the convention has no ``extract_scope``). + # * ``SCOPE_OVERLAP`` — scoped convention; file's scope glob overlapped + # with the user's CWD subtree. + reason: DiscoveryReason + + # --------------------------------------------------------------------------- # Frontmatter parser for GitHub Copilot's `.github/instructions/` convention # --------------------------------------------------------------------------- @@ -142,28 +224,154 @@ def _parse_frontmatter(path: Path) -> dict[str, object] | None: return fm -def _is_always_on_instructions_file(path: Path) -> bool: - """Return True iff a ``.github/instructions/*.instructions.md`` file is - explicitly always-on per GitHub's documented convention semantics. +def _extract_apply_to(path: Path) -> str | None: + """Extract the ``applyTo`` glob from a ``.github/instructions/*.instructions.md`` + file's YAML frontmatter. + + Wired to :class:`ConventionDirectory` as the ``extract_scope`` callable for + GitHub Copilot's instructions convention. Conductor core interprets the + return value (see :class:`ConventionDirectory` docstring for the full + contract): - Per - https://docs.github.com/en/copilot/customizing-copilot/about-customizing-github-copilot-chat-responses - and https://code.visualstudio.com/docs/copilot/customization/custom-instructions: + * ``None`` → file is opted out of auto-discovery (no parsable frontmatter, + missing ``applyTo``, or ``applyTo`` is not a string). Matches the + GitHub spec for "not applied automatically" files. + * Any string → conductor core decides what to do (``"**"`` = always include; + anything else = scoped glob, evaluated against CWD). - * ``applyTo: "**"`` → "always applied" → INCLUDE in conductor preamble - * ``applyTo: ""`` → scoped to matched files in chat → SKIP - (conductor has no per-agent file-scope concept) - * ``applyTo`` absent → "not applied automatically; you can still add them - manually to a chat request" → SKIP + Notes on what we *don't* parse: - Conductor's preamble is always-on for every agent prompt. To honor the - convention exactly, only files explicitly marked ``applyTo: "**"`` are - loaded. + * GitHub's spec permits ``applyTo`` to be a single string; YAML lists are + not part of the documented convention. Authors expressing multiple + patterns use semicolon- or comma-separated strings — which conductor's + overlap test splits natively. A list value here returns ``None`` + (opt-out) rather than silently joining; that mirrors the convention's + stated shape. """ fm = _parse_frontmatter(path) if fm is None: - return False - return fm.get("applyTo") == "**" + return None + apply_to = fm.get("applyTo") + if apply_to is None: + # ``applyTo`` genuinely absent → opt out silently, per GitHub's spec + # for files "not applied automatically". This is the expected quiet + # path, kept distinct from the authoring-slip paths below. + return None + if not isinstance(apply_to, str): + # Present but the wrong shape (e.g. a YAML list). That's an authoring + # slip, not an intentional opt-out — warn so it isn't a silent drop, + # then opt out (we don't guess how to join a list into one glob). + logger.warning( + "applyTo in %s is a %s, not a string; the convention expects a " + "single (optionally ';'/','-separated) glob. Skipping this file " + "from auto-discovery.", + path, + type(apply_to).__name__, + ) + return None + if not _MULTI_GLOB_SEP_RE.sub("", apply_to).strip(): + # Present but empty / whitespace / separators-only → no glob to match + # on. Same authoring-slip treatment: warn rather than silently drop. + logger.warning( + "applyTo in %s is empty or has no usable glob (%r); skipping this " + "file from auto-discovery.", + path, + apply_to, + ) + return None + return apply_to + + +# Multi-glob separators observed in real-world ``applyTo`` values: both ``;`` +# and ``,`` appear in production usage (see the Azure Chaos Studio sample in +# microsoft/conductor#231). Splitting on either keeps the helper compatible +# with authors' actual writing. NOTE: this does NOT support brace expansion +# inside a single glob (e.g., ``{src,tests}/**``) — the inner comma would be +# treated as a separator and the brace would never close. We don't see brace +# expansion in real-world data; if it becomes a pain point, swap this regex +# for a brace-aware splitter. +_MULTI_GLOB_SEP_RE = re.compile(r"[;,]") + +# Characters that signal the start of a wildcard segment in a path glob. +# Anything before the first segment containing one of these is a literal +# directory prefix, which is what we use for the overlap approximation. +_GLOB_WILDCARD_RE = re.compile(r"[*?\[]") + + +def _normalize_scope_path(p: str) -> str: + """Normalize a path-glob or CWD-relative fragment to a comparable form. + + * Converts backslashes (Windows-authored values) to forward slashes. + * Strips repeated ``./`` prefixes and leading ``/`` (some authors write + ``"/docs/eng.ms/**"`` — observed in real-world data). + * Strips a trailing ``/``. + * Normalises ``.`` (current dir) to the empty string so "workspace root" + has a single canonical representation. + """ + n = p.replace("\\", "/").strip() + while n.startswith("./"): + n = n[2:] + n = n.lstrip("/").rstrip("/") + if n == ".": + n = "" + return n + + +def _single_glob_overlaps(glob: str, cwd_norm: str) -> bool: + """Return True if a single normalized glob could match any file under + ``cwd_norm``, OR if the glob's target subtree is itself under ``cwd_norm``. + + Uses a literal-prefix approximation: the glob's directory parts up to the + first wildcard segment form a "minimum prefix" subtree the glob touches. + Two subtrees overlap if either is contained in the other (or they're + identical). + + This is intentionally conservative — it can over-include for globs like + ``"src/*/tests/**"`` evaluated against ``"src/foo/bar"`` (literal prefix + ``"src"`` overlaps ``"src/foo/bar"`` even though the glob can't actually + match a non-test file under ``bar/``). Over-inclusion is the safer failure + mode for this codepath: the alternative is silently dropping instructions + a user reasonably expects to be loaded. + """ + g = _normalize_scope_path(glob) + + prefix_parts: list[str] = [] + for part in g.split("/"): + if _GLOB_WILDCARD_RE.search(part): + break + prefix_parts.append(part) + prefix = "/".join(prefix_parts) + + # Empty prefix: glob's first segment is a wildcard (e.g. ``**/*.cs``), + # so it can match anywhere in the tree → overlaps any CWD. + if not prefix: + return True + # Empty CWD: user is at workspace root; everything is "inside" it. + if not cwd_norm: + return True + # Either subtree contains the other. + return ( + cwd_norm == prefix or cwd_norm.startswith(prefix + "/") or prefix.startswith(cwd_norm + "/") + ) + + +def _scope_overlaps(scope: str, cwd_rel: str) -> bool: + """Return True if the ``scope`` value (a single or multi-glob string) + overlaps with the ``cwd_rel`` subtree. + + Handles multi-glob values where authors separate sub-globs with ``;`` or + ``,`` (both observed in production ``applyTo`` strings). Empty sub-globs + (e.g., trailing separator) are skipped silently. + + See :func:`_single_glob_overlaps` for the per-glob semantics; this just + short-circuits on the first matching sub-glob. + """ + cwd_norm = _normalize_scope_path(cwd_rel) + for sub in _MULTI_GLOB_SEP_RE.split(scope): + sub = sub.strip() + if sub and _single_glob_overlaps(sub, cwd_norm): + return True + return False # --------------------------------------------------------------------------- @@ -181,7 +389,7 @@ def _is_always_on_instructions_file(path: Path) -> bool: ConventionDirectory( path=".github/instructions", pattern="*.instructions.md", - include_file=_is_always_on_instructions_file, + extract_scope=_extract_apply_to, recursive=True, ), ] @@ -223,14 +431,39 @@ def _find_git_root(start_dir: Path) -> Path | None: def _walk_directory_convention( - base_dir: Path, convention: ConventionDirectory -) -> Iterator[tuple[str, Path]]: - """Yield ``(relative_path, absolute_path)`` for files in ``base_dir`` - matching ``convention.pattern`` and (if set) ``convention.include_file``. + base_dir: Path, + convention: ConventionDirectory, + cwd_rel: str = "", +) -> Iterator[tuple[str, Path, str | None]]: + """Yield ``(relative_path, absolute_path, scope)`` for files in ``base_dir`` + matching ``convention.pattern`` and surviving the configured filter chain. The relative path is computed against ``base_dir`` and uses POSIX separators for cross-platform deterministic ordering. + The ``scope`` value carried in each tuple is whatever + ``convention.extract_scope`` returned for the file (or ``None`` when the + convention has no ``extract_scope`` callable). Consumers can use it to + build :class:`DiscoveredInstruction` records without re-parsing each + file's frontmatter. + + Filter chain (per file, short-circuit on first reject): + + 1. ``convention.include_file(file)`` if set. + 2. ``convention.extract_scope(file)`` if set — yields ``None``? Skip. + Returns :data:`ALWAYS_ON_SCOPE` (``"**"``)? Include unconditionally. + Returns any other glob string? Include only if it overlaps + ``cwd_rel``. + + ``cwd_rel`` is interpreted as a POSIX path relative to the convention's + *owner directory* (the directory containing ``convention.path`` — + typically the git root, but may be a nested project dir when the + convention directory was discovered at a non-root level). Callers that + discover the convention at multiple levels should pass the appropriate + per-level ``cwd_rel`` so a nested ``.github/instructions/foo.md`` with + ``applyTo: "src/**"`` resolves ``src`` relative to that nested project, + not the workspace root. + Symlinked directories are NOT traversed (``followlinks=False``); symlinked files inside the tree are listed and read like regular files. """ @@ -238,10 +471,11 @@ def _walk_directory_convention( for dirpath, _dirnames, filenames in os.walk(base_dir, followlinks=False): for fname in fnmatch.filter(filenames, convention.pattern): file_path = Path(dirpath) / fname - if convention.include_file is not None and not convention.include_file(file_path): + accepted_scope = _apply_convention_filters(convention, file_path, cwd_rel) + if accepted_scope is _REJECT: continue rel = file_path.relative_to(base_dir).as_posix() - yield rel, file_path + yield rel, file_path, accepted_scope else: try: entries = list(os.scandir(base_dir)) @@ -259,50 +493,152 @@ def _walk_directory_convention( if not fnmatch.fnmatch(entry.name, convention.pattern): continue file_path = Path(entry.path) - if convention.include_file is not None and not convention.include_file(file_path): + accepted_scope = _apply_convention_filters(convention, file_path, cwd_rel) + if accepted_scope is _REJECT: continue - yield entry.name, file_path + yield entry.name, file_path, accepted_scope + + +class _Reject(Enum): + """One-member sentinel enum for :func:`_apply_convention_filters`. + + A single-member ``Enum`` (rather than ``object()``) so an ``is _REJECT`` + guard gives the type checker real narrowing: afterwards the value is back + to ``str | None`` instead of a return type widened all the way to + ``object``. + """ + + TOKEN = auto() + + +# Sentinel returned by :func:`_apply_convention_filters` to signal "this file +# was rejected by one of the filters; skip it." Distinct from ``None``, which +# is a legitimate ``scope`` value meaning "no scope concept; the convention +# has no ``extract_scope`` callable." +_REJECT = _Reject.TOKEN + + +def _apply_convention_filters( + convention: ConventionDirectory, file_path: Path, cwd_rel: str +) -> str | None | _Reject: + """Apply :class:`ConventionDirectory`'s filter chain to a candidate file. + + Returns either: + + * :data:`_REJECT` — the file failed a filter and must be skipped. + * The scope value (``None`` for conventions without ``extract_scope``, + otherwise whatever ``extract_scope`` returned) — the file passed. + """ + if convention.include_file is not None and not convention.include_file(file_path): + return _REJECT + if convention.extract_scope is None: + return None # convention has no scope concept; file passes + scope = convention.extract_scope(file_path) + if scope is None: + return _REJECT # file opted out + if scope == ALWAYS_ON_SCOPE: + return scope # always-on, no overlap test + if not _scope_overlaps(scope, cwd_rel): + return _REJECT + return scope def discover_workspace_instructions(start_dir: Path) -> list[Path]: """Discover convention instruction files by walking up to the git root. - Searches from ``start_dir`` up to the git repository root for each entry - in :data:`CONVENTIONS`. Files closer to ``start_dir`` take precedence - when the same logical match key exists at multiple levels (closest-wins). + See :func:`discover_workspace_instructions_detailed` for the underlying + discovery algorithm and per-file filtering provenance. This wrapper + projects the result to just the file paths for callers that don't need + the metadata (e.g., :func:`load_instruction_files`). + """ + return [d.path for d in discover_workspace_instructions_detailed(start_dir)] + + +def discover_workspace_instructions_detailed(start_dir: Path) -> list[DiscoveredInstruction]: + """Discover convention instruction files, returning structured metadata. + Walks from ``start_dir`` up to the git repository root for each entry in + :data:`CONVENTIONS`. Files closer to ``start_dir`` take precedence when + the same logical match key exists at multiple levels (closest-wins). Each convention has its own discovery state, so keys do not collide across conventions (e.g., a hypothetical ``.cursor/rules/style.md`` would not shadow ``.github/instructions/style.instructions.md``). + For :class:`ConventionDirectory` entries with a configured + ``extract_scope`` callable, each candidate file's scope is evaluated + against the user's CWD *relative to the convention's owner directory* + (the directory containing the convention's path at the level where it + was found). This means a nested + ``services/GW/.github/instructions/foo.md`` with + ``applyTo: "src/**"`` resolves ``src`` relative to ``services/GW`` — + matching the "closest-wins / local-convention" semantic that makes + nested conventions meaningful. + Returns: - List of discovered instruction file paths. File conventions appear - first in their :data:`CONVENTIONS` declaration order; directory + List of :class:`DiscoveredInstruction` records. File conventions + appear first in :data:`CONVENTIONS` declaration order; directory conventions follow with their files sorted by relative path within the convention directory. """ + start_resolved = start_dir.resolve() git_root = _find_git_root(start_dir) - stop_at = git_root if git_root is not None else start_dir.resolve() + # Resolve git_root so equality against the (already-resolved) ``current`` + # walk pointer holds even when the git root is reached via a symlinked + # path. Without this, ``current == stop_at`` can be False at the git + # root and the walk overshoots into the parent filesystem. + stop_at = git_root.resolve() if git_root is not None else start_resolved - result: list[Path] = [] + result: list[DiscoveredInstruction] = [] for convention in CONVENTIONS: # Per-convention state: closest-wins is local to this convention only. - discovered: dict[str, Path] = {} + # Value carries the scope-at-discovery-time so we don't re-parse later. + discovered: dict[str, tuple[Path, str | None]] = {} - current = start_dir.resolve() + current = start_resolved while True: + # cwd_rel is relative to the *current* walk level — that is, the + # directory that owns the convention at this iteration. For a + # root-level convention this equals start-dir-relative-to-git-root; + # for a nested convention it's start-dir-relative-to-the-nested- + # project, which is what the convention's `applyTo` globs are + # naturally written against. + try: + cwd_rel = start_resolved.relative_to(current).as_posix() + except ValueError: + # ``current`` is somehow not an ancestor of ``start_resolved``. + # This can't happen today — ``current`` only ever moves up via + # ``.parent`` from ``start_resolved`` — so log before the + # permissive fallback (mirroring the assert in + # ``_discovery_reason``); a future regression then leaves a + # trace instead of a silent over-include. Keep the fallback to + # empty CWD, which makes the overlap test permissive — + # consistent with our "over-include rather than silently skip" + # bias. + logger.warning( + "Instruction discovery: %s is not an ancestor of %s; " + "falling back to a permissive (empty) CWD for the overlap " + "test. This is unexpected — please report it.", + current, + start_resolved, + ) + cwd_rel = "" + if cwd_rel == ".": + cwd_rel = "" + if isinstance(convention, ConventionFile): if convention.path not in discovered: candidate = current / convention.path if candidate.is_file(): - discovered[convention.path] = candidate + discovered[convention.path] = (candidate, None) logger.debug("Discovered instruction file: %s", candidate) else: # ConventionDirectory base_dir = current / convention.path if base_dir.is_dir(): - for rel, abs_path in _walk_directory_convention(base_dir, convention): + for rel, abs_path, scope in _walk_directory_convention( + base_dir, convention, cwd_rel + ): if rel not in discovered: - discovered[rel] = abs_path + discovered[rel] = (abs_path, scope) logger.debug("Discovered instruction file: %s", abs_path) if current == stop_at or current.parent == current: @@ -312,14 +648,50 @@ def discover_workspace_instructions(start_dir: Path) -> list[Path]: # Append this convention's discoveries in deterministic order. if isinstance(convention, ConventionFile): if convention.path in discovered: - result.append(discovered[convention.path]) + path, _ = discovered[convention.path] + result.append( + DiscoveredInstruction( + path=path, + source=convention.path, + scope=None, + reason=DiscoveryReason.FILE_CONVENTION, + ) + ) else: # ConventionDirectory for rel in sorted(discovered.keys()): - result.append(discovered[rel]) + path, scope = discovered[rel] + reason = _discovery_reason(convention, scope) + result.append( + DiscoveredInstruction( + path=path, + source=convention.path, + scope=scope, + reason=reason, + ) + ) return result +def _discovery_reason(convention: ConventionDirectory, scope: str | None) -> DiscoveryReason: + """Compute the machine-readable inclusion reason for a discovered file.""" + if convention.extract_scope is None: + # Convention with no scope concept (or include_file-only); the file + # passed the eligibility gate and that's all there is to say. + return DiscoveryReason.ALWAYS_ON + # Invariant: when ``extract_scope`` is set, ``_apply_convention_filters`` + # returns ``_REJECT`` rather than a ``None`` scope, so a discovered file + # always carries a concrete scope string here. Pin this with an assert + # so a future regression surfaces loudly instead of being silently + # relabeled as "always-on". + assert scope is not None, ( + "extract_scope set but scope is None — _apply_convention_filters invariant violated" + ) + if scope == ALWAYS_ON_SCOPE: + return DiscoveryReason.ALWAYS_ON + return DiscoveryReason.SCOPE_OVERLAP + + def load_instruction_files(paths: list[Path]) -> str: """Read and concatenate instruction files into a single text block. diff --git a/tests/test_config/test_instructions.py b/tests/test_config/test_instructions.py index 30398a24..bb615536 100644 --- a/tests/test_config/test_instructions.py +++ b/tests/test_config/test_instructions.py @@ -5,6 +5,7 @@ import logging import os from pathlib import Path +from typing import Any import pytest @@ -656,6 +657,65 @@ def test_cli_instructions_forwarded(self) -> None: assert cmd[instr_indices[0] + 1] == "AGENTS.md" assert cmd[instr_indices[1] + 1] == "CLAUDE.md" + def test_print_loaded_instructions_flag_forwarded(self) -> None: + """--print-loaded-instructions should appear in the subprocess command + when set so background runs surface discovery info in their captured + stderr log.""" + import contextlib + from unittest.mock import patch + + from conductor.cli.bg_runner import launch_background + + with ( + patch("conductor.cli.bg_runner.subprocess.Popen") as mock_popen, + patch("conductor.cli.bg_runner._wait_for_server", return_value=True), + ): + mock_popen.return_value.pid = 12345 + + with contextlib.suppress(Exception): + launch_background( + workflow_path=Path("test.yaml"), + inputs={}, + workspace_instructions=True, + print_loaded_instructions=True, + web_port=9999, + ) + + # Hard-assert that the subprocess command was actually built — + # otherwise the `in` assertion below silently no-ops if Popen + # was never reached. + assert mock_popen.called, "expected launch_background to invoke Popen" + cmd = mock_popen.call_args[0][0] + assert "--print-loaded-instructions" in cmd + + def test_print_loaded_instructions_flag_omitted_when_unset(self) -> None: + """--print-loaded-instructions must NOT appear when not requested + (avoid leaking into background runs by default).""" + import contextlib + from unittest.mock import patch + + from conductor.cli.bg_runner import launch_background + + with ( + patch("conductor.cli.bg_runner.subprocess.Popen") as mock_popen, + patch("conductor.cli.bg_runner._wait_for_server", return_value=True), + ): + mock_popen.return_value.pid = 12345 + + with contextlib.suppress(Exception): + launch_background( + workflow_path=Path("test.yaml"), + inputs={}, + workspace_instructions=True, + web_port=9999, + ) + + # Hard-assert that the subprocess command was actually built — + # otherwise the `not in` assertion below silently no-ops. + assert mock_popen.called, "expected launch_background to invoke Popen" + cmd = mock_popen.call_args[0][0] + assert "--print-loaded-instructions" not in cmd + # --------------------------------------------------------------------------- # Directory-convention discovery: .github/instructions/*.instructions.md @@ -706,13 +766,106 @@ def test_discovers_always_on_file(self, tmp_path: Path) -> None: result = discover_workspace_instructions(tmp_path) assert any(p.name == "style.instructions.md" for p in result) - def test_skips_scoped_file(self, tmp_path: Path) -> None: - """`applyTo: ''` is scoped per the docs and SHOULD NOT be loaded.""" + def test_scoped_file_loads_at_root_cwd(self, tmp_path: Path) -> None: + """`applyTo: ''` files load when CWD is the repo root. + + At the root, bidirectional overlap loads every scoped file + (correctness fix for the silently-skipped bug — see + microsoft/conductor#231). Narrowing kicks in only when the user + `cd`s into a subdir, exercised by + ``test_scoped_file_skipped_when_cwd_disjoint`` below. + """ (tmp_path / ".git").mkdir() f = tmp_path / ".github" / "instructions" / "ts-only.instructions.md" _write_with_frontmatter(f, apply_to="**/*.ts", body="TS rules.") result = discover_workspace_instructions(tmp_path) - assert not any(p.name == "ts-only.instructions.md" for p in result) + assert any(p.name == "ts-only.instructions.md" for p in result) + + def test_scoped_file_skipped_when_cwd_disjoint(self, tmp_path: Path) -> None: + """When CWD is a subdir whose subtree doesn't overlap the file's + `applyTo` glob, the scoped file is correctly skipped. + + This is the narrowing case: at `services/AS`, a file scoped to + `services/GW/**` should NOT load. + """ + (tmp_path / ".git").mkdir() + # Scoped instructions live at the repo root, applyTo = services/GW/** + _write_with_frontmatter( + tmp_path / ".github" / "instructions" / "gw-only.instructions.md", + apply_to="services/GW/**", + body="GW rules.", + ) + # CWD is services/AS — disjoint from services/GW + as_dir = tmp_path / "services" / "AS" + as_dir.mkdir(parents=True) + result = discover_workspace_instructions(as_dir) + assert not any(p.name == "gw-only.instructions.md" for p in result) + + def test_scoped_file_loads_when_cwd_inside_scope(self, tmp_path: Path) -> None: + """When CWD is inside the file's `applyTo` subtree, the file loads.""" + (tmp_path / ".git").mkdir() + _write_with_frontmatter( + tmp_path / ".github" / "instructions" / "gw-only.instructions.md", + apply_to="services/GW/**", + body="GW rules.", + ) + # CWD is deep inside services/GW + gw_src = tmp_path / "services" / "GW" / "src" / "Controllers" + gw_src.mkdir(parents=True) + result = discover_workspace_instructions(gw_src) + assert any(p.name == "gw-only.instructions.md" for p in result) + + def test_multi_glob_applyto_semicolon(self, tmp_path: Path) -> None: + """Multi-glob applyTo with ';' separator loads when any sub-glob overlaps.""" + (tmp_path / ".git").mkdir() + _write_with_frontmatter( + tmp_path / ".github" / "instructions" / "gw-or-be.instructions.md", + apply_to="services/GW/**;services/BE/**", + body="GW or BE rules.", + ) + be_dir = tmp_path / "services" / "BE" + be_dir.mkdir(parents=True) + result = discover_workspace_instructions(be_dir) + assert any(p.name == "gw-or-be.instructions.md" for p in result) + + def test_multi_glob_applyto_comma(self, tmp_path: Path) -> None: + """Multi-glob applyTo with ',' separator loads when any sub-glob overlaps.""" + (tmp_path / ".git").mkdir() + _write_with_frontmatter( + tmp_path / ".github" / "instructions" / "lang-mix.instructions.md", + apply_to="**/*.cs,**/*.csproj", + body="C# and project rules.", + ) + # CWD anywhere — leading-** sub-globs overlap any CWD + sub = tmp_path / "services" / "GW" + sub.mkdir(parents=True) + result = discover_workspace_instructions(sub) + assert any(p.name == "lang-mix.instructions.md" for p in result) + + def test_nested_convention_scope_relative_to_owner_dir(self, tmp_path: Path) -> None: + """A nested `.github/instructions/foo.md` interprets its `applyTo` + relative to the nested project's own directory (its owner), not + relative to the workspace root. + + Without per-walk-level cwd_rel, a nested file with + `applyTo: "src/**"` evaluated from `services/GW` would compare + `"src/**"` against cwd_rel `"services/GW"` (root-relative) and + miss. With per-owner cwd_rel, cwd_rel is `""` (start_dir == owner), + which always overlaps. + """ + (tmp_path / ".git").mkdir() + # Nested .github/instructions/ at services/GW/ + nested = tmp_path / "services" / "GW" / ".github" / "instructions" + nested.mkdir(parents=True) + _write_with_frontmatter( + nested / "gw-src.instructions.md", + apply_to="src/**", + body="GW src rules.", + ) + # CWD is services/GW (the owner dir for the nested convention) + gw_dir = tmp_path / "services" / "GW" + result = discover_workspace_instructions(gw_dir) + assert any(p.name == "gw-src.instructions.md" for p in result) def test_skips_no_frontmatter(self, tmp_path: Path) -> None: """Files without any frontmatter are 'manual-attach' per the docs → SKIP.""" @@ -849,9 +1002,546 @@ def test_pattern_must_match(self, tmp_path: Path) -> None: result = discover_workspace_instructions(tmp_path) assert not any(p.name == "foo.md" for p in result) + @pytest.mark.skipif( + os.name == "nt", + reason="Symlink creation requires elevation on Windows", + ) + def test_symlinked_git_root_stops_walk(self, tmp_path: Path) -> None: + """Regression for R1-003: a symlinked path to the git root must stop + the parent-walk at the git root. + + Before the fix, ``stop_at`` held the unresolved git_root path while + ``current`` was walked via ``start_dir.resolve()``. On a symlinked + git root the two paths differed, ``current == stop_at`` never held, + and the walk overshot into the parent filesystem — potentially + picking up out-of-repo instruction files. + """ + # Real repo with a top-level instruction file. + real_repo = tmp_path / "real_repo" + real_repo.mkdir() + (real_repo / ".git").mkdir() + _write_with_frontmatter( + real_repo / ".github" / "instructions" / "in_repo.instructions.md", + apply_to="**", + ) + # An out-of-repo instruction file in a sibling fake "outer" repo that + # also looks like a git root. If the walk overshoots past the real + # git root, it could end up here. + outer = tmp_path / "outer" + outer.mkdir() + (outer / ".git").mkdir() + _write_with_frontmatter( + outer / ".github" / "instructions" / "out_of_repo.instructions.md", + apply_to="**", + ) + # Access the repo via a symlink so start_dir.resolve() differs from + # the unresolved git_root that _find_git_root returns. + link = tmp_path / "link_to_repo" + os.symlink(real_repo, link, target_is_directory=True) + sub = link / "sub" + sub.mkdir() + + result = discover_workspace_instructions(sub) + names = [p.name for p in result] + assert "in_repo.instructions.md" in names + assert "out_of_repo.instructions.md" not in names + + +# --------------------------------------------------------------------------- +# _scope_overlaps + _single_glob_overlaps + _normalize_scope_path +# --------------------------------------------------------------------------- + + +class TestScopeOverlaps: + """Tests for the bidirectional glob-vs-CWD overlap helper. + + The overlap test is intentionally conservative (over-approximates): better + to load too many instructions than to silently skip ones the user expects. + These tests pin down both correct positives and the principled + over-approximations so future maintainers don't "fix" them into false + negatives. + """ + + def test_root_cwd_loads_everything(self) -> None: + """At repo root (cwd_rel=''), every prefix overlaps.""" + from conductor.config.instructions import _scope_overlaps + + assert _scope_overlaps("services/GW/**", "") + assert _scope_overlaps("**/*.cs", "") + assert _scope_overlaps("docs/eng.ms/**", "") + assert _scope_overlaps("CHANGELOG.md", "") + + def test_cwd_inside_scope_subtree(self) -> None: + """CWD is a descendant of the glob's prefix subtree.""" + from conductor.config.instructions import _scope_overlaps + + assert _scope_overlaps("services/GW/**", "services/GW") + assert _scope_overlaps("services/GW/**", "services/GW/src/Controllers") + assert _scope_overlaps("docs/**", "docs/api/reference") + + def test_scope_inside_cwd_subtree(self) -> None: + """The glob's prefix subtree is a descendant of CWD.""" + from conductor.config.instructions import _scope_overlaps + + # Repo root with a nested-target scope + assert _scope_overlaps("services/GW/**", "services") + # CWD = services, scope target = services/GW/** → scope is inside CWD + assert _scope_overlaps("docs/api/**", "docs") + + def test_disjoint_subtrees(self) -> None: + """Disjoint prefix subtrees: no overlap.""" + from conductor.config.instructions import _scope_overlaps + + assert not _scope_overlaps("services/GW/**", "services/AS") + assert not _scope_overlaps("docs/eng.ms/**", "services/GW") + assert not _scope_overlaps("portal/extension/**", "services/GW") + + def test_segment_boundary_not_prefix_match(self) -> None: + """`foo` must NOT overlap `food` — segment boundaries matter. + + This pins down a class of regression where a future "simplification" + might use bare `str.startswith` (would falsely match `food` against + prefix `foo`). The implementation guards against this with + `prefix + "/"` boundary checks. + """ + from conductor.config.instructions import _scope_overlaps + + assert not _scope_overlaps("foo", "food") + assert not _scope_overlaps("services/GW", "services/GWX") + assert not _scope_overlaps("services/GW/**", "services/GWX/src") + + def test_leading_double_star_matches_anywhere(self) -> None: + """Globs starting with `**` have an empty literal prefix → always overlap.""" + from conductor.config.instructions import _scope_overlaps + + assert _scope_overlaps("**/*.cs", "services/GW/src") + assert _scope_overlaps("**/portal-extension/**", "services/AS") + assert _scope_overlaps("**/kubectl/*.yaml", "anywhere/at/all") + + def test_multi_glob_semicolon_separator(self) -> None: + """`;`-separated multi-glob: overlap iff any sub-glob overlaps.""" + from conductor.config.instructions import _scope_overlaps + + assert _scope_overlaps("services/GW/**;services/BE/**", "services/BE") + assert _scope_overlaps("services/GW/**;services/BE/**", "services/GW") + assert not _scope_overlaps("services/GW/**;services/BE/**", "services/AS") + + def test_multi_glob_comma_separator(self) -> None: + """`,`-separated multi-glob: overlap iff any sub-glob overlaps.""" + from conductor.config.instructions import _scope_overlaps + + assert _scope_overlaps("**/*.cs,**/*.csproj,**/Directory.Packages.props", "anywhere") + assert _scope_overlaps("docs/**,**/*.md", "docs/api") + # Note: leading-** sub-globs make this trivially true; an unambiguous + # disjoint case requires fully-prefixed sub-globs. + assert not _scope_overlaps("services/GW/**,services/BE/**", "services/AS") + + def test_leading_slash_in_glob(self) -> None: + """Authors sometimes write `/docs/...` (observed in real data). + Leading slash should not break the overlap test.""" + from conductor.config.instructions import _scope_overlaps + + assert _scope_overlaps("/docs/eng.ms/**", "docs/eng.ms/articles") + assert not _scope_overlaps("/docs/eng.ms/**", "services/GW") + + def test_exact_file_glob(self) -> None: + """A glob with no wildcards is an exact file path. Overlap if CWD + is the file's directory (or an ancestor of it).""" + from conductor.config.instructions import _scope_overlaps + + # CWD = src, scope = src/foo/bar.cs → scope is inside CWD → overlap + assert _scope_overlaps("src/foo/bar.cs", "src") + # CWD = src/foo/bar, scope = src/foo/bar.cs → scope is at parent of CWD; + # technically the file is at src/foo/, NOT inside src/foo/bar. The + # literal-prefix-overlap algorithm conservatively returns True (prefix + # 'src/foo/bar.cs' overlaps with cwd 'src/foo/bar' — neither contains + # the other strictly, so disjoint). This is OVER-APPROXIMATION in the + # principled direction; documenting here so it's not "fixed". + assert not _scope_overlaps("src/foo/bar.cs", "src/foo/baz") + + def test_over_approximation_intermediate_wildcard(self) -> None: + """`src/*/tests/**` evaluated against `src/foo/bar` has literal prefix + `src` which overlaps `src/foo/bar`. The glob can't actually match any + file under `src/foo/bar/` (because the path doesn't go through + `tests/`), but we over-include rather than risk skipping. Document the + intentional false-positive so it isn't "fixed" into a false negative.""" + from conductor.config.instructions import _scope_overlaps + + assert _scope_overlaps("src/*/tests/**", "src/foo/bar") + # The narrowing case still works: cwd outside `src/*` is correctly disjoint + assert not _scope_overlaps("src/*/tests/**", "docs") + + def test_empty_and_whitespace_components(self) -> None: + """Empty / whitespace sub-globs are skipped, not treated as 'always-match'.""" + from conductor.config.instructions import _scope_overlaps + + # Trailing separator → empty trailing sub-glob skipped + assert _scope_overlaps("services/GW/**;", "services/GW") + # Leading separator and whitespace → empty leading sub-glob skipped + assert not _scope_overlaps(" ; services/GW/**", "services/AS") + + def test_only_separators_rejects(self) -> None: + """A scope value of just `;` or `,` (or whitespace) splits into all + empty sub-globs and overlaps with nothing — file is rejected. + + Regression guard: a future change that treated empty-after-split as + "always-on" would silently broaden the filter. + """ + from conductor.config.instructions import _scope_overlaps + + assert not _scope_overlaps(";", "anywhere") + assert not _scope_overlaps(",", "anywhere") + assert not _scope_overlaps(" ; , ", "anywhere") + + def test_brace_expansion_unsupported(self) -> None: + """Brace-expansion globs (e.g. `src/{foo,bar}/**`) are NOT supported: + the comma is treated as a multi-glob separator, splitting the brace + and producing nonsensical sub-globs. + + This documents the known limitation. Real-world ``applyTo`` values + observed across Azure repos use `;`/`,`-separated whole-string + multi-globs rather than brace expansion, so we accept this gap. + If brace expansion becomes a real need, swap the separator regex for + a brace-aware splitter. + """ + from conductor.config.instructions import _scope_overlaps + + # `src/{foo,bar}/**` splits on `,` into [`src/{foo`, `bar}/**`]. + # Neither sub-glob matches the intent. The test asserts the + # *current* (broken-but-documented) behavior so a future maintainer + # who attempts brace expansion knows to update this test. + # The first sub-glob `src/{foo` has literal prefix `src/{foo` + # (brace is not a recognized wildcard char in our regex `[*?\[]`). + # That prefix won't match `src/foo`, so this returns False — + # documenting that we DON'T magically handle braces. + assert not _scope_overlaps("src/{foo,bar}/**", "src/foo") + + def test_windows_backslash_normalised(self) -> None: + """Authors on Windows may write backslashes; normalise to forward slashes.""" + from conductor.config.instructions import _scope_overlaps + + assert _scope_overlaps("services\\GW\\**", "services/GW") + assert _scope_overlaps("services/GW/**", "services\\GW") + + def test_normalize_strips_dotslash_and_dot(self) -> None: + """`_normalize_scope_path` strips `./` prefixes and treats `.` as the + empty (workspace-root) path. Authors sometimes write `./foo/**` to be + explicit about repo-relative paths; that must behave identically to + `foo/**`. Closes the normaliser's branches not exercised by the + realistic Chaos-repo applyTo values.""" + from conductor.config.instructions import _normalize_scope_path, _scope_overlaps + + # Direct helper coverage + assert _normalize_scope_path("./foo/bar") == "foo/bar" + assert _normalize_scope_path("././foo") == "foo" + assert _normalize_scope_path(".") == "" + + # Through the overlap helper (CWD or glob written with `./` or `.`) + assert _scope_overlaps("./services/GW/**", "services/GW") + assert _scope_overlaps("services/GW/**", ".") # CWD == workspace root + + +class TestApplyConventionFiltersInvariants: + """Direct tests for the filter-chain helper to pin the `_REJECT` sentinel + contract and exercise the `_discovery_reason` branch that's only reached + when a ``ConventionDirectory`` is registered without ``extract_scope``.""" + + def test_extract_scope_unset_loads_unconditionally(self, tmp_path: Path) -> None: + """A ConventionDirectory with no `extract_scope` loads every matching + file as 'always-on' regardless of any frontmatter — exercises the + `_discovery_reason` branch where `convention.extract_scope is None`.""" + from unittest.mock import patch + + from conductor.config.instructions import ( + ConventionDirectory, + DiscoveredInstruction, + discover_workspace_instructions_detailed, + ) + + (tmp_path / ".git").mkdir() + scoped_file = tmp_path / "rules" / "anything.md" + scoped_file.parent.mkdir(parents=True) + # Frontmatter with a scoped applyTo would normally narrow, but this + # convention has no extract_scope so it loads unconditionally. + scoped_file.write_text("---\napplyTo: 'services/GW/**'\n---\nbody", encoding="utf-8") + + custom = ConventionDirectory( + path="rules", + pattern="*.md", + recursive=False, + ) + + with patch("conductor.config.instructions.CONVENTIONS", [custom]): + result = discover_workspace_instructions_detailed(tmp_path) + + assert len(result) == 1 + d = result[0] + assert isinstance(d, DiscoveredInstruction) + assert d.path == scoped_file + assert d.scope is None + # The 'always-on' reason here is the convention-without-extract_scope + # branch in _discovery_reason — distinct from the + # extract_scope-returning-'**' branch covered elsewhere. + assert d.reason == "always-on" + + def test_extract_scope_opt_out_rejects(self, tmp_path: Path) -> None: + """When `extract_scope` returns None for a file (its opt-out + sentinel), the file is dropped regardless of CWD.""" + from unittest.mock import patch + + from conductor.config.instructions import ( + ConventionDirectory, + discover_workspace_instructions_detailed, + ) + + (tmp_path / ".git").mkdir() + f = tmp_path / "rules" / "opted-out.md" + f.parent.mkdir(parents=True) + f.write_text("body", encoding="utf-8") + + # extract_scope always returns None → file always opts out + custom = ConventionDirectory( + path="rules", + pattern="*.md", + recursive=False, + extract_scope=lambda _p: None, + ) + + with patch("conductor.config.instructions.CONVENTIONS", [custom]): + result = discover_workspace_instructions_detailed(tmp_path) + + assert result == [] + + def test_relative_to_value_error_falls_back_to_empty_cwd( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """When `start_resolved.relative_to(current)` raises ValueError (e.g., + a symlink makes the walked `current` not an ancestor of `start_dir`), + the per-level `cwd_rel` falls back to '' so the overlap test stays + permissive — matching the over-include-rather-than-silently-skip + bias. Exercises the defensive ValueError branch.""" + from unittest.mock import patch + + from conductor.config.instructions import ( + ConventionDirectory, + discover_workspace_instructions_detailed, + ) + + (tmp_path / ".git").mkdir() + f = tmp_path / "rules" / "scoped.md" + f.parent.mkdir(parents=True) + # Scoped to services/GW — would NOT overlap from a real subdir, but + # the ValueError fallback forces cwd_rel='' so it loads anyway. + f.write_text("---\napplyTo: 'services/GW/**'\n---\nbody", encoding="utf-8") + + custom = ConventionDirectory( + path="rules", + pattern="*.md", + recursive=False, + extract_scope=lambda p: "services/GW/**", + ) + + # Force `Path.relative_to` to raise ValueError on the first call, + # simulating the symlink-induced ancestor mismatch. Subsequent calls + # behave normally so the walk's other logic isn't disrupted. + original_relative_to = Path.relative_to + call_count = {"n": 0} + + def flaky_relative_to(self: Path, *args: Any, **kwargs: Any) -> Path: + call_count["n"] += 1 + if call_count["n"] == 1: + raise ValueError("simulated symlink ancestor mismatch") + return original_relative_to(self, *args, **kwargs) + + with ( + patch("conductor.config.instructions.CONVENTIONS", [custom]), + monkeypatch.context() as m, + ): + m.setattr(Path, "relative_to", flaky_relative_to) + result = discover_workspace_instructions_detailed(tmp_path) + + # File loads — the fallback `cwd_rel=''` matches any scope, so the + # services/GW/** glob overlaps (over-inclusion = principled fallback). + assert len(result) == 1 + assert result[0].path == f + + +class TestEmitLoadedInstructionsDebug: + """Tests for `_emit_loaded_instructions_debug`, the small helper extracted + from `run_workflow_async` for direct testability. Pins the + short-circuit-on-disabled and short-circuit-on-no-start-dir branches and + proves the discover→print path uses the caller-supplied start_dir.""" + + def test_disabled_is_noop(self, tmp_path: Path, capsys: pytest.CaptureFixture[str]) -> None: + from conductor.cli.run import _emit_loaded_instructions_debug + + (tmp_path / "AGENTS.md").write_text("body", encoding="utf-8") + _emit_loaded_instructions_debug(tmp_path, enabled=False) + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + def test_no_start_dir_is_noop(self, capsys: pytest.CaptureFixture[str]) -> None: + """When auto-discovery didn't run (`start_dir is None`), the debug + print is suppressed even if the flag is on — matches the docstring + contract.""" + from conductor.cli.run import _emit_loaded_instructions_debug + + _emit_loaded_instructions_debug(None, enabled=True) + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + def test_enabled_with_start_dir_prints_discovery( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + """Happy path: discover→print, using the caller-supplied start_dir.""" + from conductor.cli.run import _emit_loaded_instructions_debug + + (tmp_path / ".git").mkdir() + (tmp_path / "AGENTS.md").write_text("body", encoding="utf-8") + + _emit_loaded_instructions_debug(tmp_path, enabled=True) + captured = capsys.readouterr() + assert "1 file(s) discovered from CWD" in captured.err + assert "AGENTS.md" in captured.err + # stdout invariant preserved + assert captured.out == "" + + +class TestPrintLoadedInstructionsHelper: + """Direct tests for the `_print_loaded_instructions` helper. + + The helper is called only from `run_workflow_async`, which is hard to + exercise without a full workflow run. Direct tests cover the formatting + branches (empty list, populated list, scope/no-scope, always-on sentinel + suppression) so future format tweaks are caught. + """ + + def test_emits_empty_state(self, capsys: pytest.CaptureFixture[str]) -> None: + from conductor.cli.run import _print_loaded_instructions + + _print_loaded_instructions([]) + captured = capsys.readouterr() + # stdout MUST stay clean (don't pollute JSON output) + assert captured.out == "" + assert "0 files discovered" in captured.err + + def test_emits_populated_state_with_scope( + self, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + from conductor.cli.run import _print_loaded_instructions + from conductor.config.instructions import ( + DiscoveredInstruction, + DiscoveryReason, + ) + + always_on = DiscoveredInstruction( + path=tmp_path / "AGENTS.md", + source="AGENTS.md", + scope=None, + reason=DiscoveryReason.FILE_CONVENTION, + ) + scoped = DiscoveredInstruction( + path=tmp_path / ".github" / "instructions" / "cs.md", + source=".github/instructions", + scope="**/*.cs", + reason=DiscoveryReason.SCOPE_OVERLAP, + ) + always_on_dir = DiscoveredInstruction( + path=tmp_path / ".github" / "instructions" / "all.md", + source=".github/instructions", + scope="**", + reason=DiscoveryReason.ALWAYS_ON, + ) + + _print_loaded_instructions([always_on, scoped, always_on_dir]) + captured = capsys.readouterr() + + # Header tracks count + assert "3 file(s) discovered from CWD" in captured.err + # All three paths emitted + assert "AGENTS.md" in captured.err + assert "cs.md" in captured.err + assert "all.md" in captured.err + # Reasons emitted with their source labels + assert "reason=file-convention" in captured.err + assert "reason=scope-overlap" in captured.err + assert "reason=always-on" in captured.err + # Scope shown only for the truly scoped entry (not for file-convention + # or for always-on `**` — `**` is the always-on sentinel; printing it + # would be redundant noise). + assert "applyTo='**/*.cs'" in captured.err + assert "applyTo='**'" not in captured.err + # stdout untouched + assert captured.out == "" + + +# --------------------------------------------------------------------------- +# discover_workspace_instructions_detailed (metadata-bearing variant) +# --------------------------------------------------------------------------- + + +class TestDiscoverDetailed: + """Tests for the structured-discovery variant used by + --print-loaded-instructions and any future caller that needs to reason + about *why* a file was included.""" + + def test_returns_discovered_instruction_records(self, tmp_path: Path) -> None: + from conductor.config.instructions import ( + DiscoveredInstruction, + discover_workspace_instructions_detailed, + ) + + (tmp_path / ".git").mkdir() + (tmp_path / "AGENTS.md").write_text("agents") + _write_with_frontmatter( + tmp_path / ".github" / "instructions" / "always.instructions.md", + apply_to="**", + ) + _write_with_frontmatter( + tmp_path / ".github" / "instructions" / "cs.instructions.md", + apply_to="**/*.cs", + ) + + result = discover_workspace_instructions_detailed(tmp_path) + assert all(isinstance(d, DiscoveredInstruction) for d in result) + by_name = {d.path.name: d for d in result} + + # AGENTS.md is a file convention — no scope concept. + assert by_name["AGENTS.md"].reason == "file-convention" + assert by_name["AGENTS.md"].scope is None + + # always.instructions.md is always-on per applyTo: "**" + assert by_name["always.instructions.md"].reason == "always-on" + assert by_name["always.instructions.md"].scope == "**" + + # cs.instructions.md is scoped; at root CWD it loads via overlap + assert by_name["cs.instructions.md"].reason == "scope-overlap" + assert by_name["cs.instructions.md"].scope == "**/*.cs" + + def test_paths_wrapper_matches_detailed(self, tmp_path: Path) -> None: + """`discover_workspace_instructions` is a thin wrapper over the + detailed variant; the paths must match exactly.""" + from conductor.config.instructions import ( + discover_workspace_instructions, + discover_workspace_instructions_detailed, + ) + + (tmp_path / ".git").mkdir() + (tmp_path / "AGENTS.md").write_text("agents") + _write_with_frontmatter( + tmp_path / ".github" / "instructions" / "always.instructions.md", + apply_to="**", + ) + + paths = discover_workspace_instructions(tmp_path) + detailed = discover_workspace_instructions_detailed(tmp_path) + assert paths == [d.path for d in detailed] + # --------------------------------------------------------------------------- -# Frontmatter robustness: edge cases for `_is_always_on_instructions_file` +# Frontmatter robustness: edge cases for `_extract_apply_to` # --------------------------------------------------------------------------- @@ -911,6 +1601,51 @@ def test_non_dict_yaml_skipped(self, tmp_path: Path) -> None: assert "empty.instructions.md" not in names assert "scalar.instructions.md" not in names + def test_applyto_list_warns_and_skips( + self, tmp_path: Path, caplog: pytest.LogCaptureFixture + ) -> None: + """A list-valued ``applyTo`` is an authoring slip, not an opt-out, so it + is skipped *loudly* (the present-but-unusable path warns).""" + (tmp_path / ".git").mkdir() + f = tmp_path / ".github" / "instructions" / "listapply.instructions.md" + f.parent.mkdir(parents=True) + f.write_text( + "---\napplyTo:\n - '**/*.cs'\n - '**/*.ts'\n---\nBody.\n", + encoding="utf-8", + ) + with caplog.at_level(logging.WARNING): + result = discover_workspace_instructions(tmp_path) + assert not any(p.name == "listapply.instructions.md" for p in result) + assert "not a string" in caplog.text + + def test_applyto_empty_warns_and_skips( + self, tmp_path: Path, caplog: pytest.LogCaptureFixture + ) -> None: + """An empty / separator-only ``applyTo`` has no usable glob, so it is + skipped with a warning rather than silently dropped.""" + (tmp_path / ".git").mkdir() + f = tmp_path / ".github" / "instructions" / "emptyapply.instructions.md" + f.parent.mkdir(parents=True) + f.write_text("---\napplyTo: ' ; '\n---\nBody.\n", encoding="utf-8") + with caplog.at_level(logging.WARNING): + result = discover_workspace_instructions(tmp_path) + assert not any(p.name == "emptyapply.instructions.md" for p in result) + assert "no usable glob" in caplog.text + + def test_applyto_absent_skips_silently( + self, tmp_path: Path, caplog: pytest.LogCaptureFixture + ) -> None: + """A genuinely absent ``applyTo`` opts out per the GitHub spec and stays + quiet — this is the other half of the absent-vs-unusable split.""" + (tmp_path / ".git").mkdir() + f = tmp_path / ".github" / "instructions" / "noapply.instructions.md" + f.parent.mkdir(parents=True) + f.write_text("---\ndescription: no applyTo here\n---\nBody.\n", encoding="utf-8") + with caplog.at_level(logging.WARNING): + result = discover_workspace_instructions(tmp_path) + assert not any(p.name == "noapply.instructions.md" for p in result) + assert "noapply.instructions.md" not in caplog.text + def test_closing_delimiter_at_eof(self, tmp_path: Path) -> None: """Frontmatter with closing `---` at EOF (no trailing newline) parses correctly. @@ -1031,7 +1766,7 @@ def test_non_recursive_skips_subdirectory_files(self, tmp_path: Path) -> None: (sub / "csharp.md").write_text("nested rule", encoding="utf-8") conv = ConventionDirectory(path="rules", pattern="*.md", recursive=False) - results = dict(_walk_directory_convention(base, conv)) + results = {rel: path for rel, path, _scope in _walk_directory_convention(base, conv)} assert "top.md" in results assert "csharp.md" not in results @@ -1051,7 +1786,7 @@ def test_non_recursive_applies_pattern(self, tmp_path: Path) -> None: (base / "skip.txt").write_text("txt", encoding="utf-8") conv = ConventionDirectory(path="rules", pattern="*.md", recursive=False) - results = dict(_walk_directory_convention(base, conv)) + results = {rel: path for rel, path, _scope in _walk_directory_convention(base, conv)} assert "match.md" in results assert "skip.txt" not in results @@ -1074,7 +1809,7 @@ def test_non_recursive_applies_include_file_predicate(self, tmp_path: Path) -> N recursive=False, include_file=lambda p: "INCLUDE" in p.read_text(encoding="utf-8"), ) - results = dict(_walk_directory_convention(base, conv)) + results = {rel: path for rel, path, _scope in _walk_directory_convention(base, conv)} assert "include.md" in results assert "skip.md" not in results @@ -1119,7 +1854,7 @@ def patched_is_file(self, *, follow_symlinks=True): return original_is_file(self, follow_symlinks=follow_symlinks) with patch.object(os.DirEntry, "is_file", patched_is_file): - results = dict(_walk_directory_convention(base, conv)) + results = {rel: path for rel, path, _scope in _walk_directory_convention(base, conv)} assert "good.md" in results assert "broken.md" not in results