OmkarRayAI · OmkarRayAI · Jun 4, 2026 · Jun 4, 2026
diff --git a/.github/workflows/ci-real-api.yml b/.github/workflows/ci-real-api.yml
@@ -0,0 +1,72 @@
+name: ci-real-api
+
+# Real-API verification for the OpenAI / Anthropic adapters.
+# Runs only when the corresponding repo secret is configured AND the
+# workflow is dispatched manually (or on the weekly schedule). Pull
+# requests cannot reach this — the secrets aren't exposed to forks.
+#
+# Cost per run is a fraction of a cent. Set OPENAI_API_KEY and
+# ANTHROPIC_API_KEY in repo Settings → Secrets and Variables → Actions
+# to enable.
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Tuesdays 14:00 UTC. Catches upstream API drift roughly weekly
+    # without burning the budget on every push.
+    - cron: "0 14 * * 2"
+
+# Cancel in-progress runs on the same ref so a flaky upstream doesn't
+# queue retries.
+concurrency:
+  group: ci-real-api-${{ github.ref }}
+  cancel-in-progress: true
+
+# Least privilege: the jobs only check out the repository, so the
+# GITHUB_TOKEN needs nothing beyond read access to contents.
+permissions:
+  contents: read
+
+jobs:
+  openai:
+    name: openai (real api)
+    runs-on: ubuntu-latest
+    # Fail fast if the upstream API hangs instead of waiting out the
+    # 360-minute default job timeout.
+    timeout-minutes: 10
+    # Opt-out switch: runs unless the repo variable is explicitly 'false'.
+    if: ${{ vars.WIKITRACE_OPENAI_TESTS_ENABLED != 'false' }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+      - name: Install
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[cloud,dev]' openai
+      - name: Test
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          # An unset secret expands to an empty string; treat that as a
+          # successful skip rather than a failure.
+          if [ -z "$OPENAI_API_KEY" ]; then
+            echo "OPENAI_API_KEY secret not set — skipping (this is fine for forks)."
+            exit 0
+          fi
+          pytest -q tests/integration/test_openai_real.py
+
+  anthropic:
+    name: anthropic (real api)
+    runs-on: ubuntu-latest
+    # Fail fast if the upstream API hangs instead of waiting out the
+    # 360-minute default job timeout.
+    timeout-minutes: 10
+    # Opt-out switch: runs unless the repo variable is explicitly 'false'.
+    if: ${{ vars.WIKITRACE_ANTHROPIC_TESTS_ENABLED != 'false' }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+      - name: Install
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[cloud,dev]' anthropic
+      - name: Test
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: |
+          # An unset secret expands to an empty string; treat that as a
+          # successful skip rather than a failure.
+          if [ -z "$ANTHROPIC_API_KEY" ]; then
+            echo "ANTHROPIC_API_KEY secret not set — skipping (this is fine for forks)."
+            exit 0
+          fi
+          pytest -q tests/integration/test_anthropic_real.py
diff --git a/README.md b/README.md
@@ -538,6 +538,30 @@ SQL works as-is.
 
 ---
 
+## Tests
+
+```bash
+# Default suite (free): SDK + cloud + ingest server tests
+pip install -e '.[cloud,dev]'
+pytest -q tests/
+
+# Postgres path (asyncpg + JSONB round-trip): requires a running
+# Postgres and DATABASE_URL set
+DATABASE_URL=postgresql://localhost/wikitrace pytest -q tests/
+
+# Real-API verification (costs pennies; verifies the OpenAI / Anthropic
+# patches against live endpoints): requires a key
+OPENAI_API_KEY=sk-...    pytest -q tests/integration/test_openai_real.py
+ANTHROPIC_API_KEY=sk-... pytest -q tests/integration/test_anthropic_real.py
+```
+
+Integration tests skip cleanly when the corresponding key is unset, so
+the default `pytest tests/` invocation never hits an external API.
+CI on every push runs the free + Postgres paths; the real-API workflow
+runs weekly (and on manual dispatch) when repo secrets are configured.
+
+---
+
 ## Pricing
 
 | | |

diff --git a/pyproject.toml b/pyproject.toml
@@ -82,6 +82,7 @@ asyncio_mode = "auto"
 testpaths = ["tests"]
 markers = [
     "postgres: requires DATABASE_URL pointing at a Postgres instance",
+    "integration: hits a real external API; requires the corresponding key in the env",
 ]
 filterwarnings = [
     # FastAPI's deprecated on_event used in the cloud server; suppress

diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -0,0 +1,37 @@
+"""Shared helpers for real-API integration tests.
+
+These tests cost money. We use the cheapest model per provider, the
+shortest possible prompt, and the smallest output. A full pass should
+be a fraction of a cent.
+
+Each test skips when the corresponding key isn't in the env, so the
+default `pytest tests/` run on a contributor laptop and on CI without
+secrets remains free.
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+
+def _require_key(name: str) -> str:
+    key = os.environ.get(name)
+    if not key:
+        pytest.skip(
+            f"{name} not set. Real-API tests run only when the env var "
+            "is provided. Set it locally or as a CI secret to verify "
+            "this adapter end-to-end.",
+        )
+    return key
+
+
+@pytest.fixture
+def openai_key() -> str:
+    """Provide ``OPENAI_API_KEY`` from the environment.
+
+    The requesting test is skipped when the variable is unset or empty.
+    """
+    return _require_key("OPENAI_API_KEY")
+
+
+@pytest.fixture
+def anthropic_key() -> str:
+    """Provide ``ANTHROPIC_API_KEY`` from the environment.
+
+    The requesting test is skipped when the variable is unset or empty.
+    """
+    return _require_key("ANTHROPIC_API_KEY")
diff --git a/tests/integration/test_anthropic_real.py b/tests/integration/test_anthropic_real.py
@@ -0,0 +1,115 @@
+"""Real-API verification for wikitrace.anthropic.patch().
+
+Hits the Anthropic Messages endpoint with the cheapest available model
+(claude-haiku) and asserts span shape. Skipped when ANTHROPIC_API_KEY
+is unset.
+
+Cost per pass: ~$0.0001 (claude-haiku-4-5 or 3-5-haiku, ~15 input
+tokens, max_tokens=10).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from pathlib import Path
+
+import pytest
+
+import wikitrace as wt
+
+
+pytestmark = pytest.mark.integration
+
+
+def _llm_call_span(trace_dir: Path) -> dict:
+    p = trace_dir / "spans.jsonl"
+    spans = [json.loads(l) for l in p.read_text().splitlines() if l.strip()]
+    spans = [s for s in spans if s["name"] == "llm_call"]
+    assert spans, "no llm_call span recorded"
+    return spans[0]
+
+
+def _resolve_haiku_model() -> str:
+    """Return the Haiku model id used by every test in this module.
+
+    The id is hard-coded to ``claude-haiku-4-5``; no fallback to an
+    older model (e.g. 3-5-haiku) is implemented here.
+    """
+    return "claude-haiku-4-5"
+
+
+def test_anthropic_sync_non_streaming(anthropic_key, trace_dir: Path):
+    pytest.importorskip("anthropic")
+    import anthropic
+    import wikitrace.anthropic
+
+    wikitrace.anthropic.patch()
+    client = anthropic.Anthropic(api_key=anthropic_key)
+
+    wt.init(pipeline="real-anthropic-sync", trace_dir=trace_dir)
+    msg = client.messages.create(
+        model=_resolve_haiku_model(),
+        max_tokens=10,
+        messages=[{"role": "user", "content": "ping"}],
+    )
+    wt.end()
+
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["provider"] == "anthropic"
+    assert "claude" in a["model"]
+    assert a["input_tokens"] is not None and a["input_tokens"] > 0
+    assert a["output_tokens"] is not None and a["output_tokens"] >= 0
+    assert a["cost_usd"] is not None and a["cost_usd"] >= 0
+    assert a["latency_ms"] is not None and a["latency_ms"] > 0
+    assert a["retry_count"] == 0
+
+
+def test_anthropic_sync_streaming(anthropic_key, trace_dir: Path):
+    pytest.importorskip("anthropic")
+    import anthropic
+    import wikitrace.anthropic
+
+    wikitrace.anthropic.patch()
+    client = anthropic.Anthropic(api_key=anthropic_key)
+
+    wt.init(pipeline="real-anthropic-stream", trace_dir=trace_dir)
+    with client.messages.stream(
+        model=_resolve_haiku_model(),
+        max_tokens=10,
+        messages=[{"role": "user", "content": "say hi"}],
+    ) as stream:
+        # Consume the stream so the wrapper's __next__ loop closes the span.
+        events = list(stream)
+
+    wt.end()
+
+    assert len(events) > 0
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["stream"] is True
+    token_events = [e for e in s["events"] if e["type"] == "token"]
+    assert len(token_events) >= 1
+
+
+def test_anthropic_async_non_streaming(anthropic_key, trace_dir: Path):
+    pytest.importorskip("anthropic")
+    import anthropic
+    import wikitrace.anthropic
+
+    wikitrace.anthropic.patch()
+
+    async def run():
+        client = anthropic.AsyncAnthropic(api_key=anthropic_key)
+        wt.init(pipeline="real-anthropic-async", trace_dir=trace_dir)
+        await client.messages.create(
+            model=_resolve_haiku_model(),
+            max_tokens=10,
+            messages=[{"role": "user", "content": "ping"}],
+        )
+        wt.end()
+
+    asyncio.run(run())
+
+    s = _llm_call_span(trace_dir)
+    assert s["attrs"]["provider"] == "anthropic"
+    assert s["attrs"]["input_tokens"] > 0
diff --git a/tests/integration/test_openai_real.py b/tests/integration/test_openai_real.py
@@ -0,0 +1,120 @@
+"""Real-API verification for wikitrace.openai.patch().
+
+Hits the OpenAI Chat Completions endpoint with the cheapest available
+model and asserts the wikitrace span captures model, prompt_chars,
+answer_chars, input_tokens, output_tokens, cost_usd, latency_ms, and
+retry_count. Skipped when OPENAI_API_KEY is unset.
+
+Cost per pass: ~$0.0001 (gpt-4o-mini, ~30 input tokens, max_tokens=10).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from pathlib import Path
+
+import pytest
+
+import wikitrace as wt
+
+
+pytestmark = pytest.mark.integration
+
+
+def _spans(trace_dir: Path) -> list[dict]:
+    p = trace_dir / "spans.jsonl"
+    return [json.loads(l) for l in p.read_text().splitlines()] if p.exists() else []
+
+
+def _llm_call_span(trace_dir: Path) -> dict:
+    spans = [s for s in _spans(trace_dir) if s["name"] == "llm_call"]
+    assert spans, "no llm_call span recorded"
+    return spans[0]
+
+
+def test_openai_sync_non_streaming(openai_key, trace_dir: Path):
+    pytest.importorskip("openai")
+    import openai
+    import wikitrace.openai
+
+    wikitrace.openai.patch()
+    client = openai.OpenAI(api_key=openai_key)
+
+    wt.init(pipeline="real-openai-sync", trace_dir=trace_dir)
+    resp = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "ping"}],
+        max_tokens=10,
+    )
+    wt.end()
+
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["provider"] == "openai"
+    assert a["model"].startswith("gpt-4o-mini")
+    assert a["prompt_chars"] > 0
+    assert a["answer_chars"] >= 0  # short max_tokens — could be 0 in edge cases
+    assert a["input_tokens"] is not None and a["input_tokens"] > 0
+    assert a["output_tokens"] is not None and a["output_tokens"] >= 0
+    assert a["cost_usd"] is not None and a["cost_usd"] >= 0
+    assert a["latency_ms"] is not None and a["latency_ms"] > 0
+    assert a["retry_count"] == 0
+
+
+def test_openai_sync_streaming(openai_key, trace_dir: Path):
+    pytest.importorskip("openai")
+    import openai
+    import wikitrace.openai
+
+    wikitrace.openai.patch()
+    client = openai.OpenAI(api_key=openai_key)
+
+    wt.init(pipeline="real-openai-stream", trace_dir=trace_dir)
+    stream = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "say hi"}],
+        max_tokens=10,
+        stream=True,
+        stream_options={"include_usage": True},  # only way to get usage on streams
+    )
+    chunks = list(stream)  # exhaust to trigger span_close in our wrapper
+    wt.end()
+
+    assert len(chunks) > 0
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["stream"] is True
+    assert a["provider"] == "openai"
+    # Token events should be on the span — at least one content delta arrived.
+    token_events = [e for e in s["events"] if e["type"] == "token"]
+    assert len(token_events) >= 1
+    # When stream_options.include_usage was honored, cost is computed.
+    if a["input_tokens"] is not None:
+        assert a["cost_usd"] is not None and a["cost_usd"] > 0
+
+
+def test_openai_async_non_streaming(openai_key, trace_dir: Path):
+    pytest.importorskip("openai")
+    import openai
+    import wikitrace.openai
+
+    wikitrace.openai.patch()
+
+    async def run():
+        client = openai.AsyncOpenAI(api_key=openai_key)
+        wt.init(pipeline="real-openai-async", trace_dir=trace_dir)
+        resp = await client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": "ping"}],
+            max_tokens=10,
+        )
+        wt.end()
+
+    asyncio.run(run())
+
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["provider"] == "openai"
+    assert a["input_tokens"] > 0
+    assert a["latency_ms"] > 0