diff --git a/.github/workflows/ci-real-api.yml b/.github/workflows/ci-real-api.yml
new file mode 100644
index 0000000..639f76f
--- /dev/null
+++ b/.github/workflows/ci-real-api.yml
@@ -0,0 +1,72 @@
+name: ci-real-api
+
+# Real-API verification for the OpenAI / Anthropic adapters.
+# Triggered only by manual dispatch or the weekly schedule; there is no
+# push / pull_request trigger, so pull requests never start this workflow.
+# Each job's Test step exits 0 without running pytest when its key is unset.
+#
+# Cost per run is a fraction of a cent. Set OPENAI_API_KEY and
+# ANTHROPIC_API_KEY in repo Settings → Secrets and Variables → Actions
+# to enable.
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Tuesdays 14:00 UTC. Catches upstream API drift roughly weekly
+    # without burning the budget on every push.
+    - cron: "0 14 * * 2"
+
+# Cancel in-progress runs on the same ref so a flaky upstream doesn't
+# queue retries.
+concurrency:
+  group: ci-real-api-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  openai:
+    name: openai (real api)
+    runs-on: ubuntu-latest
+    if: ${{ vars.WIKITRACE_OPENAI_TESTS_ENABLED != 'false' }}  # job runs unless this repo variable is 'false'
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+      - name: Install
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[cloud,dev]' openai
+      - name: Test
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          if [ -z "$OPENAI_API_KEY" ]; then
+            echo "OPENAI_API_KEY secret not set — skipping (this is fine for forks)."
+            exit 0
+          fi
+          pytest -q tests/integration/test_openai_real.py
+
+  anthropic:
+    name: anthropic (real api)
+    runs-on: ubuntu-latest
+    if: ${{ vars.WIKITRACE_ANTHROPIC_TESTS_ENABLED != 'false' }}  # job runs unless this repo variable is 'false'
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+      - name: Install
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[cloud,dev]' anthropic
+      - name: Test
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        run: |
+          if [ -z "$ANTHROPIC_API_KEY" ]; then
+            echo "ANTHROPIC_API_KEY secret not set — skipping (this is fine for forks)."
+            exit 0
+          fi
+          pytest -q tests/integration/test_anthropic_real.py
diff --git a/README.md b/README.md
index c47eafc..8e6fc12 100644
--- a/README.md
+++ b/README.md
@@ -538,6 +538,30 @@ SQL works as-is.
 
 ---
 
+## Tests
+
+```bash
+# Default suite (free): SDK + cloud + ingest server tests
+pip install -e '.[cloud,dev]'
+pytest -q tests/
+
+# Postgres path (asyncpg + JSONB round-trip): requires a running
+# Postgres and DATABASE_URL set
+DATABASE_URL=postgresql://localhost/wikitrace pytest -q tests/
+
+# Real-API verification (costs pennies; verifies the OpenAI / Anthropic
+# patches against live endpoints): requires a key
+OPENAI_API_KEY=sk-... pytest -q tests/integration/test_openai_real.py
+ANTHROPIC_API_KEY=sk-... pytest -q tests/integration/test_anthropic_real.py
+```
+
+Integration tests skip cleanly when the corresponding key is unset, so
+the default `pytest tests/` invocation never hits an external API.
+CI on every push runs the free + Postgres paths; the real-API workflow
+runs weekly (and on manual dispatch) when repo secrets are configured.
+
+---
+
 ## Pricing
 
 | | |
diff --git a/pyproject.toml b/pyproject.toml
index 2bb1235..09e7391 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,6 +82,7 @@ asyncio_mode = "auto"
 testpaths = ["tests"]
 markers = [
     "postgres: requires DATABASE_URL pointing at a Postgres instance",
+    "integration: hits a real external API; requires the corresponding key in the env",
 ]
 filterwarnings = [
     # FastAPI's deprecated on_event used in the cloud server; suppress
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 0000000..65f6fa4
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,37 @@
+"""Shared helpers for real-API integration tests.
+
+These tests cost money. We use the cheapest model per provider, the
+shortest possible prompt, and the smallest output. A full pass should
+be a fraction of a cent.
+
+Each test skips when the corresponding key isn't in the env, so the
+default `pytest tests/` run on a contributor laptop and on CI without
+secrets remains free.
+"""
+
+from __future__ import annotations
+
+import os
+
+import pytest
+
+
+def _require_key(name: str) -> str:  # returns the env value of `name`, or skips the requesting test
+    key = os.environ.get(name)
+    if not key:  # an unset variable and an empty string are both treated as "no key"
+        pytest.skip(
+            f"{name} not set. Real-API tests run only when the env var "
+            "is provided. Set it locally or as a CI secret to verify "
+            "this adapter end-to-end.",
+        )
+    return key
+
+
+@pytest.fixture
+def openai_key() -> str:  # tests requesting this fixture are skipped when OPENAI_API_KEY is missing/empty
+    return _require_key("OPENAI_API_KEY")
+
+
+@pytest.fixture
+def anthropic_key() -> str:  # tests requesting this fixture are skipped when ANTHROPIC_API_KEY is missing/empty
+    return _require_key("ANTHROPIC_API_KEY")
diff --git a/tests/integration/test_anthropic_real.py b/tests/integration/test_anthropic_real.py
new file mode 100644
index 0000000..35e46b2
--- /dev/null
+++ b/tests/integration/test_anthropic_real.py
@@ -0,0 +1,115 @@
+"""Real-API verification for wikitrace.anthropic.patch().
+
+Hits the Anthropic Messages endpoint with the cheapest available model
+(claude-haiku) and asserts span shape. Skipped when ANTHROPIC_API_KEY
+is unset.
+
+Cost per pass: ~$0.0001 (claude-haiku-4-5, ~15 input
+tokens, max_tokens=10).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from pathlib import Path
+
+import pytest
+
+import wikitrace as wt
+
+
+pytestmark = pytest.mark.integration
+
+
+def _llm_call_span(trace_dir: Path) -> dict:  # first span named "llm_call" in <trace_dir>/spans.jsonl
+    p = trace_dir / "spans.jsonl"  # NOTE(review): read without an existence check; a missing file raises
+    spans = [json.loads(l) for l in p.read_text().splitlines() if l.strip()]
+    spans = [s for s in spans if s["name"] == "llm_call"]
+    assert spans, "no llm_call span recorded"
+    return spans[0]
+
+
+def _resolve_haiku_model() -> str:
+    """Return the Anthropic model id used by every test in this module.
+    Currently hard-coded to claude-haiku-4-5; no account probing and no
+    fallback to another haiku model is implemented here."""
+    return "claude-haiku-4-5"
+
+
+def test_anthropic_sync_non_streaming(anthropic_key, trace_dir: Path):  # trace_dir: fixture defined outside this package — TODO confirm where
+    pytest.importorskip("anthropic")
+    import anthropic
+    import wikitrace.anthropic
+
+    wikitrace.anthropic.patch()
+    client = anthropic.Anthropic(api_key=anthropic_key)
+
+    wt.init(pipeline="real-anthropic-sync", trace_dir=trace_dir)
+    msg = client.messages.create(  # NOTE(review): msg is never read; assertions use the recorded span only
+        model=_resolve_haiku_model(),
+        max_tokens=10,
+        messages=[{"role": "user", "content": "ping"}],
+    )
+    wt.end()
+
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["provider"] == "anthropic"
+    assert "claude" in a["model"]
+    assert a["input_tokens"] is not None and a["input_tokens"] > 0
+    assert a["output_tokens"] is not None and a["output_tokens"] >= 0
+    assert a["cost_usd"] is not None and a["cost_usd"] >= 0
+    assert a["latency_ms"] is not None and a["latency_ms"] > 0
+    assert a["retry_count"] == 0
+
+
+def test_anthropic_sync_streaming(anthropic_key, trace_dir: Path):
+    pytest.importorskip("anthropic")
+    import anthropic
+    import wikitrace.anthropic
+
+    wikitrace.anthropic.patch()
+    client = anthropic.Anthropic(api_key=anthropic_key)
+
+    wt.init(pipeline="real-anthropic-stream", trace_dir=trace_dir)
+    with client.messages.stream(
+        model=_resolve_haiku_model(),
+        max_tokens=10,
+        messages=[{"role": "user", "content": "say hi"}],
+    ) as stream:
+        # Consume the stream so the wrapper's __next__ loop closes the span.
+        events = list(stream)
+
+    wt.end()
+
+    assert len(events) > 0
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["stream"] is True
+    token_events = [e for e in s["events"] if e["type"] == "token"]
+    assert len(token_events) >= 1
+
+
+def test_anthropic_async_non_streaming(anthropic_key, trace_dir: Path):
+    pytest.importorskip("anthropic")
+    import anthropic
+    import wikitrace.anthropic
+
+    wikitrace.anthropic.patch()
+
+    async def run():  # init, call and end all happen inside the one coroutine
+        client = anthropic.AsyncAnthropic(api_key=anthropic_key)
+        wt.init(pipeline="real-anthropic-async", trace_dir=trace_dir)
+        await client.messages.create(
+            model=_resolve_haiku_model(),
+            max_tokens=10,
+            messages=[{"role": "user", "content": "ping"}],
+        )
+        wt.end()
+
+    asyncio.run(run())  # runs the coroutine to completion before the span file is read
+
+    s = _llm_call_span(trace_dir)
+    assert s["attrs"]["provider"] == "anthropic"
+    assert s["attrs"]["input_tokens"] > 0
diff --git a/tests/integration/test_openai_real.py b/tests/integration/test_openai_real.py
new file mode 100644
index 0000000..ef2c23a
--- /dev/null
+++ b/tests/integration/test_openai_real.py
@@ -0,0 +1,120 @@
+"""Real-API verification for wikitrace.openai.patch().
+
+Hits the OpenAI Chat Completions endpoint with the cheapest available
+model and asserts the wikitrace span captures model, prompt_chars,
+answer_chars, input_tokens, output_tokens, cost_usd, latency_ms, and
+retry_count. Skipped when OPENAI_API_KEY is unset.
+
+Cost per pass: ~$0.0001 (gpt-4o-mini, ~30 input tokens, max_tokens=10).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from pathlib import Path
+
+import pytest
+
+import wikitrace as wt
+
+
+pytestmark = pytest.mark.integration
+
+
+def _spans(trace_dir: Path) -> list[dict]:  # one parsed object per line of spans.jsonl; [] when the file is absent
+    p = trace_dir / "spans.jsonl"  # NOTE(review): blank lines are not filtered before json.loads below
+    return [json.loads(l) for l in p.read_text().splitlines()] if p.exists() else []
+
+
+def _llm_call_span(trace_dir: Path) -> dict:  # first span named "llm_call", in file order
+    spans = [s for s in _spans(trace_dir) if s["name"] == "llm_call"]
+    assert spans, "no llm_call span recorded"
+    return spans[0]
+
+
+def test_openai_sync_non_streaming(openai_key, trace_dir: Path):  # trace_dir: fixture defined outside this package — TODO confirm where
+    pytest.importorskip("openai")
+    import openai
+    import wikitrace.openai
+
+    wikitrace.openai.patch()
+    client = openai.OpenAI(api_key=openai_key)
+
+    wt.init(pipeline="real-openai-sync", trace_dir=trace_dir)
+    resp = client.chat.completions.create(  # NOTE(review): resp is never read; assertions use the recorded span only
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "ping"}],
+        max_tokens=10,
+    )
+    wt.end()
+
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["provider"] == "openai"
+    assert a["model"].startswith("gpt-4o-mini")
+    assert a["prompt_chars"] > 0
+    assert a["answer_chars"] >= 0  # short max_tokens — could be 0 in edge cases
+    assert a["input_tokens"] is not None and a["input_tokens"] > 0
+    assert a["output_tokens"] is not None and a["output_tokens"] >= 0
+    assert a["cost_usd"] is not None and a["cost_usd"] >= 0
+    assert a["latency_ms"] is not None and a["latency_ms"] > 0
+    assert a["retry_count"] == 0
+
+
+def test_openai_sync_streaming(openai_key, trace_dir: Path):
+    pytest.importorskip("openai")
+    import openai
+    import wikitrace.openai
+
+    wikitrace.openai.patch()
+    client = openai.OpenAI(api_key=openai_key)
+
+    wt.init(pipeline="real-openai-stream", trace_dir=trace_dir)
+    stream = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[{"role": "user", "content": "say hi"}],
+        max_tokens=10,
+        stream=True,
+        stream_options={"include_usage": True},  # only way to get usage on streams
+    )
+    chunks = list(stream)  # exhaust to trigger span_close in our wrapper
+    wt.end()
+
+    assert len(chunks) > 0
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["stream"] is True
+    assert a["provider"] == "openai"
+    # Token events should be on the span — at least one content delta arrived.
+    token_events = [e for e in s["events"] if e["type"] == "token"]
+    assert len(token_events) >= 1
+    # When stream_options.include_usage was honored, cost is computed.
+    if a["input_tokens"] is not None:  # cost is only checked when usage was recorded on the span
+        assert a["cost_usd"] is not None and a["cost_usd"] > 0
+
+
+def test_openai_async_non_streaming(openai_key, trace_dir: Path):
+    pytest.importorskip("openai")
+    import openai
+    import wikitrace.openai
+
+    wikitrace.openai.patch()
+
+    async def run():  # init, call and end all happen inside the one coroutine
+        client = openai.AsyncOpenAI(api_key=openai_key)
+        wt.init(pipeline="real-openai-async", trace_dir=trace_dir)
+        resp = await client.chat.completions.create(  # NOTE(review): resp is never read
+            model="gpt-4o-mini",
+            messages=[{"role": "user", "content": "ping"}],
+            max_tokens=10,
+        )
+        wt.end()
+
+    asyncio.run(run())  # runs the coroutine to completion before the span file is read
+
+    s = _llm_call_span(trace_dir)
+    a = s["attrs"]
+    assert a["provider"] == "openai"
+    assert a["input_tokens"] > 0
+    assert a["latency_ms"] > 0