Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions .github/workflows/ci-real-api.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
name: ci-real-api

# Real-API verification for the OpenAI / Anthropic adapters.
# Triggered only by manual dispatch or the weekly schedule. There is no
# push / pull_request trigger, so pull requests (including forks) never
# run this workflow. When a job's API-key secret is not configured, its
# test step prints a message and exits 0, so the run stays green.
#
# Cost per run is a fraction of a cent. Set OPENAI_API_KEY and
# ANTHROPIC_API_KEY in repo Settings → Secrets and Variables → Actions
# to enable. Set the repo variable WIKITRACE_OPENAI_TESTS_ENABLED or
# WIKITRACE_ANTHROPIC_TESTS_ENABLED to 'false' to switch a job off.

on:
  workflow_dispatch:
  schedule:
    # Tuesdays 14:00 UTC. Catches upstream API drift roughly weekly
    # without burning the budget on every push.
    - cron: "0 14 * * 2"

# The jobs only need to read the repository; drop every other
# GITHUB_TOKEN scope.
permissions:
  contents: read

# Cancel in-progress runs on the same ref so a flaky upstream doesn't
# queue retries.
concurrency:
  group: ci-real-api-${{ github.ref }}
  cancel-in-progress: true

jobs:
  openai:
    name: openai (real api)
    runs-on: ubuntu-latest
    # The default job timeout is 360 minutes; a hung upstream call
    # should fail fast instead of holding a runner.
    timeout-minutes: 10
    if: ${{ vars.WIKITRACE_OPENAI_TESTS_ENABLED != 'false' }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: pip
      - name: Install
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[cloud,dev]' openai
      - name: Test
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          if [ -z "$OPENAI_API_KEY" ]; then
            echo "OPENAI_API_KEY secret not set — skipping (this is fine for forks)."
            exit 0
          fi
          pytest -q tests/integration/test_openai_real.py

  anthropic:
    name: anthropic (real api)
    runs-on: ubuntu-latest
    # Same fail-fast bound as the openai job.
    timeout-minutes: 10
    if: ${{ vars.WIKITRACE_ANTHROPIC_TESTS_ENABLED != 'false' }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: pip
      - name: Install
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[cloud,dev]' anthropic
      - name: Test
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          if [ -z "$ANTHROPIC_API_KEY" ]; then
            echo "ANTHROPIC_API_KEY secret not set — skipping (this is fine for forks)."
            exit 0
          fi
          pytest -q tests/integration/test_anthropic_real.py
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,30 @@ SQL works as-is.

---

## Tests

```bash
# Default suite (free): SDK + cloud + ingest server tests
pip install -e '.[cloud,dev]'
pytest -q tests/

# Postgres path (asyncpg + JSONB round-trip): requires a running
# Postgres and DATABASE_URL set
DATABASE_URL=postgresql://localhost/wikitrace pytest -q tests/

# Real-API verification (costs pennies; verifies the OpenAI / Anthropic
# patches against live endpoints): requires a key
OPENAI_API_KEY=sk-... pytest -q tests/integration/test_openai_real.py
ANTHROPIC_API_KEY=sk-... pytest -q tests/integration/test_anthropic_real.py
```

Integration tests skip cleanly when the corresponding key is unset, so
the default `pytest tests/` invocation never hits an external API.
CI on every push runs the free + Postgres paths; the real-API workflow
runs weekly (and on manual dispatch) when repo secrets are configured.

---

## Pricing

| | |
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ asyncio_mode = "auto"
testpaths = ["tests"]
markers = [
"postgres: requires DATABASE_URL pointing at a Postgres instance",
"integration: hits a real external API; requires the corresponding key in the env",
]
filterwarnings = [
# FastAPI's deprecated on_event used in the cloud server; suppress
Expand Down
Empty file added tests/integration/__init__.py
Empty file.
37 changes: 37 additions & 0 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Shared helpers for real-API integration tests.

These tests cost money. We use the cheapest model per provider, the
shortest possible prompt, and the smallest output. A full pass should
be a fraction of a cent.

Each test skips when the corresponding key isn't in the env, so the
default `pytest tests/` run on a contributor laptop and on CI without
secrets remains free.
"""

from __future__ import annotations

import os

import pytest


def _require_key(name: str) -> str:
    """Return the value of environment variable *name*.

    When the variable is missing or empty, ``pytest.skip`` is called with
    an explanatory message instead of returning.
    """
    value = os.environ.get(name)
    missing = not value
    if missing:
        reason = (
            f"{name} not set. Real-API tests run only when the env var "
            "is provided. Set it locally or as a CI secret to verify "
            "this adapter end-to-end."
        )
        pytest.skip(reason)
    return value


@pytest.fixture
def openai_key() -> str:
    """Provide ``OPENAI_API_KEY`` from the environment via ``_require_key``."""
    api_key = _require_key("OPENAI_API_KEY")
    return api_key


@pytest.fixture
def anthropic_key() -> str:
    """Provide ``ANTHROPIC_API_KEY`` from the environment via ``_require_key``."""
    api_key = _require_key("ANTHROPIC_API_KEY")
    return api_key
115 changes: 115 additions & 0 deletions tests/integration/test_anthropic_real.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Real-API verification for wikitrace.anthropic.patch().

Hits the Anthropic Messages endpoint with the cheapest available model
(claude-haiku) and asserts span shape. Skipped when ANTHROPIC_API_KEY
is unset.

Cost per pass: ~$0.0001 (claude-haiku-4-5 or 3-5-haiku, ~15 input
tokens, max_tokens=10).
"""

from __future__ import annotations

import asyncio
import json
from pathlib import Path

import pytest

import wikitrace as wt


pytestmark = pytest.mark.integration


def _llm_call_span(trace_dir: Path) -> dict:
    """Return the first ``llm_call`` span recorded under *trace_dir*.

    Reads ``spans.jsonl`` as one JSON object per line, ignoring blank
    lines.

    Raises:
        AssertionError: if the file is missing or contains no span whose
            ``name`` is ``"llm_call"``.
    """
    spans_file = trace_dir / "spans.jsonl"
    # A missing file means nothing was traced. Report that through the
    # same assertion as an empty file instead of a bare FileNotFoundError.
    if spans_file.exists():
        lines = spans_file.read_text(encoding="utf-8").splitlines()
    else:
        lines = []
    records = [json.loads(line) for line in lines if line.strip()]
    # .get() so a span without a "name" key is skipped, not a KeyError.
    llm_calls = [rec for rec in records if rec.get("name") == "llm_call"]
    assert llm_calls, "no llm_call span recorded"
    return llm_calls[0]


def _resolve_haiku_model() -> str:
    """Return the Anthropic model id the real-API tests should call.

    Defaults to ``claude-haiku-4-5``. No automatic fallback is attempted:
    an account that cannot reach that model can point the tests at
    another cheap one (e.g. a 3-5-haiku id) by setting the
    ``WIKITRACE_ANTHROPIC_TEST_MODEL`` environment variable. Blank or
    whitespace-only values are ignored.
    """
    import os  # local import: os is not imported at module level here

    override = os.environ.get("WIKITRACE_ANTHROPIC_TEST_MODEL", "").strip()
    return override or "claude-haiku-4-5"


def test_anthropic_sync_non_streaming(anthropic_key, trace_dir: Path):
    """Sync, non-streaming Messages call records a populated llm_call span.

    Skipped when the ``anthropic`` package is not installed or the
    ``anthropic_key`` fixture skips.
    """
    pytest.importorskip("anthropic")
    import anthropic
    import wikitrace.anthropic

    wikitrace.anthropic.patch()
    client = anthropic.Anthropic(api_key=anthropic_key)

    wt.init(pipeline="real-anthropic-sync", trace_dir=trace_dir)
    try:
        # The response is not inspected; only the recorded span matters.
        client.messages.create(
            model=_resolve_haiku_model(),
            max_tokens=10,
            messages=[{"role": "user", "content": "ping"}],
        )
    finally:
        # End the pipeline even when the API call raises, so a failure
        # here does not leave it open for later tests. Assumes wt.init /
        # wt.end manage process-wide state — TODO confirm.
        wt.end()

    span = _llm_call_span(trace_dir)
    attrs = span["attrs"]
    assert attrs["provider"] == "anthropic"
    assert "claude" in attrs["model"]
    # The None checks come first so a missing value fails as a plain
    # assertion rather than a TypeError from the comparison.
    assert attrs["input_tokens"] is not None and attrs["input_tokens"] > 0
    assert attrs["output_tokens"] is not None and attrs["output_tokens"] >= 0
    assert attrs["cost_usd"] is not None and attrs["cost_usd"] >= 0
    assert attrs["latency_ms"] is not None and attrs["latency_ms"] > 0
    assert attrs["retry_count"] == 0


def test_anthropic_sync_streaming(anthropic_key, trace_dir: Path):
    """Sync streaming Messages call records a streaming span with token events.

    Skipped when the ``anthropic`` package is not installed or the
    ``anthropic_key`` fixture skips.
    """
    pytest.importorskip("anthropic")
    import anthropic
    import wikitrace.anthropic

    wikitrace.anthropic.patch()
    client = anthropic.Anthropic(api_key=anthropic_key)

    wt.init(pipeline="real-anthropic-stream", trace_dir=trace_dir)
    try:
        with client.messages.stream(
            model=_resolve_haiku_model(),
            max_tokens=10,
            messages=[{"role": "user", "content": "say hi"}],
        ) as stream:
            # Consume the stream so the wrapper's __next__ loop closes the span.
            events = list(stream)
    finally:
        # End the pipeline even when streaming raises, so a failure here
        # does not leave it open for later tests. Assumes wt.init /
        # wt.end manage process-wide state — TODO confirm.
        wt.end()

    assert len(events) > 0
    span = _llm_call_span(trace_dir)
    attrs = span["attrs"]
    assert attrs["stream"] is True
    token_events = [e for e in span["events"] if e["type"] == "token"]
    assert len(token_events) >= 1


def test_anthropic_async_non_streaming(anthropic_key, trace_dir: Path):
    """Async, non-streaming Messages call records an anthropic llm_call span.

    Skipped when the ``anthropic`` package is not installed or the
    ``anthropic_key`` fixture skips.
    """
    pytest.importorskip("anthropic")
    import anthropic
    import wikitrace.anthropic

    wikitrace.anthropic.patch()

    async def run():
        client = anthropic.AsyncAnthropic(api_key=anthropic_key)
        wt.init(pipeline="real-anthropic-async", trace_dir=trace_dir)
        try:
            await client.messages.create(
                model=_resolve_haiku_model(),
                max_tokens=10,
                messages=[{"role": "user", "content": "ping"}],
            )
        finally:
            # End the pipeline even when the awaited call raises, so a
            # failure here does not leave it open for later tests.
            # Assumes wt.init / wt.end manage process-wide state — TODO confirm.
            wt.end()

    asyncio.run(run())

    span = _llm_call_span(trace_dir)
    assert span["attrs"]["provider"] == "anthropic"
    assert span["attrs"]["input_tokens"] > 0
120 changes: 120 additions & 0 deletions tests/integration/test_openai_real.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""Real-API verification for wikitrace.openai.patch().

Hits the OpenAI Chat Completions endpoint with the cheapest available
model and asserts the wikitrace span captures model, prompt_chars,
answer_chars, input_tokens, output_tokens, cost_usd, latency_ms, and
retry_count. Skipped when OPENAI_API_KEY is unset.

Cost per pass: ~$0.0001 (gpt-4o-mini, ~30 input tokens, max_tokens=10).
"""

from __future__ import annotations

import asyncio
import json
from pathlib import Path

import pytest

import wikitrace as wt


pytestmark = pytest.mark.integration


def _spans(trace_dir: Path) -> list[dict]:
    """Load every span recorded in ``spans.jsonl`` under *trace_dir*.

    The file is read as one JSON object per line. Blank or
    whitespace-only lines are skipped so a trailing newline or spacer
    line does not crash ``json.loads``.

    Returns:
        The decoded spans in file order, or an empty list when the file
        does not exist.
    """
    spans_file = trace_dir / "spans.jsonl"
    if not spans_file.exists():
        return []
    return [
        json.loads(line)
        for line in spans_file.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]


def _llm_call_span(trace_dir: Path) -> dict:
    """Return the first span named ``llm_call`` among the recorded spans.

    Fails with an assertion when no such span was recorded.
    """
    matches = list(
        filter(lambda span: span["name"] == "llm_call", _spans(trace_dir))
    )
    assert matches, "no llm_call span recorded"
    first, *_ = matches
    return first


def test_openai_sync_non_streaming(openai_key, trace_dir: Path):
    """Sync, non-streaming chat completion records a populated llm_call span.

    Skipped when the ``openai`` package is not installed or the
    ``openai_key`` fixture skips.
    """
    pytest.importorskip("openai")
    import openai
    import wikitrace.openai

    wikitrace.openai.patch()
    client = openai.OpenAI(api_key=openai_key)

    wt.init(pipeline="real-openai-sync", trace_dir=trace_dir)
    try:
        # The response is not inspected; only the recorded span matters.
        client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "ping"}],
            max_tokens=10,
        )
    finally:
        # End the pipeline even when the API call raises, so a failure
        # here does not leave it open for later tests. Assumes wt.init /
        # wt.end manage process-wide state — TODO confirm.
        wt.end()

    span = _llm_call_span(trace_dir)
    attrs = span["attrs"]
    assert attrs["provider"] == "openai"
    assert attrs["model"].startswith("gpt-4o-mini")
    assert attrs["prompt_chars"] > 0
    assert attrs["answer_chars"] >= 0  # short max_tokens — could be 0 in edge cases
    # The None checks come first so a missing value fails as a plain
    # assertion rather than a TypeError from the comparison.
    assert attrs["input_tokens"] is not None and attrs["input_tokens"] > 0
    assert attrs["output_tokens"] is not None and attrs["output_tokens"] >= 0
    assert attrs["cost_usd"] is not None and attrs["cost_usd"] >= 0
    assert attrs["latency_ms"] is not None and attrs["latency_ms"] > 0
    assert attrs["retry_count"] == 0


def test_openai_sync_streaming(openai_key, trace_dir: Path):
    """Sync streaming chat completion records a streaming span with token events.

    Skipped when the ``openai`` package is not installed or the
    ``openai_key`` fixture skips.
    """
    pytest.importorskip("openai")
    import openai
    import wikitrace.openai

    wikitrace.openai.patch()
    client = openai.OpenAI(api_key=openai_key)

    wt.init(pipeline="real-openai-stream", trace_dir=trace_dir)
    try:
        stream = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "say hi"}],
            max_tokens=10,
            stream=True,
            stream_options={"include_usage": True},  # only way to get usage on streams
        )
        chunks = list(stream)  # exhaust to trigger span_close in our wrapper
    finally:
        # End the pipeline even when streaming raises, so a failure here
        # does not leave it open for later tests. Assumes wt.init /
        # wt.end manage process-wide state — TODO confirm.
        wt.end()

    assert len(chunks) > 0
    span = _llm_call_span(trace_dir)
    attrs = span["attrs"]
    assert attrs["stream"] is True
    assert attrs["provider"] == "openai"
    # Token events should be on the span — at least one content delta arrived.
    token_events = [e for e in span["events"] if e["type"] == "token"]
    assert len(token_events) >= 1
    # When stream_options.include_usage was honored, cost is computed.
    if attrs["input_tokens"] is not None:
        assert attrs["cost_usd"] is not None and attrs["cost_usd"] > 0


def test_openai_async_non_streaming(openai_key, trace_dir: Path):
    """Async, non-streaming chat completion records an openai llm_call span.

    Skipped when the ``openai`` package is not installed or the
    ``openai_key`` fixture skips.
    """
    pytest.importorskip("openai")
    import openai
    import wikitrace.openai

    wikitrace.openai.patch()

    async def run():
        client = openai.AsyncOpenAI(api_key=openai_key)
        wt.init(pipeline="real-openai-async", trace_dir=trace_dir)
        try:
            # The response is not inspected; only the recorded span matters.
            await client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": "ping"}],
                max_tokens=10,
            )
        finally:
            # End the pipeline even when the awaited call raises, so a
            # failure here does not leave it open for later tests.
            # Assumes wt.init / wt.end manage process-wide state — TODO confirm.
            wt.end()

    asyncio.run(run())

    span = _llm_call_span(trace_dir)
    attrs = span["attrs"]
    assert attrs["provider"] == "openai"
    assert attrs["input_tokens"] > 0
    assert attrs["latency_ms"] > 0
Loading