From 1e3c6a45633168820f166eb5080cbe2126eefb57 Mon Sep 17 00:00:00 2001 From: JacksonWang Date: Thu, 21 May 2026 14:59:27 +0800 Subject: [PATCH] feat(scheduledrun): add ScheduledRun CRD for cron-based agent execution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds ScheduledRun (kagent.dev/v1alpha2) to fire an existing agent on a cron schedule with a static prompt. Scope is limited to scheduling — does not extend the agent definition. Two-stage status semantics: DispatchStatus is written synchronously (did the A2A SendMessage land at the agent pod), Outcome is resolved asynchronously by polling the session's terminal task state. This lets RunHistory reflect whether the agent actually completed the work, not just whether the dispatch HTTP call returned 200 — agents have built-in retries, so dispatch success and run success are different signals. Lands without controller-side concurrency gating: the agent pod already serializes A2A calls per session, so overlapping ticks queue at the agent layer rather than needing an inFlight map / mutex in the controller. Includes CRD + controller, scheduler with outcome poller, REST API (CRUD + manual trigger), Prometheus metrics, and UI (list, detail with run history, create form). Manual trigger bypasses spec.suspend by design. 
Co-authored-by: 0xLeo258 <0xLeo258@users.noreply.github.com> Co-authored-by: Claude Opus 4.7 --- docs/architecture/README.md | 1 + docs/architecture/scheduled-runs.md | 499 +++++++++++++++++ .../crd/bases/kagent.dev_scheduledruns.yaml | 222 ++++++++ go/api/v1alpha2/scheduledrun_types.go | 165 ++++++ go/api/v1alpha2/zz_generated.deepcopy.go | 151 +++++ .../controller/scheduledrun_controller.go | 220 ++++++++ .../scheduledrun_controller_test.go | 351 ++++++++++++ .../controller/scheduledrun_scheduler.go | 504 +++++++++++++++++ .../controller/scheduledrun_scheduler_test.go | 332 +++++++++++ .../internal/httpserver/handlers/agents.go | 41 ++ .../internal/httpserver/handlers/handlers.go | 4 +- .../httpserver/handlers/scheduledruns.go | 313 +++++++++++ .../httpserver/handlers/scheduledruns_test.go | 429 ++++++++++++++ .../httpserver/handlers/test_helpers_test.go | 2 + go/core/internal/httpserver/server.go | 38 +- go/core/internal/metrics/scheduledrun.go | 64 +++ go/core/pkg/app/app.go | 15 + go/core/test/e2e/scheduledrun_api_test.go | 175 ++++++ go/go.mod | 1 + go/go.sum | 2 + .../templates/kagent.dev_scheduledruns.yaml | 222 ++++++++ helm/kagent/templates/rbac/getter-role.yaml | 3 + helm/kagent/templates/rbac/writer-role.yaml | 2 + ui/src/app/actions/scheduledRuns.ts | 122 ++++ .../app/schedules/[namespace]/[name]/page.tsx | 305 ++++++++++ ui/src/app/schedules/new/page.tsx | 525 ++++++++++++++++++ ui/src/app/schedules/page.tsx | 5 + ui/src/components/DeleteAgentButton.tsx | 11 +- ui/src/components/Header.tsx | 28 +- .../components/schedules/RunHistoryTable.tsx | 127 +++++ .../components/schedules/ScheduledRunList.tsx | 260 +++++++++ ui/src/lib/formatDateTime.ts | 7 + ui/src/types/index.ts | 54 ++ 33 files changed, 5182 insertions(+), 18 deletions(-) create mode 100644 docs/architecture/scheduled-runs.md create mode 100644 go/api/config/crd/bases/kagent.dev_scheduledruns.yaml create mode 100644 go/api/v1alpha2/scheduledrun_types.go create mode 100644 
go/core/internal/controller/scheduledrun_controller.go create mode 100644 go/core/internal/controller/scheduledrun_controller_test.go create mode 100644 go/core/internal/controller/scheduledrun_scheduler.go create mode 100644 go/core/internal/controller/scheduledrun_scheduler_test.go create mode 100644 go/core/internal/httpserver/handlers/scheduledruns.go create mode 100644 go/core/internal/httpserver/handlers/scheduledruns_test.go create mode 100644 go/core/internal/metrics/scheduledrun.go create mode 100644 go/core/test/e2e/scheduledrun_api_test.go create mode 100644 helm/kagent-crds/templates/kagent.dev_scheduledruns.yaml create mode 100644 ui/src/app/actions/scheduledRuns.ts create mode 100644 ui/src/app/schedules/[namespace]/[name]/page.tsx create mode 100644 ui/src/app/schedules/new/page.tsx create mode 100644 ui/src/app/schedules/page.tsx create mode 100644 ui/src/components/schedules/RunHistoryTable.tsx create mode 100644 ui/src/components/schedules/ScheduledRunList.tsx create mode 100644 ui/src/lib/formatDateTime.ts diff --git a/docs/architecture/README.md b/docs/architecture/README.md index f2e87fb88..3957d2be4 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -12,6 +12,7 @@ This directory contains detailed architecture documentation for kagent. 
Start wi | [prompt-templates.md](prompt-templates.md) | Prompt template system with ConfigMap includes and variable interpolation | | [data-flow.md](data-flow.md) | End-to-end request flow from UI to agent and back | | [crds-and-types.md](crds-and-types.md) | All Custom Resource Definitions and their relationships | +| [scheduled-runs.md](scheduled-runs.md) | ScheduledRun CRD: cron-based agent triggering, design and known issues | --- diff --git a/docs/architecture/scheduled-runs.md b/docs/architecture/scheduled-runs.md new file mode 100644 index 000000000..fac5b0347 --- /dev/null +++ b/docs/architecture/scheduled-runs.md @@ -0,0 +1,499 @@ +# Scheduled Runs + +`ScheduledRun` is a kagent CRD that fires an existing `Agent` on a cron schedule +with a fixed prompt. It addresses [issue +#1821](https://github.com/kagent-dev/kagent/issues/1821): users wanted a way to +run agents on a recurring schedule without writing a separate cronjob and +A2A client. + +This document describes the end-to-end design, the rationale behind each layer, +the scenarios it covers, and known issues. + +--- + +## Scope + +The feature is scoped narrowly to one use case: **trigger an already-deployed +Agent on a cron schedule with a static prompt, and surface the resulting session +in the UI under the user who created the schedule.** + +Out of scope (deliberately): + +- Building agents — schedules reference existing `Agent` resources, they don't + define agent behavior themselves. +- Variable / templated prompts. +- Backfill of missed runs while the controller was down. +- Cross-cluster / external triggers. +- Per-run resource quotas. +- A first-class concurrency policy (`Forbid`/`Allow`/`Replace`). Agents already + isolate runs by session, so two overlapping `ScheduledRun` ticks land in + separate sessions and don't interleave events. We removed the policy field + to avoid leaking a Kubernetes `CronJob`-shaped concept that doesn't actually + apply here. 
A future iteration may revisit it for resource-budget reasons, + not correctness. + +--- + +## End-to-End Design + +``` +┌────────────┐ 1. POST /api/scheduledruns ┌──────────────────────┐ +│ │ ────────────────────────────────▶ │ HTTP Handler │ +│ Next.js │ (X-User-ID: alice@…) │ scheduledruns.go │ +│ UI │ │ │ +│ │ ◀── 201 (created-by annotation set) │ ─ writes annotation │ +└────────────┘ │ kagent.dev/ │ + ▲ │ created-by=alice@… │ + │ └──────────┬───────────┘ + │ │ + │ ▼ + │ ┌──────────────────────┐ + │ 5. GET /api/sessions/{id} │ Kubernetes API │ + │ (matches userID) │ ScheduledRun CR │ + │ └──────────┬───────────┘ + │ │ watch + │ ▼ + │ ┌──────────────────────┐ + │ │ ScheduledRun │ + │ │ Controller │ + │ │ ─ validates TZ │ + │ │ ─ validates cron │ + │ │ ─ checks Agent ref │ + │ │ ─ Accepted=True │ + │ │ ─ scheduler.Update │ + │ └──────────┬───────────┘ + │ │ + │ ▼ + │ 2. cron tick ┌──────────────────────┐ + │ │ ScheduledRunScheduler│ + │ │ (Runnable, leader- │ + │ │ elected) │ + │ │ │ + │ │ runOnce(): │ + │ │ a. read SR + ann │ + │ │ b. create session │ + │ │ userID = ann │ + │ │ c. send A2A msg │ + │ │ X-User-Id hdr │ + │ │ d. record dispatch │ + │ │ e. spawn outcome │ + │ │ poller │ + │ └──────────┬───────────┘ + │ │ + │ ▼ + │ ┌──────────────────────┐ + │ │ Agent Pod (existing) │ + │ │ ─ A2A receiver │ + │ │ ─ writes events │ + │ │ back to /api │ + │ │ sessions/{id}/ │ + │ │ events │ + │ └──────────┬───────────┘ + │ 4. status update (RunHistoryEntry) │ + │ ─ DispatchStatus written immediately │ + │ ─ Outcome written by poller on terminal state │ + │ ▼ + └──────────────────────── 3. session events stored under userID=alice@… +``` + +### 1. 
CRD (`go/api/v1alpha2/scheduledrun_types.go`) + +```go +type ScheduledRunSpec struct { + Schedule string // cron expression (5 fields) + TimeZone string // optional IANA name; default UTC + AgentRef AgentReference // existing Agent (cross-namespace allowed by design) + Prompt string // static prompt sent on each run + Suspend bool + MaxRunHistory int32 // default 10, max 100 +} + +type ScheduledRunStatus struct { + LastRunTime *metav1.Time + NextRunTime *metav1.Time // owned by scheduler, refreshed after each fire + RunHistory []RunHistoryEntry + Conditions []metav1.Condition // Accepted (controller-owned) +} + +// Two-stage status: dispatch is synchronous, outcome resolves async. +type RunHistoryEntry struct { + StartTime metav1.Time + CompletionTime *metav1.Time + DispatchStatus DispatchStatus // Dispatched | DispatchFailed (synchronous) + DispatchMessage string + SessionID string + Outcome RunOutcome // Pending | Succeeded | Failed | Timeout (async) + OutcomeMessage string + OutcomeTime *metav1.Time +} +``` + +A constant `AnnotationCreatedBy = "kagent.dev/created-by"` carries the user +identity used for session ownership. + +#### Two-stage success semantics + +The earlier model conflated "the A2A request returned 200" with "the agent +finished successfully." Those are very different signals: + +- **DispatchStatus** (synchronous): did the request leave the controller and + reach the agent's HTTP listener? `Dispatched` means yes; `DispatchFailed` + means the agent was unreachable, the URL was wrong, the model config was + missing — anything where we couldn't even hand the work off. +- **Outcome** (asynchronous): did the agent's session actually complete? + `Pending` while the agent is still working, then resolves to `Succeeded` / + `Failed` / `Timeout` when the underlying A2A task reaches a terminal state. + +A user looking at the run history for "did this scheduled report actually go +out?" cares about `Outcome`. A user debugging "why is nothing happening?" 
+cares about `DispatchStatus`. The UI surfaces the resolved cell — `Succeeded` +when both stages agree, `Dispatch Failed` short-circuits and skips outcome +polling, `Running` while pending. + +### 2. HTTP API (`go/core/internal/httpserver/handlers/scheduledruns.go`) + +| Method | Path | Notes | +|--------|------|-------| +| GET | `/api/scheduledruns` | List all SRs | +| GET | `/api/scheduledruns/{ns}/{name}` | Read | +| POST | `/api/scheduledruns` | Create — sets `created-by` annotation from request | +| PUT | `/api/scheduledruns/{ns}/{name}` | Update spec, preserves `created-by` | +| DELETE | `/api/scheduledruns/{ns}/{name}` | Delete | +| POST | `/api/scheduledruns/{ns}/{name}/trigger` | Manual run (skips schedule, ignores `suspend`) | + +Validation rejects bad cron expressions and bad time-zone names at the +handler layer (`ValidateSchedule`). We do **not** enforce a minimum interval +— operators may need fast cadences for testing, and admin-only exemptions +add complexity without clear payoff. LLM cost / load is the operator's +responsibility. + +Manual trigger deliberately ignores `spec.suspend`: suspend stops the cron +engine from firing automatically; a human pressing "Run now" is an explicit +override. The cron schedule remains paused. + +Agent deletion is gated by reference protection (see "Agent ↔ ScheduledRun +referential integrity" below). + +### 3. Controller (`go/core/internal/controller/scheduledrun_controller.go`) + +Watches `ScheduledRun` events and `Agent` events. On each reconcile: + +1. Validate `spec.timeZone` (rejects unknown IANA names) — done **before** + cron parsing, otherwise a bad TZ surfaces as `InvalidSchedule` because + `cron.ParseStandard` rejects the `CRON_TZ=` prefix. +2. Parse cron, set `Accepted=False` on parse failure with reason + `InvalidSchedule`. +3. Verify the referenced `Agent` exists. 
On 404 the controller also calls + `scheduler.RemoveSchedule(key)` so a tick can't keep firing a Failed run + forever, and sets `Accepted=False` with reason `AgentNotFound`. +4. Call `scheduler.UpdateSchedule(sr)` which adds/updates the cron entry. +5. On deletion, the watch fires once with `IgnoreNotFound`; we call + `scheduler.RemoveSchedule(key)`. + +The Agent watch uses a **field index** (`spec.agentRef`, keyed by +`namespace/name`) so that an Agent create/update fans out only to SRs that +actually reference it (`O(matched)` instead of `O(all SRs)`). The empty- +namespace case resolves to the SR's own namespace at index time, matching +the controller's resolution rule. + +#### Status ownership split + +The controller and the scheduler both write to `status`, so we partition by +field to avoid clobbering writes: + +| Field | Owner | +|---|---| +| `conditions["Accepted"]` (`InvalidTimeZone`/`InvalidSchedule`/`AgentNotFound`/`ScheduleAccepted`) | Controller | +| `lastRunTime`, `nextRunTime` | Scheduler | +| `runHistory[]` | Scheduler | +| `observedGeneration` | Controller | + +`NextRunTime` was previously computed by the controller, but the scheduler +already knows the freshest value (it re-computes after each fire). Moving it +to the scheduler also keeps `NextRunTime` accurate even when the controller +isn't reconciling. + +### 4. Scheduler (`go/core/internal/controller/scheduledrun_scheduler.go`) + +A `manager.Runnable` that wraps `robfig/cron/v3`. Two design choices worth +noting: + +- **`NeedLeaderElection() = true`** — only one controller replica fires the + schedule, so HA deployments don't double-fire. +- **No persistent run queue** — when the cron tick fires we immediately create + the session and dispatch A2A in-process. Crash loses the in-flight run; see + Known Issues. + +#### Time-zone handling + +`spec.timeZone` is fed to `robfig/cron/v3` via the `CRON_TZ=America/Los_Angeles ` +prefix that its standard parser already supports. 
The helper +`scheduleSpecForCron(sr)` returns the prefixed string when `TimeZone` is set, +otherwise the bare cron expression (interpreted as UTC). + +#### Per-tick flow (`runOnce`) + +1. Re-fetch the `ScheduledRun` (cron entry holds only the namespaced name, not + stale spec). +2. Honor `Suspend` — return early without recording a history entry. +3. Read `kagent.dev/created-by` annotation; that's the **session userID**. + Falls back to literal `"scheduled-run"` if absent (for SRs created via + `kubectl apply` without going through the API). +4. Create a row in the sessions table. +5. Send the A2A message via `a2aclient.NewA2AClient` with an + `X-User-Id`-injecting `RoundTripper`, so the agent runtime persists events + under the same userID. +6. Append a `RunHistoryEntry{DispatchStatus, SessionID, Outcome=Pending}` and + write status. Trim to `MaxRunHistory` (fallback 10 if unset, since the fake + client doesn't apply CRD defaults). +7. Refresh `NextRunTime` from the cron entry. +8. If dispatch succeeded, **spawn an outcome poller goroutine** that polls + `dbClient.ListTasksForSession(sessionID)` until the task hits a terminal + A2A state (`Completed`/`Canceled`/`Failed`/`Rejected`). It resolves + `Outcome` on the matching history entry. If the poll exceeds + `outcomePollTimeout` (15m) it writes `Outcome=Timeout`. Failed dispatches skip + polling — they have no session to poll. + +#### Manual trigger path + +`TriggerManualRun(key)` reuses `runOnce` minus the `Suspend` early-return, +returning the resulting `RunHistoryEntry` so the HTTP handler can render a +toast immediately. The outcome poller still runs asynchronously — the user +sees `Dispatched` immediately and can refresh to see the resolved outcome. + +### 5. UI + +- Header → **View → Scheduled Runs** lists `/schedules`. +- Header → **Create → New Scheduled Run** opens the create form + (`/schedules/new`). The form accepts an optional time-zone string. 
+- Detail page (`/schedules/[namespace]/[name]`) shows the run history with + links into the chat session — the session userID match with the current + user is what makes those links resolve. +- The run-history table maps `(DispatchStatus, Outcome)` to a single status + badge: `Dispatch Failed` (red), `Succeeded` (green), `Failed` / `Timeout` + (red / amber), `Running` (blue, while `Outcome=Pending`), `Dispatched` + (outline) as fallback. + +### 6. Metrics + +The scheduler registers four Prometheus metrics with the controller-runtime +metrics registry (`go/core/internal/metrics/scheduledrun.go`): + +- `kagent_scheduledrun_active_schedules` — gauge of currently-loaded SRs. +- `kagent_scheduledrun_dispatch_total{namespace,name,status}` — counter of + dispatch attempts, status `Dispatched`/`DispatchFailed`. +- `kagent_scheduledrun_outcome_total{namespace,name,outcome}` — counter of + resolved outcomes. +- `kagent_scheduledrun_dispatch_duration_seconds{namespace,name}` — histogram + of synchronous A2A dispatch latency. + +The labels are intentionally namespace+name (low cardinality in practice). +For very large clusters this could be aggregated upstream. + +--- + +## Design Rationale + +### Why a CRD, not a Kubernetes `CronJob`? + +A native `CronJob` would spin up a pod per tick and send the A2A request from +that pod. We rejected this because: + +1. The agent already runs as a long-lived service; we don't need a per-run pod. +2. We want the session to land in kagent's own database with the correct + userID, which means the trigger has to flow through code that knows about + kagent's session model. +3. Cron expression validation, reference integrity, run history, and UI surface + all want a first-class kagent resource, not a bag of opaque `Job` objects. + +### Why split DispatchStatus from Outcome? 
+ +A single status field forced a bad choice: report success on dispatch (which +lies when the agent then errors out) or block the controller until the agent +finishes (which couples the controller's loop to the agent's wall-clock and +breaks fast cron cadences). The two-stage model lets the controller move on +immediately while a background poller resolves the truth from +`database.Client.ListTasksForSession` — the same source of truth the UI uses +for chat sessions, so the two views can never disagree. + +### Why one annotation for `created-by` instead of a `spec.user` field? + +Spec fields are user input; annotations are metadata. The user identity is +captured server-side from the HTTP principal — putting it in the spec would +mean: + +- Clients could lie about it. +- It would show up in YAML examples and confuse users into editing it. + +Annotations also let us evolve later (e.g., switch to a label, add multiple +attribution keys) without a breaking spec change. + +### Why allow cross-namespace `agentRef`? + +`agentRef.namespace` defaults to the SR's namespace but may be set to any +namespace the controller has permission to read. This is by design: it's +common to keep agents in a shared `kagent` namespace and run schedules from +team namespaces. We do not enforce a same-namespace restriction. The +trade-off is that an SR can reference an agent its creator may not have +direct RBAC on; in practice the controller's service account is the one that +reads the agent, so cluster admins control reachability through standard +RBAC on the `Agent` resource. + +### Why no minimum cron interval? + +An earlier iteration enforced a 1-hour floor at the API layer to prevent +runaway loops like `* * * * *` from silently burning LLM quota. We dropped +the floor: operators legitimately need fast cadences for testing, and once +you carve out an admin exemption the rule becomes arbitrary. 
Cost containment +belongs in per-namespace quota tracking, not a hardcoded number in the +validator. Today the validator only checks **that the cron expression and +time zone parse**. + +### Why does manual trigger ignore `suspend`? + +`suspend=true` stops the scheduler from auto-firing. A human clicking "Run +now" is an explicit override of automation. If we blocked manual triggers +under suspend the user would have to un-suspend, run, re-suspend — which +also opens a window where the schedule auto-fires unintentionally. The +schedule itself remains paused; only the explicit one-shot proceeds. + +### Why is the scheduler a `Runnable` rather than its own deployment? + +It needs to read `ScheduledRun` CRs from the cache and write status updates, +which means it's already coupled to the controller manager. Splitting it into a +separate deployment would force a kube-API-only path with no shared cache, +adding latency and complexity for no real benefit. + +### Agent ↔ ScheduledRun referential integrity (option B) + +When deleting an Agent, the HTTP handler lists `ScheduledRun`s and rejects the +delete with **HTTP 409 Conflict** if any reference the agent +(`go/core/internal/httpserver/handlers/agents.go::findReferencingScheduledRuns`). +The error message lists the offending SRs. + +We picked **B (block)** rather than A (warn-then-delete) because: + +- An orphaned SR would log "Agent not found" forever and clutter the controller. +- The warn-then-delete UX adds a confirm step but doesn't actually prevent the + bad outcome (tired user clicks confirm anyway). +- Block is reversible — user deletes the SR first, then the Agent. + +The UI (`DeleteAgentButton.tsx`) surfaces the 409 message via a sonner toast so +the failure is visible, not silent in console. 
+ +If the Agent is force-deleted out from under an SR (e.g., raw `kubectl +delete`), the controller's Agent watch fires and the SR's reconcile flips +`Accepted=False/AgentNotFound` and removes the cron entry — so we self-heal +even when the API guard is bypassed. + +### Why ScheduledRun → Session uses an A2A client, not a database insert? + +We could write events directly into the sessions table to bypass the agent +pod entirely. That would be cheaper, but it would also bypass the agent's +actual logic (LLM call, tool execution), which is the whole point. Going +through A2A means the run is **identical** to a UI-initiated chat: same +session schema, same event shape, same UI rendering. + +--- + +## Scenarios Covered + +The feature is scoped to: **"trigger an existing agent on a cron schedule"**. +Within that scope, the following are verified end-to-end against a kind +cluster: + +| Scenario | Verified | +|---|---| +| Create SR via UI / API, see it on the list page | ✅ | +| Update every spec field (schedule, timeZone, agentRef, prompt, suspend, maxRunHistory) | ✅ | +| Delete SR | ✅ | +| Cron tick triggers agent with the right prompt | ✅ | +| Manual trigger (`/trigger` endpoint) ignores `suspend` and dispatches | ✅ | +| Run history records start, dispatch status, session ID, then async outcome | ✅ | +| Failed dispatch (agent unreachable) recorded with `DispatchStatus=DispatchFailed` and skips outcome polling | ✅ | +| Successful dispatch + agent error recorded with `DispatchStatus=Dispatched`, `Outcome=Failed` | ✅ | +| Outcome poller writes `Outcome=Timeout` when agent doesn't reach terminal state in time | ✅ | +| `Suspend=true` skips auto-fire but allows manual trigger | ✅ | +| Time zones: `0 9 * * *` with `Asia/Shanghai` fires at 09:00 Shanghai | ✅ | +| Bad time zone sets `Accepted=False/InvalidTimeZone` | ✅ | +| Bad cron sets `Accepted=False/InvalidSchedule` | ✅ | +| Missing agent sets `Accepted=False/AgentNotFound` and removes the cron entry | ✅ | +| Recreating a 
missing agent re-arms the schedule (Agent watch fan-out) | ✅ | +| Creator can open the resulting session in the UI chat | ✅ | +| Other users cannot read sessions created by someone else's SR | ✅ | +| Deleting an Agent referenced by an SR is blocked with HTTP 409 + UI toast | ✅ | +| `ScheduledRun.status.runHistory` capped at `maxRunHistory` (default 10) | ✅ | + +--- + +## Known Issues / Limitations + +### 1. Annotation-based ownership doesn't handle user deletion + +If `alice@example.com` creates an SR and is later removed from the auth system, +the SR keeps firing under her userID and her sessions become orphaned (no +human can read them). There's no "owner exists" check. + +**Fix path:** integrate with the auth system's user lifecycle, or add a +soft-deletion / re-attribution flow. Acceptable risk while alpha. + +### 2. SR fall-through when controller has no leader + +`Runnable.NeedLeaderElection() = true` means the scheduler only runs on the +leader. During leader transition (~15s typical), ticks that fall in the gap +are simply lost — no backfill. There's no automatic catch-up. Acceptable for +hourly+ schedules; for minute-level cadences the gap could miss a tick. + +### 3. Crash loses in-flight outcome polls + +If the controller restarts while an outcome poller goroutine is mid-flight, +the matching `RunHistoryEntry` stays at `Outcome=Pending` forever. There's +no recovery sweep that resumes polling on startup. **Workaround:** the user +can read the session directly to see the actual result; the SR status just +won't reflect it. + +**Fix path:** on startup, scan SRs for `Pending` outcome entries and respawn +pollers for each. Tracked separately from this iteration. + +### 4. Interaction with kagent's reconciler is one-way + +The `ScheduledRun` controller does NOT trigger any change to the referenced +`Agent` — no reconcile fan-out, no recreate, no restart. 
This is by design +(SRs are passive consumers), but it means: + +- An Agent rolling restart mid-run will fail the in-flight A2A call. Same + failure mode as a UI-initiated chat against a restarting pod. +- Updating `agentRef` does NOT cancel any in-flight run on the old agent. + +We treat both as acceptable; users should drain runs before agent maintenance. + +### 5. No first-class concurrency policy + +The earlier `ConcurrencyPolicy=Forbid|Allow|Replace` field has been removed. +Agents serialize per-session, so two overlapping SR ticks land in two +sessions and don't interleave. If a future use case (resource budgets, +prompt-level locking) needs explicit concurrency, it should be designed +fresh against the agent/session model rather than reintroducing the +`CronJob`-shaped vocabulary. + +--- + +## Files + +| File | Role | +|---|---| +| `go/api/v1alpha2/scheduledrun_types.go` | CRD types + `AnnotationCreatedBy` | +| `go/api/config/crd/bases/kagent.dev_scheduledruns.yaml` | Generated CRD manifest | +| `helm/kagent-crds/templates/kagent.dev_scheduledruns.yaml` | CRD shipped via Helm | +| `helm/kagent/templates/rbac/{getter,writer}-role.yaml` | RBAC additions | +| `go/core/internal/controller/scheduledrun_controller.go` | Kubernetes controller, Agent watch fan-out via field index | +| `go/core/internal/controller/scheduledrun_scheduler.go` | Cron engine, dispatcher, outcome poller | +| `go/core/internal/metrics/scheduledrun.go` | Prometheus metrics | +| `go/core/internal/httpserver/handlers/scheduledruns.go` | REST handlers | +| `go/core/internal/httpserver/handlers/agents.go` | Reference protection on agent delete | +| `go/core/pkg/app/app.go` | Wires controller + scheduler into the manager | +| `ui/src/app/schedules/**` | UI list / detail / create pages | +| `ui/src/components/schedules/**` | List + run-history components | +| `ui/src/components/Header.tsx` | Create + View dropdown entries | +| `ui/src/components/DeleteAgentButton.tsx` | Surfaces 409 toast on 
delete-blocked | +| `go/core/test/e2e/scheduledrun_api_test.go` | E2E suite | diff --git a/go/api/config/crd/bases/kagent.dev_scheduledruns.yaml b/go/api/config/crd/bases/kagent.dev_scheduledruns.yaml new file mode 100644 index 000000000..7b6ff60c9 --- /dev/null +++ b/go/api/config/crd/bases/kagent.dev_scheduledruns.yaml @@ -0,0 +1,222 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: scheduledruns.kagent.dev +spec: + group: kagent.dev + names: + kind: ScheduledRun + listKind: ScheduledRunList + plural: scheduledruns + singular: scheduledrun + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.schedule + name: Schedule + type: string + - jsonPath: .spec.suspend + name: Suspend + type: boolean + - jsonPath: .status.lastRunTime + name: Last Run + type: date + - jsonPath: .status.nextRunTime + name: Next Run + type: string + name: v1alpha2 + schema: + openAPIV3Schema: + description: ScheduledRun is the Schema for the scheduledruns API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ScheduledRunSpec defines the desired state of ScheduledRun. + properties: + agentRef: + description: |- + AgentRef is a reference to the Agent to execute. 
If Namespace is empty + it defaults to the ScheduledRun's namespace. + properties: + name: + type: string + namespace: + type: string + required: + - name + type: object + maxRunHistory: + default: 10 + description: MaxRunHistory is the maximum number of run history entries + to retain. + maximum: 100 + minimum: 1 + type: integer + prompt: + description: Prompt is the text prompt to send to the agent on each + run. + minLength: 1 + type: string + schedule: + description: |- + Schedule is a cron expression defining when to run the agent. Standard + 5-field cron syntax (minute hour day-of-month month day-of-week). + minLength: 1 + type: string + suspend: + default: false + description: |- + Suspend pauses cron-driven scheduling when set to true. Manual triggers + via the API still execute; Suspend only gates the cron tick path. + type: boolean + timeZone: + description: |- + TimeZone is an IANA time zone name (e.g. "America/Los_Angeles") used + to interpret Schedule. If empty, the controller process's local time + zone (typically UTC in-cluster) is used. + type: string + required: + - agentRef + - prompt + - schedule + type: object + status: + description: ScheduledRunStatus defines the observed state of ScheduledRun. + properties: + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. 
+ maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + lastRunTime: + format: date-time + type: string + nextRunTime: + format: date-time + type: string + observedGeneration: + format: int64 + type: integer + runHistory: + items: + description: |- + RunHistoryEntry records one execution of a scheduled run. DispatchStatus + is set synchronously when the A2A call returns; Outcome is set + asynchronously by polling the session. + properties: + completionTime: + format: date-time + type: string + dispatchMessage: + type: string + dispatchStatus: + description: |- + DispatchStatus reflects whether the A2A SendMessage call to the agent pod + succeeded. 
It says nothing about the LLM result — that lives in [RunOutcome]. + enum: + - Dispatched + - DispatchFailed + type: string + outcome: + description: |- + RunOutcome reflects the terminal state of the agent run, resolved + asynchronously by polling the session's task state after dispatch returns. + "Pending" means polling is still in progress (or was abandoned because the + controller restarted before the session terminated). + enum: + - Pending + - Succeeded + - Failed + - Timeout + type: string + outcomeMessage: + type: string + outcomeTime: + format: date-time + type: string + sessionId: + type: string + startTime: + format: date-time + type: string + required: + - dispatchStatus + - startTime + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/go/api/v1alpha2/scheduledrun_types.go b/go/api/v1alpha2/scheduledrun_types.go new file mode 100644 index 000000000..c4d44c0a5 --- /dev/null +++ b/go/api/v1alpha2/scheduledrun_types.go @@ -0,0 +1,165 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// AnnotationCreatedBy records the user identity that created a ScheduledRun. +// The scheduler uses this value as the session userID so the user who created +// the schedule can read the resulting session in the UI. 
+const AnnotationCreatedBy = "kagent.dev/created-by"
+
+// DispatchStatus reflects whether the A2A SendMessage call to the agent pod
+// succeeded. It says nothing about the LLM result — that lives in [RunOutcome].
+// +kubebuilder:validation:Enum=Dispatched;DispatchFailed
+type DispatchStatus string
+
+const (
+	// DispatchStatusDispatched means the A2A SendMessage call returned without
+	// error. The session was created and the prompt was accepted by the agent
+	// pod. The model invocation result is recorded separately in Outcome.
+	DispatchStatusDispatched DispatchStatus = "Dispatched"
+	// DispatchStatusFailed means dispatch itself failed: session create
+	// error, A2A client error, agent pod 5xx, or panic in the dispatch path.
+	DispatchStatusFailed DispatchStatus = "DispatchFailed"
+)
+
+// RunOutcome reflects the terminal state of the agent run, resolved
+// asynchronously by polling the session's task state after dispatch returns.
+// "Pending" means polling is still in progress (or was abandoned because the
+// controller restarted before the session terminated).
+// +kubebuilder:validation:Enum=Pending;Succeeded;Failed;Timeout
+type RunOutcome string
+
+const (
+	// RunOutcomePending means the run was dispatched but no terminal task
+	// state has been observed yet. Either polling is in progress, or the
+	// controller restarted before resolution and the entry is now orphaned.
+	RunOutcomePending RunOutcome = "Pending"
+	// RunOutcomeSucceeded means the session's last task reached
+	// TaskStateCompleted.
+	RunOutcomeSucceeded RunOutcome = "Succeeded"
+	// RunOutcomeFailed means the session's last task reached a non-success
+	// terminal state (failed, canceled, rejected).
+	RunOutcomeFailed RunOutcome = "Failed"
+	// RunOutcomeTimeout means polling exceeded the configured budget without
+	// observing a terminal state.
+	RunOutcomeTimeout RunOutcome = "Timeout"
+)
+
+// AgentReference holds a reference to an Agent resource. AgentRef.Namespace
+// may name a namespace different from the ScheduledRun's own — operators are
+// responsible for ensuring the cross-namespace reference is intended (the
+// controller does not enforce namespace boundaries).
+type AgentReference struct {
+	Name string `json:"name"`
+	// +optional
+	Namespace string `json:"namespace,omitempty"`
+}
+
+// ScheduledRunSpec defines the desired state of ScheduledRun.
+type ScheduledRunSpec struct {
+	// Schedule is a cron expression defining when to run the agent. Standard
+	// 5-field cron syntax (minute hour day-of-month month day-of-week).
+	// +kubebuilder:validation:MinLength=1
+	Schedule string `json:"schedule"`
+
+	// TimeZone is an IANA time zone name (e.g. "America/Los_Angeles") used
+	// to interpret Schedule. If empty, the controller process's local time
+	// zone (typically UTC in-cluster) is used.
+	// +optional
+	TimeZone string `json:"timeZone,omitempty"`
+
+	// AgentRef is a reference to the Agent to execute. If Namespace is empty
+	// it defaults to the ScheduledRun's namespace.
+	AgentRef AgentReference `json:"agentRef"`
+
+	// Prompt is the text prompt to send to the agent on each run.
+	// +kubebuilder:validation:MinLength=1
+	Prompt string `json:"prompt"`
+
+	// Suspend pauses cron-driven scheduling when set to true. Manual triggers
+	// via the API still execute; Suspend only gates the cron tick path.
+	// +optional
+	// +kubebuilder:default=false
+	Suspend bool `json:"suspend,omitempty"`
+
+	// MaxRunHistory is the maximum number of run history entries to retain.
+	// +optional
+	// +kubebuilder:default=10
+	// +kubebuilder:validation:Minimum=1
+	// +kubebuilder:validation:Maximum=100
+	MaxRunHistory int `json:"maxRunHistory,omitempty"`
+}
+
+// RunHistoryEntry records one execution of a scheduled run. DispatchStatus
+// is set synchronously when the A2A call returns; Outcome is set
+// asynchronously by polling the session.
+type RunHistoryEntry struct { + StartTime metav1.Time `json:"startTime"` + CompletionTime *metav1.Time `json:"completionTime,omitempty"` + DispatchStatus DispatchStatus `json:"dispatchStatus"` + DispatchMessage string `json:"dispatchMessage,omitempty"` + SessionID string `json:"sessionId,omitempty"` + Outcome RunOutcome `json:"outcome,omitempty"` + OutcomeMessage string `json:"outcomeMessage,omitempty"` + OutcomeTime *metav1.Time `json:"outcomeTime,omitempty"` +} + +// ScheduledRunStatus defines the observed state of ScheduledRun. +type ScheduledRunStatus struct { + LastRunTime *metav1.Time `json:"lastRunTime,omitempty"` + NextRunTime *metav1.Time `json:"nextRunTime,omitempty"` + RunHistory []RunHistoryEntry `json:"runHistory,omitempty"` + Conditions []metav1.Condition `json:"conditions,omitempty"` + ObservedGeneration int64 `json:"observedGeneration,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Schedule",type="string",JSONPath=".spec.schedule" +// +kubebuilder:printcolumn:name="Suspend",type="boolean",JSONPath=".spec.suspend" +// +kubebuilder:printcolumn:name="Last Run",type="date",JSONPath=".status.lastRunTime" +// +kubebuilder:printcolumn:name="Next Run",type="string",JSONPath=".status.nextRunTime" +// +kubebuilder:storageversion + +// ScheduledRun is the Schema for the scheduledruns API. +type ScheduledRun struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec ScheduledRunSpec `json:"spec,omitempty"` + Status ScheduledRunStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// ScheduledRunList contains a list of ScheduledRun. 
+type ScheduledRunList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ScheduledRun `json:"items"` +} + +func init() { + SchemeBuilder.Register(func(s *runtime.Scheme) error { + s.AddKnownTypes(GroupVersion, &ScheduledRun{}, &ScheduledRunList{}) + return nil + }) +} diff --git a/go/api/v1alpha2/zz_generated.deepcopy.go b/go/api/v1alpha2/zz_generated.deepcopy.go index 136058bce..2654366a3 100644 --- a/go/api/v1alpha2/zz_generated.deepcopy.go +++ b/go/api/v1alpha2/zz_generated.deepcopy.go @@ -381,6 +381,21 @@ func (in *AgentList) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgentReference) DeepCopyInto(out *AgentReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgentReference. +func (in *AgentReference) DeepCopy() *AgentReference { + if in == nil { + return nil + } + out := new(AgentReference) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AgentSkill) DeepCopyInto(out *AgentSkill) { *out = *in @@ -1439,6 +1454,30 @@ func (in *RemoteMCPServerStatus) DeepCopy() *RemoteMCPServerStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RunHistoryEntry) DeepCopyInto(out *RunHistoryEntry) { + *out = *in + in.StartTime.DeepCopyInto(&out.StartTime) + if in.CompletionTime != nil { + in, out := &in.CompletionTime, &out.CompletionTime + *out = (*in).DeepCopy() + } + if in.OutcomeTime != nil { + in, out := &in.OutcomeTime, &out.OutcomeTime + *out = (*in).DeepCopy() + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RunHistoryEntry. 
+func (in *RunHistoryEntry) DeepCopy() *RunHistoryEntry { + if in == nil { + return nil + } + out := new(RunHistoryEntry) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SAPAICoreConfig) DeepCopyInto(out *SAPAICoreConfig) { *out = *in @@ -1533,6 +1572,118 @@ func (in *SandboxConfig) DeepCopy() *SandboxConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledRun) DeepCopyInto(out *ScheduledRun) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledRun. +func (in *ScheduledRun) DeepCopy() *ScheduledRun { + if in == nil { + return nil + } + out := new(ScheduledRun) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ScheduledRun) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledRunList) DeepCopyInto(out *ScheduledRunList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ScheduledRun, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledRunList. 
+func (in *ScheduledRunList) DeepCopy() *ScheduledRunList { + if in == nil { + return nil + } + out := new(ScheduledRunList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ScheduledRunList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledRunSpec) DeepCopyInto(out *ScheduledRunSpec) { + *out = *in + out.AgentRef = in.AgentRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledRunSpec. +func (in *ScheduledRunSpec) DeepCopy() *ScheduledRunSpec { + if in == nil { + return nil + } + out := new(ScheduledRunSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScheduledRunStatus) DeepCopyInto(out *ScheduledRunStatus) { + *out = *in + if in.LastRunTime != nil { + in, out := &in.LastRunTime, &out.LastRunTime + *out = (*in).DeepCopy() + } + if in.NextRunTime != nil { + in, out := &in.NextRunTime, &out.NextRunTime + *out = (*in).DeepCopy() + } + if in.RunHistory != nil { + in, out := &in.RunHistory, &out.RunHistory + *out = make([]RunHistoryEntry, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScheduledRunStatus. 
+func (in *ScheduledRunStatus) DeepCopy() *ScheduledRunStatus { + if in == nil { + return nil + } + out := new(ScheduledRunStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SecretReference) DeepCopyInto(out *SecretReference) { *out = *in diff --git a/go/core/internal/controller/scheduledrun_controller.go b/go/core/internal/controller/scheduledrun_controller.go new file mode 100644 index 000000000..7b94cbd04 --- /dev/null +++ b/go/core/internal/controller/scheduledrun_controller.go @@ -0,0 +1,220 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package controller
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/robfig/cron/v3"
+	"k8s.io/apimachinery/pkg/api/meta"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/builder"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller"
+	"sigs.k8s.io/controller-runtime/pkg/handler"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/predicate"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	"github.com/kagent-dev/kagent/go/api/v1alpha2"
+)
+
+const (
+	ScheduledRunConditionTypeAccepted = "Accepted"
+
+	// scheduledRunAgentRefIndex is the field index for ScheduledRun→Agent
+	// reverse lookup. The composite "namespace/name" key sidesteps the
+	// per-SR namespace defaulting we'd otherwise have to replay in the
+	// EventHandler.
+	scheduledRunAgentRefIndex = "spec.agentRef"
+)
+
+// ScheduledRunController reconciles a ScheduledRun object
+type ScheduledRunController struct {
+	Scheme    *runtime.Scheme
+	Kube      client.Client
+	Scheduler *ScheduledRunScheduler
+}
+
+// +kubebuilder:rbac:groups=kagent.dev,resources=scheduledruns,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=kagent.dev,resources=scheduledruns/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=kagent.dev,resources=scheduledruns/finalizers,verbs=update
+
+// Reconcile validates a ScheduledRun and keeps its cron entry in sync with
+// the spec. The spec is checked in order: time zone, cron expression, then
+// existence of the referenced Agent. Any failed check records
+// Accepted=False with a specific reason and removes the cron entry; a spec
+// that passes all checks is (re)registered with the scheduler and marked
+// Accepted=True. A ScheduledRun that no longer exists has its cron entry
+// removed.
+//
+// Validation failures are terminal for the current generation, so they
+// return a nil error (no requeue). Errors are returned only for API
+// failures: fetching the ScheduledRun or Agent, registering the schedule,
+// or writing status.
+func (r *ScheduledRunController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	var sr v1alpha2.ScheduledRun
+	if err := r.Kube.Get(ctx, req.NamespacedName, &sr); err != nil {
+		if client.IgnoreNotFound(err) == nil {
+			r.Scheduler.RemoveSchedule(req.NamespacedName)
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, fmt.Errorf("failed to get ScheduledRun: %w", err)
+	}
+
+	// Validate spec.timeZone is a known IANA name. Done before the cron
+	// parse so a bad TZ surfaces as "InvalidTimeZone" instead of being
+	// re-reported as a generic "InvalidSchedule" by the parser.
+	if sr.Spec.TimeZone != "" {
+		if _, err := time.LoadLocation(sr.Spec.TimeZone); err != nil {
+			return r.rejectScheduledRun(ctx, &sr, "InvalidTimeZone",
+				fmt.Sprintf("Invalid time zone %q: %v", sr.Spec.TimeZone, err))
+		}
+	}
+
+	// Validate cron expression (with optional CRON_TZ embedded via spec.timeZone).
+	if _, err := cron.ParseStandard(scheduleSpecForCron(&sr)); err != nil {
+		return r.rejectScheduledRun(ctx, &sr, "InvalidSchedule",
+			fmt.Sprintf("Invalid cron expression: %v", err))
+	}
+
+	// Validate agent ref exists
+	agentNamespace := sr.Spec.AgentRef.Namespace
+	if agentNamespace == "" {
+		agentNamespace = sr.Namespace
+	}
+	var agent v1alpha2.Agent
+	agentKey := types.NamespacedName{Name: sr.Spec.AgentRef.Name, Namespace: agentNamespace}
+	if err := r.Kube.Get(ctx, agentKey, &agent); err != nil {
+		if client.IgnoreNotFound(err) == nil {
+			// Agent disappeared (or agentRef was edited to a missing one).
+			return r.rejectScheduledRun(ctx, &sr, "AgentNotFound",
+				fmt.Sprintf("Agent %s not found", agentKey))
+		}
+		return ctrl.Result{}, fmt.Errorf("failed to check agent ref: %w", err)
+	}
+
+	// Update the cron schedule. NextRunTime is owned by the scheduler — it
+	// re-computes after each fire so the user sees the freshest value.
+	if err := r.Scheduler.UpdateSchedule(&sr); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to update schedule: %w", err)
+	}
+
+	meta.SetStatusCondition(&sr.Status.Conditions, metav1.Condition{
+		Type:               ScheduledRunConditionTypeAccepted,
+		Status:             metav1.ConditionTrue,
+		Reason:             "ScheduleAccepted",
+		Message:            "ScheduledRun is accepted and scheduled",
+		ObservedGeneration: sr.Generation,
+	})
+	sr.Status.ObservedGeneration = sr.Generation
+
+	if err := r.Kube.Status().Update(ctx, &sr); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to update status: %w", err)
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// rejectScheduledRun records Accepted=False with the given reason and message
+// and removes any cron entry registered for sr.
+//
+// The entry is removed on every rejection, not only when the agent is
+// missing: a spec edited from valid to invalid would otherwise keep firing
+// on its previous schedule while status reports the run as not accepted.
+// Returns an error only when the status write fails.
+func (r *ScheduledRunController) rejectScheduledRun(ctx context.Context, sr *v1alpha2.ScheduledRun, reason, message string) (ctrl.Result, error) {
+	r.Scheduler.RemoveSchedule(types.NamespacedName{Name: sr.Name, Namespace: sr.Namespace})
+	log.FromContext(ctx).Info("ScheduledRun rejected; cron entry removed", "reason", reason, "message", message)
+
+	meta.SetStatusCondition(&sr.Status.Conditions, metav1.Condition{
+		Type:               ScheduledRunConditionTypeAccepted,
+		Status:             metav1.ConditionFalse,
+		Reason:             reason,
+		Message:            message,
+		ObservedGeneration: sr.Generation,
+	})
+	sr.Status.ObservedGeneration = sr.Generation
+	if err := r.Kube.Status().Update(ctx, sr); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to update status: %w", err)
+	}
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager sets up the controller with the Manager. Registers the
+// agentRef field index used by the Agent watcher to fan reconcile requests
+// out to only the affected ScheduledRuns.
+func (r *ScheduledRunController) SetupWithManager(mgr ctrl.Manager) error {
+	// agentRefKeys yields the single "namespace/name" index value for a
+	// ScheduledRun, resolving an empty agentRef.namespace to the
+	// ScheduledRun's own namespace.
+	agentRefKeys := func(obj client.Object) []string {
+		scheduledRun, isScheduledRun := obj.(*v1alpha2.ScheduledRun)
+		if !isScheduledRun {
+			return nil
+		}
+		agentNamespace := scheduledRun.Spec.AgentRef.Namespace
+		if agentNamespace == "" {
+			agentNamespace = scheduledRun.Namespace
+		}
+		return []string{agentNamespace + "/" + scheduledRun.Spec.AgentRef.Name}
+	}
+
+	indexErr := mgr.GetFieldIndexer().IndexField(
+		context.Background(),
+		&v1alpha2.ScheduledRun{},
+		scheduledRunAgentRefIndex,
+		agentRefKeys,
+	)
+	if indexErr != nil {
+		return fmt.Errorf("failed to index ScheduledRun by agentRef: %w", indexErr)
+	}
+
+	// The Agent watch lets SRs revisit their cron entry when the referenced
+	// agent appears/disappears: a recreated Agent re-arms the schedule
+	// without operators bumping the SR generation. The generation predicate
+	// filters out Agent status writes (Create/Delete events still fire).
+	return ctrl.NewControllerManagedBy(mgr).
+		WithOptions(controller.Options{
+			NeedLeaderElection: new(true),
+		}).
+		For(
+			&v1alpha2.ScheduledRun{},
+			builder.WithPredicates(predicate.GenerationChangedPredicate{}),
+		).
+		Watches(
+			&v1alpha2.Agent{},
+			handler.EnqueueRequestsFromMapFunc(r.findScheduledRunsForAgent),
+			builder.WithPredicates(predicate.GenerationChangedPredicate{}),
+		).
+		Named("scheduledrun").
+		Complete(r)
+}
+
+// findScheduledRunsForAgent returns reconcile requests for every ScheduledRun
+// whose AgentRef points at the given Agent. Uses the agentRef field index so
+// the lookup is O(matched) instead of O(all SRs).
+func (r *ScheduledRunController) findScheduledRunsForAgent(ctx context.Context, obj client.Object) []reconcile.Request {
+	agentKey := obj.GetNamespace() + "/" + obj.GetName()
+
+	var srList v1alpha2.ScheduledRunList
+	if err := r.Kube.List(ctx, &srList, client.MatchingFields{
+		scheduledRunAgentRefIndex: agentKey,
+	}); err != nil {
+		// A map function has no error return, so a failed lookup can only be
+		// reported through the log. Without this, dependent ScheduledRuns
+		// would silently miss the Agent event.
+		log.FromContext(ctx).Error(err, "failed to list ScheduledRuns referencing Agent", "agent", agentKey)
+		return nil
+	}
+
+	requests := make([]reconcile.Request, 0, len(srList.Items))
+	for i := range srList.Items {
+		sr := &srList.Items[i]
+		requests = append(requests, reconcile.Request{
+			NamespacedName: types.NamespacedName{Name: sr.Name, Namespace: sr.Namespace},
+		})
+	}
+	return requests
+}
diff --git a/go/core/internal/controller/scheduledrun_controller_test.go b/go/core/internal/controller/scheduledrun_controller_test.go new file mode 100644 index 000000000..8b78a6b43 --- /dev/null +++ b/go/core/internal/controller/scheduledrun_controller_test.go @@ -0,0 +1,351 @@ +package controller + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +func TestScheduledRunController_Reconcile(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + newAgent := func(namespace, name string) *v1alpha2.Agent { + return &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + } + } + + newScheduledRun := func(namespace, name, schedule, agentName, agentNamespace string) *v1alpha2.ScheduledRun { + return &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Generation: 1, + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: schedule, + AgentRef: v1alpha2.AgentReference{ + Name:
agentName, + Namespace: agentNamespace, + }, + Prompt: "test prompt", + MaxRunHistory: 10, + }, + } + } + + tests := []struct { + name string + objects []runtime.Object + reqName string + reqNamespace string + wantErr bool + wantCondition metav1.ConditionStatus + wantReason string + wantNotFound bool // when the ScheduledRun doesn't exist + }{ + { + name: "valid schedule - accepted", + objects: []runtime.Object{ + newScheduledRun("default", "my-sr", "0 */2 * * *", "my-agent", "default"), + newAgent("default", "my-agent"), + }, + reqName: "my-sr", + reqNamespace: "default", + wantErr: false, + wantCondition: metav1.ConditionTrue, + wantReason: "ScheduleAccepted", + }, + { + name: "invalid cron expression", + objects: []runtime.Object{ + newScheduledRun("default", "my-sr", "invalid-cron", "my-agent", "default"), + newAgent("default", "my-agent"), + }, + reqName: "my-sr", + reqNamespace: "default", + wantErr: false, + wantCondition: metav1.ConditionFalse, + wantReason: "InvalidSchedule", + }, + { + name: "sub-hourly schedule allowed - every 5 minutes", + objects: []runtime.Object{ + newScheduledRun("default", "my-sr", "*/5 * * * *", "my-agent", "default"), + newAgent("default", "my-agent"), + }, + reqName: "my-sr", + reqNamespace: "default", + wantErr: false, + wantCondition: metav1.ConditionTrue, + wantReason: "ScheduleAccepted", + }, + { + name: "sub-hourly schedule allowed - every 30 minutes", + objects: []runtime.Object{ + newScheduledRun("default", "my-sr", "*/30 * * * *", "my-agent", "default"), + newAgent("default", "my-agent"), + }, + reqName: "my-sr", + reqNamespace: "default", + wantErr: false, + wantCondition: metav1.ConditionTrue, + wantReason: "ScheduleAccepted", + }, + { + name: "agent not found", + objects: []runtime.Object{ + newScheduledRun("default", "my-sr", "0 */2 * * *", "nonexistent-agent", "default"), + }, + reqName: "my-sr", + reqNamespace: "default", + wantErr: false, + wantCondition: metav1.ConditionFalse, + wantReason: "AgentNotFound", + }, 
+ { + name: "scheduledrun not found - deleted", + objects: []runtime.Object{}, + reqName: "deleted-sr", + reqNamespace: "default", + wantErr: false, + wantNotFound: true, + }, + { + name: "agent ref namespace defaults to scheduledrun namespace", + objects: []runtime.Object{ + newScheduledRun("mynamespace", "my-sr", "0 */2 * * *", "my-agent", ""), + newAgent("mynamespace", "my-agent"), + }, + reqName: "my-sr", + reqNamespace: "mynamespace", + wantErr: false, + wantCondition: metav1.ConditionTrue, + wantReason: "ScheduleAccepted", + }, + { + name: "valid schedule - exactly 1 hour interval", + objects: []runtime.Object{ + newScheduledRun("default", "my-sr", "0 * * * *", "my-agent", "default"), + newAgent("default", "my-agent"), + }, + reqName: "my-sr", + reqNamespace: "default", + wantErr: false, + wantCondition: metav1.ConditionTrue, + wantReason: "ScheduleAccepted", + }, + { + name: "invalid time zone", + objects: []runtime.Object{ + func() runtime.Object { + sr := newScheduledRun("default", "tz-bad", "0 9 * * *", "my-agent", "default") + sr.Spec.TimeZone = "Mars/Olympus_Mons" + return sr + }(), + newAgent("default", "my-agent"), + }, + reqName: "tz-bad", + reqNamespace: "default", + wantErr: false, + wantCondition: metav1.ConditionFalse, + wantReason: "InvalidTimeZone", + }, + { + name: "valid time zone accepted", + objects: []runtime.Object{ + func() runtime.Object { + sr := newScheduledRun("default", "tz-ok", "0 9 * * *", "my-agent", "default") + sr.Spec.TimeZone = "America/Los_Angeles" + return sr + }(), + newAgent("default", "my-agent"), + }, + reqName: "tz-ok", + reqNamespace: "default", + wantErr: false, + wantCondition: metav1.ConditionTrue, + wantReason: "ScheduleAccepted", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + clientBuilder := fake.NewClientBuilder(). + WithScheme(scheme). 
+ WithStatusSubresource(&v1alpha2.ScheduledRun{}) + + for _, obj := range tt.objects { + clientBuilder = clientBuilder.WithRuntimeObjects(obj) + } + kubeClient := clientBuilder.Build() + + // Use a real scheduler (not started). UpdateSchedule/RemoveSchedule + // work without the cron engine running. + scheduler := NewScheduledRunScheduler(kubeClient, nil) + + controller := &ScheduledRunController{ + Scheme: scheme, + Kube: kubeClient, + Scheduler: scheduler, + } + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Name: tt.reqName, + Namespace: tt.reqNamespace, + }, + } + + result, err := controller.Reconcile(context.Background(), req) + + if tt.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + assert.Equal(t, ctrl.Result{}, result) + + if tt.wantNotFound { + // For deleted resources, just verify no error and the scheduler cleaned up + return + } + + // Verify status was updated + var sr v1alpha2.ScheduledRun + err = kubeClient.Get(context.Background(), types.NamespacedName{ + Name: tt.reqName, + Namespace: tt.reqNamespace, + }, &sr) + require.NoError(t, err) + + // Check condition + require.NotEmpty(t, sr.Status.Conditions) + cond := sr.Status.Conditions[0] + assert.Equal(t, ScheduledRunConditionTypeAccepted, cond.Type) + assert.Equal(t, tt.wantCondition, cond.Status) + assert.Equal(t, tt.wantReason, cond.Reason) + + // NextRunTime is now owned by the scheduler (post-run), not the + // reconciler — so it should be nil after a reconcile. + assert.Nil(t, sr.Status.NextRunTime) + + // Check observed generation + assert.Equal(t, int64(1), sr.Status.ObservedGeneration) + }) + } +} + +// TestScheduledRunController_AgentNotFound_RemovesCronEntry verifies that when +// a previously-accepted SR has its agentRef change to a non-existent agent, +// the controller removes the cron entry. Otherwise every tick would +// uselessly create a Failed history entry. 
+func TestScheduledRunController_AgentNotFound_RemovesCronEntry(t *testing.T) {
+	scheme := runtime.NewScheme()
+	require.NoError(t, v1alpha2.AddToScheme(scheme))
+
+	key := types.NamespacedName{Name: "sr", Namespace: "default"}
+
+	// The referenced agent "ghost" is never created in the fake client.
+	scheduledRun := &v1alpha2.ScheduledRun{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:       "sr",
+			Namespace:  "default",
+			Generation: 1,
+		},
+		Spec: v1alpha2.ScheduledRunSpec{
+			Schedule: "0 */2 * * *",
+			AgentRef: v1alpha2.AgentReference{Name: "ghost", Namespace: "default"},
+			Prompt:   "test",
+		},
+	}
+
+	builder := fake.NewClientBuilder().WithScheme(scheme)
+	builder = builder.WithStatusSubresource(&v1alpha2.ScheduledRun{})
+	builder = builder.WithRuntimeObjects(scheduledRun)
+	kube := builder.Build()
+
+	scheduler := NewScheduledRunScheduler(kube, nil)
+	hasEntry := func() bool {
+		_, found := scheduler.entries[key]
+		return found
+	}
+
+	// Simulate a prior reconcile that registered the cron entry while the
+	// agent existed.
+	require.NoError(t, scheduler.UpdateSchedule(scheduledRun))
+	require.True(t, hasEntry(), "precondition: entry registered")
+
+	reconciler := &ScheduledRunController{Scheme: scheme, Kube: kube, Scheduler: scheduler}
+	_, err := reconciler.Reconcile(context.Background(), ctrl.Request{NamespacedName: key})
+	require.NoError(t, err)
+
+	assert.False(t, hasEntry(), "entry must be removed when referenced agent disappears")
+}
+
+// TestScheduledRunController_FindScheduledRunsForAgent verifies the Agent
+// watch's mapper enqueues only SRs whose AgentRef points at the given Agent.
+// The empty-namespace case must resolve to the SR's own namespace, matching
+// the controller's resolution rule.
+func TestScheduledRunController_FindScheduledRunsForAgent(t *testing.T) {
+	scheme := runtime.NewScheme()
+	require.NoError(t, v1alpha2.AddToScheme(scheme))
+
+	newSR := func(ns, name, agentName, agentNS string) *v1alpha2.ScheduledRun {
+		return &v1alpha2.ScheduledRun{
+			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns, Generation: 1},
+			Spec: v1alpha2.ScheduledRunSpec{
+				Schedule: "0 */2 * * *",
+				AgentRef: v1alpha2.AgentReference{Name: agentName, Namespace: agentNS},
+				Prompt:   "test",
+			},
+		}
+	}
+
+	kube := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithStatusSubresource(&v1alpha2.ScheduledRun{}).
+		// Same key function the controller registers in SetupWithManager.
+		WithIndex(&v1alpha2.ScheduledRun{}, scheduledRunAgentRefIndex, func(obj client.Object) []string {
+			sr, ok := obj.(*v1alpha2.ScheduledRun)
+			if !ok {
+				return nil
+			}
+			ns := sr.Spec.AgentRef.Namespace
+			if ns == "" {
+				ns = sr.Namespace
+			}
+			return []string{ns + "/" + sr.Spec.AgentRef.Name}
+		}).
+		WithRuntimeObjects(
+			newSR("ns-a", "sr-explicit-match", "agent-x", "ns-a"),
+			newSR("ns-a", "sr-default-ns-match", "agent-x", ""),
+			newSR("ns-a", "sr-other-agent", "agent-y", "ns-a"),
+			// Lives in ns-b and references ns-b/agent-x: same agent name,
+			// different agent.
+			newSR("ns-b", "sr-same-name-other-ns", "agent-x", "ns-b"),
+			// Lives in ns-b but references ns-a/agent-x: a genuine
+			// cross-namespace reference, which the index key supports.
+			newSR("ns-b", "sr-cross-ns-ref", "agent-x", "ns-a"),
+		).
+		Build()
+
+	c := &ScheduledRunController{Scheme: scheme, Kube: kube, Scheduler: NewScheduledRunScheduler(kube, nil)}
+
+	requests := c.findScheduledRunsForAgent(context.Background(), &v1alpha2.Agent{
+		ObjectMeta: metav1.ObjectMeta{Name: "agent-x", Namespace: "ns-a"},
+	})
+
+	got := map[string]bool{}
+	for _, req := range requests {
+		got[req.Namespace+"/"+req.Name] = true
+	}
+	assert.True(t, got["ns-a/sr-explicit-match"], "explicit ns match must enqueue")
+	assert.True(t, got["ns-a/sr-default-ns-match"], "empty agentRef.namespace must resolve to SR namespace")
+	assert.True(t, got["ns-b/sr-cross-ns-ref"], "cross-namespace agentRef pointing at this agent must enqueue")
+	assert.False(t, got["ns-a/sr-other-agent"], "non-matching agent name must NOT enqueue")
+	assert.False(t, got["ns-b/sr-same-name-other-ns"], "same-named agent in another namespace must NOT enqueue")
+	assert.Len(t, requests, 3)
+}
diff --git a/go/core/internal/controller/scheduledrun_scheduler.go b/go/core/internal/controller/scheduledrun_scheduler.go new file mode 100644 index 000000000..22d2110b0 --- /dev/null +++ b/go/core/internal/controller/scheduledrun_scheduler.go @@ -0,0 +1,504 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/go-logr/logr"
+	"github.com/robfig/cron/v3"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/util/retry"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	a2aclient "trpc.group/trpc-go/trpc-a2a-go/client"
+	"trpc.group/trpc-go/trpc-a2a-go/protocol"
+
+	"github.com/kagent-dev/kagent/go/api/database"
+	"github.com/kagent-dev/kagent/go/api/v1alpha2"
+	agenttranslator "github.com/kagent-dev/kagent/go/core/internal/controller/translator/agent"
+	"github.com/kagent-dev/kagent/go/core/internal/metrics"
+	"github.com/kagent-dev/kagent/go/core/internal/utils"
+)
+
+var schedulerLog = ctrl.Log.WithName("scheduledrun-scheduler")
+
+const (
+	// messageMaxBytes caps RunHistoryEntry.{Dispatch,Outcome}Message so a flood
+	// of long error strings cannot blow past the apiserver's status size limit.
+	messageMaxBytes = 1024
+	// drainTimeout is the total budget Start() spends waiting for in-flight
+	// cron runs and outcome pollers after the manager context is cancelled.
+	// Should be less than the pod's terminationGracePeriodSeconds.
+	drainTimeout = 25 * time.Second
+	// statusWriteTimeout bounds the apiserver write that records a run's
+	// outcome.
+	statusWriteTimeout = 10 * time.Second
+	// outcomePollInterval is the interval between session-state polls when
+	// resolving RunOutcome.
+	outcomePollInterval = 5 * time.Second
+	// outcomePollTimeout is the maximum total time spent polling for a
+	// session's terminal state before giving up and recording Outcome=Timeout.
+	outcomePollTimeout = 15 * time.Minute
+)
+
+// cronLoggerAdapter bridges logr.Logger to robfig/cron's logger interface.
+type cronLoggerAdapter struct{ l logr.Logger }
+
+func (a cronLoggerAdapter) Info(msg string, keysAndValues ...interface{}) {
+	a.l.Info(msg, keysAndValues...)
+}
+func (a cronLoggerAdapter) Error(err error, msg string, keysAndValues ...interface{}) {
+	a.l.Error(err, msg, keysAndValues...)
+}
+
+// ScheduledRunScheduler owns the in-process cron engine and the mapping from
+// ScheduledRun objects to their cron entries. It dispatches runs, records them
+// in the object's status and tracks the goroutines that resolve run outcomes.
+type ScheduledRunScheduler struct {
+	kube       client.Client
+	dbClient   database.Client
+	cronEngine *cron.Cron
+
+	// entriesMu guards entries.
+	entriesMu sync.Mutex
+	entries   map[types.NamespacedName]cron.EntryID
+
+	// runCtx is the manager's runtime context, captured in Start. Dispatch
+	// derives from it so in-flight A2A calls cancel cleanly when the
+	// controller is shutting down. Nil before Start; baseCtx falls back to
+	// context.Background() in that window.
+	runCtxMu sync.RWMutex
+	runCtx   context.Context
+
+	// pollersWG tracks outcome-polling goroutines so Start can drain them on
+	// shutdown alongside cron jobs.
+	pollersWG sync.WaitGroup
+
+	// dispatchHook is the agent invocation; tests override it so they don't
+	// need a real A2A server to verify the cron→record-result flow.
+	dispatchHook func(ctx context.Context, sr *v1alpha2.ScheduledRun, sessionID string) error
+	// outcomePollerHook resolves a session to an Outcome; tests override it
+	// (or set it to nil) so they don't need a populated database and so
+	// async writes are deterministic.
+	outcomePollerHook func(ctx context.Context, sessionID, userID string) (v1alpha2.RunOutcome, string, error)
+}
+
+// NewScheduledRunScheduler constructs a scheduler wired to the production
+// dispatch and outcome-polling implementations. The cron engine is created
+// here but not started; Start does that.
+func NewScheduledRunScheduler(kube client.Client, dbClient database.Client) *ScheduledRunScheduler {
+	cronLogger := cronLoggerAdapter{l: schedulerLog}
+	s := &ScheduledRunScheduler{
+		kube:     kube,
+		dbClient: dbClient,
+		// Recover protects the engine: a panic inside any one job no longer
+		// kills the whole cron loop.
+		cronEngine: cron.New(cron.WithChain(cron.Recover(cronLogger))),
+		entries:    make(map[types.NamespacedName]cron.EntryID),
+	}
+	s.dispatchHook = s.runAgentCall
+	s.outcomePollerHook = s.pollSessionOutcome
+	return s
+}
+
+// NeedLeaderElection always returns true.
+// NOTE(review): presumably this makes the manager run the scheduler only on
+// the elected leader — confirm against controller-runtime's runnable contract.
+func (s *ScheduledRunScheduler) NeedLeaderElection() bool {
+	return true
+}
+
+// Start records ctx as the scheduler's runtime context, starts the cron engine
+// and blocks until ctx is cancelled. It then stops the engine and waits for
+// in-flight cron jobs and outcome pollers, spending at most drainTimeout in
+// total across both waits. It always returns nil.
+func (s *ScheduledRunScheduler) Start(ctx context.Context) error {
+	schedulerLog.Info("Starting scheduled run scheduler")
+	s.runCtxMu.Lock()
+	s.runCtx = ctx
+	s.runCtxMu.Unlock()
+
+	s.cronEngine.Start()
+	<-ctx.Done()
+
+	schedulerLog.Info("Stopping scheduled run scheduler, draining in-flight runs")
+	// Both drain phases share one deadline. Giving each phase its own full
+	// drainTimeout would let shutdown take twice the documented budget.
+	drainDeadline := time.Now().Add(drainTimeout)
+
+	stopCtx := s.cronEngine.Stop()
+	select {
+	case <-stopCtx.Done():
+		schedulerLog.Info("All in-flight cron runs drained")
+	case <-time.After(time.Until(drainDeadline)):
+		schedulerLog.Info("Drain timeout exceeded, abandoning in-flight runs",
+			"timeout", drainTimeout)
+	}
+
+	// Wait for outcome pollers (already context-cancelled) to return, using
+	// whatever is left of the shared budget. time.After fires immediately when
+	// the remaining duration is zero or negative.
+	pollersDone := make(chan struct{})
+	go func() {
+		s.pollersWG.Wait()
+		close(pollersDone)
+	}()
+	select {
+	case <-pollersDone:
+	case <-time.After(time.Until(drainDeadline)):
+		schedulerLog.Info("Drain timeout exceeded, abandoning outcome pollers",
+			"timeout", drainTimeout)
+	}
+	return nil
+}
+
+// baseCtx returns the manager's runtime context (set in Start) so dispatch
+// and apiserver calls cancel cleanly when the controller stops. Returns
+// context.Background() before Start has run (e.g. unit tests that don't
+// drive the full manager lifecycle).
+func (s *ScheduledRunScheduler) baseCtx() context.Context {
+	s.runCtxMu.RLock()
+	defer s.runCtxMu.RUnlock()
+	if s.runCtx == nil {
+		return context.Background()
+	}
+	return s.runCtx
+}
+
+// scheduleSpecForCron builds the cron expression handed to robfig/cron,
+// embedding the SR's TimeZone via the parser-supported CRON_TZ= prefix
+// (parser.go:95 in robfig/cron v3).
+func scheduleSpecForCron(sr *v1alpha2.ScheduledRun) string { + if sr.Spec.TimeZone == "" { + return sr.Spec.Schedule + } + return "CRON_TZ=" + sr.Spec.TimeZone + " " + sr.Spec.Schedule +} + +func (s *ScheduledRunScheduler) UpdateSchedule(sr *v1alpha2.ScheduledRun) error { + s.entriesMu.Lock() + defer s.entriesMu.Unlock() + + key := types.NamespacedName{Name: sr.Name, Namespace: sr.Namespace} + + if existingID, ok := s.entries[key]; ok { + s.cronEngine.Remove(existingID) + delete(s.entries, key) + } + + if sr.Spec.Suspend { + metrics.SetActiveSchedules(len(s.entries)) + return nil + } + + entryID, err := s.cronEngine.AddFunc(scheduleSpecForCron(sr), func() { + s.runOnce(key) + }) + if err != nil { + metrics.SetActiveSchedules(len(s.entries)) + return fmt.Errorf("failed to add cron schedule for %s: %w", key, err) + } + + s.entries[key] = entryID + metrics.SetActiveSchedules(len(s.entries)) + return nil +} + +func (s *ScheduledRunScheduler) RemoveSchedule(key types.NamespacedName) { + s.entriesMu.Lock() + defer s.entriesMu.Unlock() + + if existingID, ok := s.entries[key]; ok { + s.cronEngine.Remove(existingID) + delete(s.entries, key) + } + metrics.SetActiveSchedules(len(s.entries)) +} + +// userIDInjector is an HTTP RoundTripper that injects user identity headers +// so the agent runtime knows which user the session belongs to. +type userIDInjector struct { + base http.RoundTripper + userID string +} + +func (t *userIDInjector) RoundTrip(req *http.Request) (*http.Response, error) { + req = req.Clone(req.Context()) + req.Header.Set("X-User-Id", t.userID) + return t.base.RoundTrip(req) +} + +// TriggerManualRun fires a run synchronously through the same code path as +// the cron tick and returns the recorded RunHistoryEntry. Manual triggers +// bypass spec.Suspend by design — Suspend gates only the cron path. 
+func (s *ScheduledRunScheduler) TriggerManualRun(key types.NamespacedName) (*v1alpha2.RunHistoryEntry, error) { + entry := s.runOnce(key) + if entry == nil { + return nil, fmt.Errorf("scheduled run %s not found", key) + } + return entry, nil +} + +// runOnce performs a single agent invocation: read the SR, send the prompt, +// append the outcome to RunHistory, and (for successful dispatches) spawn a +// background poller that resolves the session's terminal state into Outcome. +func (s *ScheduledRunScheduler) runOnce(key types.NamespacedName) *v1alpha2.RunHistoryEntry { + log := schedulerLog.WithValues("scheduledRun", key) + ctx := s.baseCtx() + + var sr v1alpha2.ScheduledRun + if err := s.kube.Get(ctx, key, &sr); err != nil { + if !apierrors.IsNotFound(err) { + log.Error(err, "Failed to fetch ScheduledRun") + } + return nil + } + + sessionID := protocol.GenerateContextID() + startTime := metav1.Now() + dispatchStart := time.Now() + + // Recover from panics inside dispatchHook so the run still ends up in + // RunHistory as a Failed entry instead of vanishing into the cron + // engine's recovery handler. + var dispatchErr error + func() { + defer func() { + if r := recover(); r != nil { + dispatchErr = fmt.Errorf("dispatch panic: %v", r) + log.Error(dispatchErr, "Recovered from dispatch panic") + } + }() + dispatchErr = s.dispatchHook(ctx, &sr, sessionID) + }() + + completionTime := metav1.Now() + entry := v1alpha2.RunHistoryEntry{ + StartTime: startTime, + CompletionTime: &completionTime, + SessionID: sessionID, + DispatchStatus: v1alpha2.DispatchStatusDispatched, + } + if dispatchErr != nil { + log.Error(dispatchErr, "Scheduled run failed") + entry.DispatchStatus = v1alpha2.DispatchStatusFailed + entry.DispatchMessage = truncate(dispatchErr.Error(), messageMaxBytes) + } else { + // Dispatched runs await async outcome resolution. 
+ entry.Outcome = v1alpha2.RunOutcomePending + } + + metrics.ObserveScheduledRunDispatch( + key.Namespace, key.Name, string(entry.DispatchStatus), + time.Since(dispatchStart).Seconds(), + ) + + // Status writes use a fresh bounded ctx so the outcome is recorded even + // when the manager ctx has been cancelled (graceful shutdown path). + writeCtx, cancel := context.WithTimeout(context.Background(), statusWriteTimeout) + defer cancel() + if err := s.updateStatusWithRetry(writeCtx, key, func(latest *v1alpha2.ScheduledRun) { + latest.Status.LastRunTime = &startTime + latest.Status.RunHistory = append(latest.Status.RunHistory, entry) + trimRunHistory(latest) + // Advance NextRunTime here so it doesn't sit stale at the value + // computed by the last reconcile (which may now be in the past). + if sched, err := cron.ParseStandard(scheduleSpecForCron(latest)); err == nil { + next := metav1.NewTime(sched.Next(completionTime.Time)) + latest.Status.NextRunTime = &next + } + }); err != nil { + log.Error(err, "Failed to record run outcome") + } + + if entry.DispatchStatus == v1alpha2.DispatchStatusDispatched { + // Tests can disable async outcome polling by clearing the hook so + // RunHistory entries stay deterministic at Outcome=Pending. + if s.outcomePollerHook != nil { + s.spawnOutcomePoller(key, sessionID, sessionUserID(&sr)) + } + } else { + // Failed dispatches resolve immediately to a Failed outcome for metrics + // purposes — no session was created so polling would be meaningless. + metrics.ObserveScheduledRunOutcome(key.Namespace, key.Name, string(v1alpha2.RunOutcomeFailed)) + } + + return &entry +} + +// spawnOutcomePoller resolves the session's terminal state asynchronously and +// updates the matching RunHistoryEntry. Match is by SessionID, not by index, +// because RunHistory may be trimmed before polling completes. 
+func (s *ScheduledRunScheduler) spawnOutcomePoller(key types.NamespacedName, sessionID, userID string) { + s.pollersWG.Add(1) + go func() { + defer s.pollersWG.Done() + log := schedulerLog.WithValues("scheduledRun", key, "sessionID", sessionID) + + pollCtx, cancel := context.WithTimeout(s.baseCtx(), outcomePollTimeout) + defer cancel() + + outcome, msg, err := s.outcomePollerHook(pollCtx, sessionID, userID) + if err != nil { + log.Error(err, "Outcome polling failed") + outcome = v1alpha2.RunOutcomeTimeout + msg = err.Error() + } + + now := metav1.Now() + writeCtx, writeCancel := context.WithTimeout(context.Background(), statusWriteTimeout) + defer writeCancel() + if err := s.updateStatusWithRetry(writeCtx, key, func(latest *v1alpha2.ScheduledRun) { + for i := range latest.Status.RunHistory { + if latest.Status.RunHistory[i].SessionID == sessionID { + latest.Status.RunHistory[i].Outcome = outcome + latest.Status.RunHistory[i].OutcomeMessage = truncate(msg, messageMaxBytes) + latest.Status.RunHistory[i].OutcomeTime = &now + break + } + } + }); err != nil { + log.Error(err, "Failed to write outcome") + } + metrics.ObserveScheduledRunOutcome(key.Namespace, key.Name, string(outcome)) + }() +} + +// pollSessionOutcome polls the session's task list until a terminal state +// is observed. Returns Succeeded for completed, Failed for the negative +// terminal states, and Timeout if the deadline elapses first. +func (s *ScheduledRunScheduler) pollSessionOutcome(ctx context.Context, sessionID, userID string) (v1alpha2.RunOutcome, string, error) { + t := time.NewTicker(outcomePollInterval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return v1alpha2.RunOutcomeTimeout, "polling deadline exceeded", nil + case <-t.C: + } + tasks, err := s.dbClient.ListTasksForSession(ctx, sessionID) + if err != nil { + // Session row may not exist yet (race with StoreSession commit) — + // keep polling rather than treating transient errors as terminal. 
+ continue + } + for _, task := range tasks { + switch task.Status.State { + case protocol.TaskStateCompleted: + return v1alpha2.RunOutcomeSucceeded, "", nil + case protocol.TaskStateFailed, protocol.TaskStateCanceled, protocol.TaskStateRejected: + msg := "" + if task.Status.Message != nil { + for _, p := range task.Status.Message.Parts { + if tp, ok := p.(*protocol.TextPart); ok { + msg = tp.Text + break + } + } + } + return v1alpha2.RunOutcomeFailed, msg, nil + } + } + } +} + +// runAgentCall is the production dispatchHook: persist the session, resolve +// the agent's A2A endpoint, and send the prompt. +func (s *ScheduledRunScheduler) runAgentCall(ctx context.Context, sr *v1alpha2.ScheduledRun, sessionID string) error { + agentNS := sr.Spec.AgentRef.Namespace + if agentNS == "" { + agentNS = sr.Namespace + } + agentID := utils.ConvertToPythonIdentifier(utils.ResourceRefString(agentNS, sr.Spec.AgentRef.Name)) + + userID := sessionUserID(sr) + + storeCtx, storeCancel := context.WithTimeout(ctx, 30*time.Second) + defer storeCancel() + if err := s.dbClient.StoreSession(storeCtx, &database.Session{ + ID: sessionID, + UserID: userID, + AgentID: &agentID, + }); err != nil { + return fmt.Errorf("failed to create session: %w", err) + } + + var agent v1alpha2.Agent + if err := s.kube.Get(ctx, types.NamespacedName{Namespace: agentNS, Name: sr.Spec.AgentRef.Name}, &agent); err != nil { + return fmt.Errorf("failed to fetch agent: %w", err) + } + agentURL := agenttranslator.GetA2AAgentCard(&agent).URL + + cli, err := a2aclient.NewA2AClient( + agentURL, + a2aclient.WithTimeout(5*time.Minute), + a2aclient.WithHTTPClient(&http.Client{ + Transport: &userIDInjector{ + base: http.DefaultTransport, + userID: userID, + }, + }), + ) + if err != nil { + return fmt.Errorf("failed to create A2A client: %w", err) + } + + if _, err := cli.SendMessage(ctx, protocol.SendMessageParams{ + Message: protocol.Message{ + Kind: protocol.KindMessage, + Role: protocol.MessageRoleUser, + ContextID: 
&sessionID, + Parts: []protocol.Part{protocol.NewTextPart(sr.Spec.Prompt)}, + }, + }); err != nil { + return fmt.Errorf("agent invocation failed: %w", err) + } + return nil +} + +func sessionUserID(sr *v1alpha2.ScheduledRun) string { + if v := sr.Annotations[v1alpha2.AnnotationCreatedBy]; v != "" { + return v + } + return "scheduled-run" +} + +// updateStatusWithRetry refetches the SR and applies mutate, retrying on +// conflict. Status fields are written by both this scheduler (run history, +// timing) and the SR controller (Accepted condition); without retry the +// loser's update is silently dropped. +func (s *ScheduledRunScheduler) updateStatusWithRetry( + ctx context.Context, + key types.NamespacedName, + mutate func(*v1alpha2.ScheduledRun), +) error { + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + var latest v1alpha2.ScheduledRun + if err := s.kube.Get(ctx, key, &latest); err != nil { + return err + } + mutate(&latest) + return s.kube.Status().Update(ctx, &latest) + }) +} + +// trimRunHistory keeps the most recent MaxRunHistory entries. The CRD default +// (10) is applied by the apiserver in production, but unit tests with the +// fake client construct SR objects directly without admission, so we keep +// the runtime fallback to the same value. 
+func trimRunHistory(sr *v1alpha2.ScheduledRun) { + maxHistory := sr.Spec.MaxRunHistory + if maxHistory <= 0 { + maxHistory = 10 + } + if len(sr.Status.RunHistory) > maxHistory { + sr.Status.RunHistory = sr.Status.RunHistory[len(sr.Status.RunHistory)-maxHistory:] + } +} + +func truncate(s string, max int) string { + if len(s) <= max { + return s + } + return s[:max] + "…(truncated)" +} diff --git a/go/core/internal/controller/scheduledrun_scheduler_test.go b/go/core/internal/controller/scheduledrun_scheduler_test.go new file mode 100644 index 000000000..0911e97df --- /dev/null +++ b/go/core/internal/controller/scheduledrun_scheduler_test.go @@ -0,0 +1,332 @@ +package controller + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/kagent-dev/kagent/go/api/v1alpha2" +) + +func TestScheduledRunScheduler_UpdateSchedule(t *testing.T) { + scheduler := NewScheduledRunScheduler(nil, nil) + + t.Run("adds entry for valid schedule", func(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-sr", + Namespace: "default", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */2 * * *", + Prompt: "hello", + }, + } + + err := scheduler.UpdateSchedule(sr) + require.NoError(t, err) + + key := types.NamespacedName{Name: "test-sr", Namespace: "default"} + _, exists := scheduler.entries[key] + assert.True(t, exists, "entry should be registered") + }) + + t.Run("removes entry when suspended", func(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "suspended-sr", + Namespace: "default", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */2 * * *", + Prompt: "hello", + Suspend: false, + }, + } + + err := scheduler.UpdateSchedule(sr) + 
require.NoError(t, err) + + key := types.NamespacedName{Name: "suspended-sr", Namespace: "default"} + _, exists := scheduler.entries[key] + assert.True(t, exists) + + sr.Spec.Suspend = true + err = scheduler.UpdateSchedule(sr) + require.NoError(t, err) + + _, exists = scheduler.entries[key] + assert.False(t, exists, "entry should be removed when suspended") + }) + + t.Run("replaces entry on re-schedule", func(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "replace-sr", + Namespace: "default", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */2 * * *", + Prompt: "hello", + }, + } + + err := scheduler.UpdateSchedule(sr) + require.NoError(t, err) + + key := types.NamespacedName{Name: "replace-sr", Namespace: "default"} + firstID := scheduler.entries[key] + + sr.Spec.Schedule = "0 */3 * * *" + err = scheduler.UpdateSchedule(sr) + require.NoError(t, err) + + secondID := scheduler.entries[key] + assert.NotEqual(t, firstID, secondID, "entry ID should change on re-schedule") + }) + + t.Run("returns error for invalid cron expression", func(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-sr", + Namespace: "default", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "invalid", + Prompt: "hello", + }, + } + + err := scheduler.UpdateSchedule(sr) + require.Error(t, err) + assert.Contains(t, err.Error(), "failed to add cron schedule") + }) + + t.Run("accepts schedule with TimeZone via CRON_TZ prefix", func(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "tz-sr", + Namespace: "default", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 9 * * *", + TimeZone: "America/Los_Angeles", + Prompt: "hello", + }, + } + err := scheduler.UpdateSchedule(sr) + require.NoError(t, err) + }) +} + +func TestScheduledRunScheduler_RemoveSchedule(t *testing.T) { + scheduler := NewScheduledRunScheduler(nil, nil) + + t.Run("removes existing entry", func(t 
*testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "to-remove", + Namespace: "default", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */2 * * *", + Prompt: "hello", + }, + } + + err := scheduler.UpdateSchedule(sr) + require.NoError(t, err) + + key := types.NamespacedName{Name: "to-remove", Namespace: "default"} + _, exists := scheduler.entries[key] + assert.True(t, exists) + + scheduler.RemoveSchedule(key) + _, exists = scheduler.entries[key] + assert.False(t, exists) + }) + + t.Run("no-op for non-existing entry", func(t *testing.T) { + key := types.NamespacedName{Name: "nonexistent", Namespace: "default"} + scheduler.RemoveSchedule(key) + }) +} + +// --- runOnce tests ---------------------------------------------------------- +// +// runOnce is the single code path for both cron ticks and manual triggers. +// We swap dispatchHook to avoid needing a real A2A server, and disable the +// async outcome poller so RunHistory entries stay deterministic. + +func newSchedulerWithFake(t *testing.T, sr *v1alpha2.ScheduledRun) (*ScheduledRunScheduler, types.NamespacedName) { + t.Helper() + scheme := runtime.NewScheme() + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + kube := fake.NewClientBuilder(). + WithScheme(scheme). + WithStatusSubresource(&v1alpha2.ScheduledRun{}). + WithRuntimeObjects(sr). 
+ Build() + + s := NewScheduledRunScheduler(kube, nil) + s.outcomePollerHook = nil // disable async outcome polling for deterministic asserts + return s, types.NamespacedName{Namespace: sr.Namespace, Name: sr.Name} +} + +func TestRunOnce_RecordsDispatched(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{Name: "ok", Namespace: "default"}, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 * * * *", + Prompt: "hi", + AgentRef: v1alpha2.AgentReference{Name: "a", Namespace: "default"}, + }, + } + s, key := newSchedulerWithFake(t, sr) + + called := false + s.dispatchHook = func(_ context.Context, _ *v1alpha2.ScheduledRun, _ string) error { + called = true + return nil + } + + entry, err := s.TriggerManualRun(key) + require.NoError(t, err) + require.NotNil(t, entry) + assert.Equal(t, v1alpha2.DispatchStatusDispatched, entry.DispatchStatus) + assert.Equal(t, v1alpha2.RunOutcomePending, entry.Outcome) + assert.True(t, called) + + got := &v1alpha2.ScheduledRun{} + require.NoError(t, s.kube.Get(context.Background(), key, got)) + require.Len(t, got.Status.RunHistory, 1) + assert.Equal(t, v1alpha2.DispatchStatusDispatched, got.Status.RunHistory[0].DispatchStatus) + assert.Equal(t, v1alpha2.RunOutcomePending, got.Status.RunHistory[0].Outcome) + assert.NotNil(t, got.Status.RunHistory[0].CompletionTime) + assert.Empty(t, got.Status.RunHistory[0].DispatchMessage) +} + +func TestRunOnce_RecordsDispatchFailed(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{Name: "boom", Namespace: "default"}, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 * * * *", + Prompt: "hi", + AgentRef: v1alpha2.AgentReference{Name: "a", Namespace: "default"}, + }, + } + s, key := newSchedulerWithFake(t, sr) + + s.dispatchHook = func(_ context.Context, _ *v1alpha2.ScheduledRun, _ string) error { + return errors.New("agent down") + } + + entry, err := s.TriggerManualRun(key) + require.NoError(t, err) + require.NotNil(t, entry) + assert.Equal(t, 
v1alpha2.DispatchStatusFailed, entry.DispatchStatus) + // Failed dispatches do not start a poller, so Outcome stays empty. + assert.Empty(t, entry.Outcome) + + got := &v1alpha2.ScheduledRun{} + require.NoError(t, s.kube.Get(context.Background(), key, got)) + require.Len(t, got.Status.RunHistory, 1) + assert.Equal(t, v1alpha2.DispatchStatusFailed, got.Status.RunHistory[0].DispatchStatus) + assert.Contains(t, got.Status.RunHistory[0].DispatchMessage, "agent down") +} + +// TestRunOnce_RecoversFromDispatchPanic verifies that a panic inside the +// dispatch path is caught and recorded as a Failed entry instead of +// vanishing into the cron engine's recovery handler. Without this, the +// panic path silently drops the run from RunHistory. +func TestRunOnce_RecoversFromDispatchPanic(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{Name: "panicky", Namespace: "default"}, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 * * * *", + Prompt: "hi", + AgentRef: v1alpha2.AgentReference{Name: "a", Namespace: "default"}, + }, + } + s, key := newSchedulerWithFake(t, sr) + + s.dispatchHook = func(_ context.Context, _ *v1alpha2.ScheduledRun, _ string) error { + panic("simulated dispatch panic") + } + + entry, err := s.TriggerManualRun(key) + require.NoError(t, err) + require.NotNil(t, entry) + assert.Equal(t, v1alpha2.DispatchStatusFailed, entry.DispatchStatus) + assert.Contains(t, entry.DispatchMessage, "dispatch panic") + + got := &v1alpha2.ScheduledRun{} + require.NoError(t, s.kube.Get(context.Background(), key, got)) + require.Len(t, got.Status.RunHistory, 1) + assert.Equal(t, v1alpha2.DispatchStatusFailed, got.Status.RunHistory[0].DispatchStatus) + assert.Contains(t, got.Status.RunHistory[0].DispatchMessage, "dispatch panic") +} + +func TestRunOnce_TrimsHistory(t *testing.T) { + existing := make([]v1alpha2.RunHistoryEntry, 10) + for i := range existing { + existing[i] = v1alpha2.RunHistoryEntry{ + StartTime: 
metav1.NewTime(time.Now().Add(time.Duration(-i) * time.Minute)), + DispatchStatus: v1alpha2.DispatchStatusDispatched, + } + } + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{Name: "trim", Namespace: "default"}, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 * * * *", + Prompt: "hi", + AgentRef: v1alpha2.AgentReference{Name: "a", Namespace: "default"}, + MaxRunHistory: 5, + }, + Status: v1alpha2.ScheduledRunStatus{RunHistory: existing}, + } + s, key := newSchedulerWithFake(t, sr) + s.dispatchHook = func(_ context.Context, _ *v1alpha2.ScheduledRun, _ string) error { return nil } + + _, err := s.TriggerManualRun(key) + require.NoError(t, err) + + got := &v1alpha2.ScheduledRun{} + require.NoError(t, s.kube.Get(context.Background(), key, got)) + assert.Len(t, got.Status.RunHistory, 5) +} + +// TestRunOnce_TruncatesLongDispatchMessage guards the apiserver size budget: +// a flood of long error strings must not push status past the limit. +func TestRunOnce_TruncatesLongDispatchMessage(t *testing.T) { + sr := &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{Name: "longmsg", Namespace: "default"}, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 * * * *", + Prompt: "hi", + AgentRef: v1alpha2.AgentReference{Name: "a", Namespace: "default"}, + }, + } + s, key := newSchedulerWithFake(t, sr) + + long := make([]byte, messageMaxBytes*4) + for i := range long { + long[i] = 'x' + } + s.dispatchHook = func(_ context.Context, _ *v1alpha2.ScheduledRun, _ string) error { + return errors.New(string(long)) + } + + entry, err := s.TriggerManualRun(key) + require.NoError(t, err) + require.LessOrEqual(t, len(entry.DispatchMessage), messageMaxBytes+len("…(truncated)")) +} diff --git a/go/core/internal/httpserver/handlers/agents.go b/go/core/internal/httpserver/handlers/agents.go index c324aa5ed..a1f705563 100644 --- a/go/core/internal/httpserver/handlers/agents.go +++ b/go/core/internal/httpserver/handlers/agents.go @@ -622,6 +622,16 @@ func (h *AgentsHandler) 
HandleDeleteAgent(w ErrorResponseWriter, r *http.Request agent := &v1alpha2.Agent{} err = h.KubeClient.Get(ctx, objKey, agent) if err == nil { + if refs, err := findReferencingScheduledRuns(ctx, h.KubeClient, agentNamespace, agentName); err != nil { + w.RespondWithError(errors.NewInternalServerError("Failed to check ScheduledRun references", err)) + return + } else if len(refs) > 0 { + w.RespondWithError(errors.NewConflictError( + fmt.Sprintf("Agent is referenced by %d ScheduledRun(s): %s. Delete them first.", len(refs), strings.Join(refs, ", ")), + nil, + )) + return + } if err := h.KubeClient.Delete(ctx, agent); err != nil { w.RespondWithError(errors.NewInternalServerError("Failed to delete Agent", err)) return @@ -649,6 +659,16 @@ func (h *AgentsHandler) HandleDeleteAgent(w ErrorResponseWriter, r *http.Request w.RespondWithError(errors.NewNotFoundError("Agent not found", nil)) return } + if refs, err := findReferencingScheduledRuns(ctx, h.KubeClient, agentNamespace, agentName); err != nil { + w.RespondWithError(errors.NewInternalServerError("Failed to check ScheduledRun references", err)) + return + } else if len(refs) > 0 { + w.RespondWithError(errors.NewConflictError( + fmt.Sprintf("Agent is referenced by %d ScheduledRun(s): %s. Delete them first.", len(refs), strings.Join(refs, ", ")), + nil, + )) + return + } if err := h.KubeClient.Delete(ctx, sb); err != nil { w.RespondWithError(errors.NewInternalServerError("Failed to delete AgentHarness", err)) return @@ -657,6 +677,27 @@ func (h *AgentsHandler) HandleDeleteAgent(w ErrorResponseWriter, r *http.Request RespondWithJSON(w, http.StatusOK, api.NewResponse(struct{}{}, "Successfully deleted agent", false)) } +// findReferencingScheduledRuns returns names ("ns/name") of ScheduledRuns whose +// agentRef points at the given agent. An empty agentRef.namespace is treated as +// the agent's own namespace, mirroring controller behavior. 
+func findReferencingScheduledRuns(ctx context.Context, kube client.Client, agentNamespace, agentName string) ([]string, error) { + srList := &v1alpha2.ScheduledRunList{} + if err := kube.List(ctx, srList); err != nil { + return nil, fmt.Errorf("failed to list ScheduledRuns: %w", err) + } + var refs []string + for _, sr := range srList.Items { + ns := sr.Spec.AgentRef.Namespace + if ns == "" { + ns = sr.Namespace + } + if sr.Spec.AgentRef.Name == agentName && ns == agentNamespace { + refs = append(refs, sr.Namespace+"/"+sr.Name) + } + } + return refs, nil +} + func normalizeSandboxAgentForAPI(sa *v1alpha2.SandboxAgent) { if sa == nil { return diff --git a/go/core/internal/httpserver/handlers/handlers.go b/go/core/internal/httpserver/handlers/handlers.go index 13a66adeb..69b90abdd 100644 --- a/go/core/internal/httpserver/handlers/handlers.go +++ b/go/core/internal/httpserver/handlers/handlers.go @@ -29,6 +29,7 @@ type Handlers struct { Checkpoints *CheckpointsHandler CrewAI *CrewAIHandler CurrentUser *CurrentUserHandler + ScheduledRuns *ScheduledRunsHandler } // Base holds common dependencies for all handlers @@ -43,7 +44,7 @@ type Base struct { } // NewHandlers creates a new Handlers instance with all handler components. 
-func NewHandlers(kubeClient client.Client, defaultModelConfig types.NamespacedName, dbService database.Client, watchedNamespaces []string, authorizer auth.Authorizer, proxyURL string, rcnclr reconciler.KagentReconciler, sandboxBackend sandboxbackend.Backend) *Handlers {
+func NewHandlers(kubeClient client.Client, defaultModelConfig types.NamespacedName, dbService database.Client, watchedNamespaces []string, authorizer auth.Authorizer, proxyURL string, rcnclr reconciler.KagentReconciler, sandboxBackend sandboxbackend.Backend, scheduledRunTrigger ScheduledRunTrigger) *Handlers {
 	base := &Base{
 		KubeClient: kubeClient,
 		DefaultModelConfig: defaultModelConfig,
@@ -72,5 +73,6 @@ func NewHandlers(kubeClient client.Client, defaultModelConfig types.NamespacedNa
 		Checkpoints: NewCheckpointsHandler(base),
 		CrewAI: NewCrewAIHandler(base),
 		CurrentUser: NewCurrentUserHandler(),
+		ScheduledRuns: NewScheduledRunsHandler(base, scheduledRunTrigger),
 	}
 }
diff --git a/go/core/internal/httpserver/handlers/scheduledruns.go b/go/core/internal/httpserver/handlers/scheduledruns.go
new file mode 100644
index 000000000..9ac1a3425
--- /dev/null
+++ b/go/core/internal/httpserver/handlers/scheduledruns.go
@@ -0,0 +1,346 @@
+package handlers
+
+import (
+	"net/http"
+	"time"
+
+	"github.com/robfig/cron/v3"
+
+	api "github.com/kagent-dev/kagent/go/api/httpapi"
+	"github.com/kagent-dev/kagent/go/api/v1alpha2"
+	"github.com/kagent-dev/kagent/go/core/internal/httpserver/errors"
+	"github.com/kagent-dev/kagent/go/core/internal/utils"
+	"github.com/kagent-dev/kagent/go/core/pkg/auth"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+// ScheduledRunTrigger is the interface for triggering scheduled runs manually.
+// Implementations run synchronously and return the recorded RunHistoryEntry
+// so the handler can include it in the response.
+type ScheduledRunTrigger interface {
+	TriggerManualRun(key types.NamespacedName) (*v1alpha2.RunHistoryEntry, error)
+}
+
+// ScheduledRunsHandler handles ScheduledRun-related requests
+type ScheduledRunsHandler struct {
+	*Base
+	Scheduler ScheduledRunTrigger
+}
+
+// NewScheduledRunsHandler creates a new ScheduledRunsHandler
+func NewScheduledRunsHandler(base *Base, scheduler ScheduledRunTrigger) *ScheduledRunsHandler {
+	return &ScheduledRunsHandler{Base: base, Scheduler: scheduler}
+}
+
+// ValidateSchedule validates the cron expression syntax and (optionally) the
+// IANA time zone. Both are checked at the API edge so a bad request is
+// rejected with 400 before it ever reaches the controller, where the same
+// invariants are re-checked against the persisted object.
+func ValidateSchedule(schedule, timeZone string) *errors.APIError {
+	if schedule == "" {
+		return errors.NewBadRequestError("spec.schedule is required (request body must be {\"spec\":{...}})", nil)
+	}
+	expr := schedule
+	if timeZone != "" {
+		if _, err := time.LoadLocation(timeZone); err != nil {
+			return errors.NewBadRequestError("Invalid time zone: "+err.Error(), nil)
+		}
+		expr = "CRON_TZ=" + timeZone + " " + schedule
+	}
+	if _, err := cron.ParseStandard(expr); err != nil {
+		return errors.NewBadRequestError("Invalid cron expression: "+err.Error(), nil)
+	}
+	return nil
+}
+
+// HandleListScheduledRuns handles GET /api/scheduledruns requests
+func (h *ScheduledRunsHandler) HandleListScheduledRuns(w ErrorResponseWriter, r *http.Request) {
+	log := ctrllog.FromContext(r.Context()).WithName("scheduledruns-handler").WithValues("operation", "list")
+
+	if err := Check(h.Authorizer, r, auth.Resource{Type: "ScheduledRun"}); err != nil {
+		w.RespondWithError(err)
+		return
+	}
+
+	scheduledRunList := &v1alpha2.ScheduledRunList{}
+	if err := h.KubeClient.List(r.Context(), scheduledRunList); err != nil {
+		w.RespondWithError(errors.NewInternalServerError("Failed to list ScheduledRuns", err))
+		return
+	}
+
+	log.Info("Successfully listed ScheduledRuns", "count", len(scheduledRunList.Items))
+	data := api.NewResponse(scheduledRunList.Items, "Successfully listed ScheduledRuns", false)
+	RespondWithJSON(w, http.StatusOK, data)
+}
+
+// HandleGetScheduledRun handles GET /api/scheduledruns/{namespace}/{name} requests
+func (h *ScheduledRunsHandler) HandleGetScheduledRun(w ErrorResponseWriter, r *http.Request) {
+	log := ctrllog.FromContext(r.Context()).WithName("scheduledruns-handler").WithValues("operation", "get")
+
+	namespace, err := GetPathParam(r, "namespace")
+	if err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Failed to get namespace from path", err))
+		return
+	}
+
+	name, err := GetPathParam(r, "name")
+	if err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Failed to get name from path", err))
+		return
+	}
+
+	log = log.WithValues("namespace", namespace, "name", name)
+
+	if err := Check(h.Authorizer, r, auth.Resource{Type: "ScheduledRun", Name: namespace + "/" + name}); err != nil {
+		w.RespondWithError(err)
+		return
+	}
+
+	sr := &v1alpha2.ScheduledRun{}
+	if err := h.KubeClient.Get(r.Context(), client.ObjectKey{Namespace: namespace, Name: name}, sr); err != nil {
+		if apierrors.IsNotFound(err) {
+			w.RespondWithError(errors.NewNotFoundError("ScheduledRun not found", err))
+			return
+		}
+		w.RespondWithError(errors.NewInternalServerError("Failed to get ScheduledRun", err))
+		return
+	}
+
+	log.Info("Successfully retrieved ScheduledRun")
+	data := api.NewResponse(sr, "Successfully retrieved ScheduledRun", false)
+	RespondWithJSON(w, http.StatusOK, data)
+}
+
+// HandleCreateScheduledRun handles POST /api/scheduledruns requests
+// Responds 400 for an undecodable body, a bad schedule/time zone or a missing
+// name, 409 when the object already exists, and 201 with the created object.
+func (h *ScheduledRunsHandler) HandleCreateScheduledRun(w ErrorResponseWriter, r *http.Request) {
+	log := ctrllog.FromContext(r.Context()).WithName("scheduledruns-handler").WithValues("operation", "create")
+
+	if err := Check(h.Authorizer, r, auth.Resource{Type: "ScheduledRun"}); err != nil {
+		w.RespondWithError(err)
+		return
+	}
+
+	var sr v1alpha2.ScheduledRun
+	if err := DecodeJSONBody(r, &sr); err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Invalid request body", err))
+		return
+	}
+
+	if apiErr := ValidateSchedule(sr.Spec.Schedule, sr.Spec.TimeZone); apiErr != nil {
+		w.RespondWithError(apiErr)
+		return
+	}
+
+	if sr.Name == "" && sr.GenerateName == "" {
+		w.RespondWithError(errors.NewBadRequestError("metadata.name is required", nil))
+		return
+	}
+
+	if sr.Namespace == "" {
+		sr.Namespace = utils.GetResourceNamespace()
+	}
+
+	// Record the creating user so the scheduler can attribute sessions back
+	// to them — without this the session is invisible to the UI.
+	if userID, err := getUserIDOrAgentUser(r); err == nil && userID != "" {
+		if sr.Annotations == nil {
+			sr.Annotations = map[string]string{}
+		}
+		sr.Annotations[v1alpha2.AnnotationCreatedBy] = userID
+	}
+
+	log = log.WithValues("namespace", sr.Namespace, "name", sr.Name)
+
+	if err := h.KubeClient.Create(r.Context(), &sr); err != nil {
+		// Surface client-correctable failures as 4xx instead of a generic 500.
+		switch {
+		case apierrors.IsAlreadyExists(err):
+			w.RespondWithError(errors.NewConflictError("ScheduledRun already exists", err))
+		case apierrors.IsInvalid(err):
+			w.RespondWithError(errors.NewBadRequestError("Invalid ScheduledRun", err))
+		default:
+			w.RespondWithError(errors.NewInternalServerError("Failed to create ScheduledRun", err))
+		}
+		return
+	}
+
+	log.Info("Successfully created ScheduledRun")
+	data := api.NewResponse(sr, "Successfully created ScheduledRun", false)
+	RespondWithJSON(w, http.StatusCreated, data)
+}
+
+// HandleUpdateScheduledRun handles PUT /api/scheduledruns/{namespace}/{name} requests
+// Only spec is taken from the request body; the rest of the stored object is
+// written back as read. Responds 409 when the write hits a version conflict.
+func (h *ScheduledRunsHandler) HandleUpdateScheduledRun(w ErrorResponseWriter, r *http.Request) {
+	log := ctrllog.FromContext(r.Context()).WithName("scheduledruns-handler").WithValues("operation", "update")
+
+	namespace, err := GetPathParam(r, "namespace")
+	if err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Failed to get namespace from path", err))
+		return
+	}
+
+	name, err := GetPathParam(r, "name")
+	if err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Failed to get name from path", err))
+		return
+	}
+
+	log = log.WithValues("namespace", namespace, "name", name)
+
+	if err := Check(h.Authorizer, r, auth.Resource{Type: "ScheduledRun", Name: namespace + "/" + name}); err != nil {
+		w.RespondWithError(err)
+		return
+	}
+
+	var incoming v1alpha2.ScheduledRun
+	if err := DecodeJSONBody(r, &incoming); err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Invalid request body", err))
+		return
+	}
+
+	if apiErr := ValidateSchedule(incoming.Spec.Schedule, incoming.Spec.TimeZone); apiErr != nil {
+		w.RespondWithError(apiErr)
+		return
+	}
+
+	existing := &v1alpha2.ScheduledRun{}
+	if err := h.KubeClient.Get(r.Context(), client.ObjectKey{Namespace: namespace, Name: name}, existing); err != nil {
+		if apierrors.IsNotFound(err) {
+			w.RespondWithError(errors.NewNotFoundError("ScheduledRun not found", err))
+			return
+		}
+		w.RespondWithError(errors.NewInternalServerError("Failed to get ScheduledRun", err))
+		return
+	}
+
+	existing.Spec = incoming.Spec
+
+	// Preserve created-by annotation across updates; if missing, set from
+	// current request user.
+	if existing.Annotations == nil {
+		existing.Annotations = map[string]string{}
+	}
+	if existing.Annotations[v1alpha2.AnnotationCreatedBy] == "" {
+		if userID, err := getUserIDOrAgentUser(r); err == nil && userID != "" {
+			existing.Annotations[v1alpha2.AnnotationCreatedBy] = userID
+		}
+	}
+
+	if err := h.KubeClient.Update(r.Context(), existing); err != nil {
+		// Surface client-correctable failures as 4xx instead of a generic 500.
+		switch {
+		case apierrors.IsConflict(err):
+			w.RespondWithError(errors.NewConflictError("ScheduledRun was modified concurrently; retry the update", err))
+		case apierrors.IsInvalid(err):
+			w.RespondWithError(errors.NewBadRequestError("Invalid ScheduledRun", err))
+		default:
+			w.RespondWithError(errors.NewInternalServerError("Failed to update ScheduledRun", err))
+		}
+		return
+	}
+
+	log.Info("Successfully updated ScheduledRun")
+	data := api.NewResponse(existing, "Successfully updated ScheduledRun", false)
+	RespondWithJSON(w, http.StatusOK, data)
+}
+
+// HandleDeleteScheduledRun handles DELETE /api/scheduledruns/{namespace}/{name} requests
+func (h *ScheduledRunsHandler) HandleDeleteScheduledRun(w ErrorResponseWriter, r *http.Request) {
+	log := ctrllog.FromContext(r.Context()).WithName("scheduledruns-handler").WithValues("operation", "delete")
+
+	namespace, err := GetPathParam(r, "namespace")
+	if err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Failed to get namespace from path", err))
+		return
+	}
+
+	name, err := GetPathParam(r, "name")
+	if err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Failed to get name from path", err))
+		return
+	}
+
+	log = log.WithValues("namespace", namespace, "name", name)
+
+	if err := Check(h.Authorizer, r, auth.Resource{Type: "ScheduledRun", Name: namespace + "/" + name}); err != nil {
+		w.RespondWithError(err)
+		return
+	}
+
+	sr := &v1alpha2.ScheduledRun{}
+	if err := h.KubeClient.Get(r.Context(), client.ObjectKey{Namespace: namespace, Name: name}, sr); err != nil {
+		if apierrors.IsNotFound(err) {
+			w.RespondWithError(errors.NewNotFoundError("ScheduledRun not found", err))
+			return
+		}
+		w.RespondWithError(errors.NewInternalServerError("Failed to get ScheduledRun", err))
+		return
+	}
+
+	if err := h.KubeClient.Delete(r.Context(), sr); err != nil {
+		w.RespondWithError(errors.NewInternalServerError("Failed to delete ScheduledRun", err))
+		return
+	}
+
+	log.Info("Successfully deleted ScheduledRun")
+	data := api.NewResponse(struct{}{}, "Successfully deleted ScheduledRun", false)
+	RespondWithJSON(w, http.StatusOK, data)
+}
+
+// HandleTriggerScheduledRun handles POST /api/scheduledruns/{namespace}/{name}/trigger requests
+// The ScheduledRun must exist (404 otherwise); the run itself is delegated to
+// the configured ScheduledRunTrigger and its RunHistoryEntry is returned.
+func (h *ScheduledRunsHandler) HandleTriggerScheduledRun(w ErrorResponseWriter, r *http.Request) {
+	log := ctrllog.FromContext(r.Context()).WithName("scheduledruns-handler").WithValues("operation", "trigger")
+
+	namespace, err := GetPathParam(r, "namespace")
+	if err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Failed to get namespace from path", err))
+		return
+	}
+
+	name, err := GetPathParam(r, "name")
+	if err != nil {
+		w.RespondWithError(errors.NewBadRequestError("Failed to get name from path", err))
+		return
+	}
+
+	log = log.WithValues("namespace", namespace, "name", name)
+
+	if err := Check(h.Authorizer, r, auth.Resource{Type: "ScheduledRun", Name: namespace + "/" + name}); err != nil {
+		w.RespondWithError(err)
+		return
+	}
+
+	sr := &v1alpha2.ScheduledRun{}
+	if err := h.KubeClient.Get(r.Context(), client.ObjectKey{Namespace: namespace, Name: name}, sr); err != nil {
+		if apierrors.IsNotFound(err) {
+			w.RespondWithError(errors.NewNotFoundError("ScheduledRun not found", err))
+			return
+		}
+		w.RespondWithError(errors.NewInternalServerError("Failed to get ScheduledRun", err))
+		return
+	}
+
+	// A nil trigger would panic on the call below; fail the request instead.
+	if h.Scheduler == nil {
+		w.RespondWithError(errors.NewInternalServerError("ScheduledRun scheduler is not configured", nil))
+		return
+	}
+
+	log.Info("Manually triggering ScheduledRun")
+	entry, err := h.Scheduler.TriggerManualRun(types.NamespacedName{Namespace: namespace, Name: name})
+	if err != nil {
+		w.RespondWithError(errors.NewInternalServerError("Failed to trigger ScheduledRun", err))
+		return
+	}
+	data := api.NewResponse(entry, "ScheduledRun triggered successfully", false)
+	RespondWithJSON(w, http.StatusOK, data)
+}
diff --git a/go/core/internal/httpserver/handlers/scheduledruns_test.go b/go/core/internal/httpserver/handlers/scheduledruns_test.go
new file mode 100644
index 000000000..a35f9bc2b
--- /dev/null
+++ b/go/core/internal/httpserver/handlers/scheduledruns_test.go
@@ -0,0 +1,429 @@
+package handlers_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/gorilla/mux"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	"github.com/kagent-dev/kagent/go/api/v1alpha2"
+	"github.com/kagent-dev/kagent/go/core/internal/httpserver/auth"
+	"github.com/kagent-dev/kagent/go/core/internal/httpserver/handlers"
+)
+
+// mockScheduledRunTrigger implements handlers.ScheduledRunTrigger for testing.
+type mockScheduledRunTrigger struct { + triggered []types.NamespacedName + entry *v1alpha2.RunHistoryEntry + err error +} + +func (m *mockScheduledRunTrigger) TriggerManualRun(key types.NamespacedName) (*v1alpha2.RunHistoryEntry, error) { + m.triggered = append(m.triggered, key) + return m.entry, m.err +} + +func TestScheduledRunsHandler(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + setupHandler := func(objects ...runtime.Object) (*handlers.ScheduledRunsHandler, *mockScheduledRunTrigger, *mockErrorResponseWriter) { + clientBuilder := fake.NewClientBuilder(). + WithScheme(scheme). + WithStatusSubresource(&v1alpha2.ScheduledRun{}) + for _, obj := range objects { + clientBuilder = clientBuilder.WithRuntimeObjects(obj) + } + kubeClient := clientBuilder.Build() + + trigger := &mockScheduledRunTrigger{} + base := &handlers.Base{ + KubeClient: kubeClient, + Authorizer: &auth.NoopAuthorizer{}, + } + handler := handlers.NewScheduledRunsHandler(base, trigger) + responseRecorder := newMockErrorResponseWriter() + return handler, trigger, responseRecorder + } + + newSR := func(namespace, name, schedule string) *v1alpha2.ScheduledRun { + return &v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: schedule, + AgentRef: v1alpha2.AgentReference{ + Name: "my-agent", + Namespace: namespace, + }, + Prompt: "test prompt", + MaxRunHistory: 10, + }, + } + } + + t.Run("HandleListScheduledRuns", func(t *testing.T) { + t.Run("empty list", func(t *testing.T) { + handler, _, w := setupHandler() + + req := httptest.NewRequest("GET", "/api/scheduledruns", nil) + req = setUser(req, "test-user") + handler.HandleListScheduledRuns(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + }) + + t.Run("list with items", func(t *testing.T) { + sr := newSR("default", "sr-1", "0 */2 * * *") + handler, _, w := setupHandler(sr) + + req := 
httptest.NewRequest("GET", "/api/scheduledruns", nil) + req = setUser(req, "test-user") + handler.HandleListScheduledRuns(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + assert.Contains(t, w.Body.String(), "sr-1") + }) + }) + + t.Run("HandleGetScheduledRun", func(t *testing.T) { + t.Run("success", func(t *testing.T) { + sr := newSR("default", "sr-1", "0 */2 * * *") + handler, _, w := setupHandler(sr) + + req := httptest.NewRequest("GET", "/api/scheduledruns/default/sr-1", nil) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "sr-1"}) + req = setUser(req, "test-user") + handler.HandleGetScheduledRun(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + assert.Contains(t, w.Body.String(), "sr-1") + }) + + t.Run("not found", func(t *testing.T) { + handler, _, w := setupHandler() + + req := httptest.NewRequest("GET", "/api/scheduledruns/default/nonexistent", nil) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "nonexistent"}) + req = setUser(req, "test-user") + handler.HandleGetScheduledRun(w, req) + + assert.Equal(t, http.StatusNotFound, w.Code) + }) + }) + + t.Run("HandleCreateScheduledRun", func(t *testing.T) { + t.Run("success", func(t *testing.T) { + handler, _, w := setupHandler() + + sr := v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "new-sr", + Namespace: "default", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */2 * * *", + AgentRef: v1alpha2.AgentReference{Name: "agent", Namespace: "default"}, + Prompt: "do something", + }, + } + body, _ := json.Marshal(sr) + + req := httptest.NewRequest("POST", "/api/scheduledruns", bytes.NewBuffer(body)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + handler.HandleCreateScheduledRun(w, req) + + assert.Equal(t, http.StatusCreated, w.Code) + assert.Contains(t, w.Body.String(), "new-sr") + }) + + t.Run("invalid schedule - bad cron syntax", func(t *testing.T) { + handler, _, w := setupHandler() + 
+ sr := v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "new-sr", + Namespace: "default", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "not-a-cron", + AgentRef: v1alpha2.AgentReference{Name: "agent", Namespace: "default"}, + Prompt: "do something", + }, + } + body, _ := json.Marshal(sr) + + req := httptest.NewRequest("POST", "/api/scheduledruns", bytes.NewBuffer(body)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + handler.HandleCreateScheduledRun(w, req) + + assert.Equal(t, http.StatusBadRequest, w.Code) + }) + + t.Run("invalid body", func(t *testing.T) { + handler, _, w := setupHandler() + + req := httptest.NewRequest("POST", "/api/scheduledruns", bytes.NewBufferString("{invalid")) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + handler.HandleCreateScheduledRun(w, req) + + assert.Equal(t, http.StatusBadRequest, w.Code) + }) + }) + + t.Run("HandleUpdateScheduledRun", func(t *testing.T) { + t.Run("success", func(t *testing.T) { + existing := newSR("default", "sr-1", "0 */2 * * *") + handler, _, w := setupHandler(existing) + + updated := v1alpha2.ScheduledRun{ + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */3 * * *", + AgentRef: v1alpha2.AgentReference{Name: "my-agent", Namespace: "default"}, + Prompt: "updated prompt", + }, + } + body, _ := json.Marshal(updated) + + req := httptest.NewRequest("PUT", "/api/scheduledruns/default/sr-1", bytes.NewBuffer(body)) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "sr-1"}) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + handler.HandleUpdateScheduledRun(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + }) + + t.Run("not found", func(t *testing.T) { + handler, _, w := setupHandler() + + updated := v1alpha2.ScheduledRun{ + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */3 * * *", + AgentRef: v1alpha2.AgentReference{Name: "agent", Namespace: 
"default"}, + Prompt: "updated prompt", + }, + } + body, _ := json.Marshal(updated) + + req := httptest.NewRequest("PUT", "/api/scheduledruns/default/nonexistent", bytes.NewBuffer(body)) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "nonexistent"}) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + handler.HandleUpdateScheduledRun(w, req) + + assert.Equal(t, http.StatusNotFound, w.Code) + }) + + t.Run("invalid schedule", func(t *testing.T) { + existing := newSR("default", "sr-1", "0 */2 * * *") + handler, _, w := setupHandler(existing) + + updated := v1alpha2.ScheduledRun{ + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "not-a-cron", + AgentRef: v1alpha2.AgentReference{Name: "agent", Namespace: "default"}, + Prompt: "updated prompt", + }, + } + body, _ := json.Marshal(updated) + + req := httptest.NewRequest("PUT", "/api/scheduledruns/default/sr-1", bytes.NewBuffer(body)) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "sr-1"}) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + handler.HandleUpdateScheduledRun(w, req) + + assert.Equal(t, http.StatusBadRequest, w.Code) + }) + }) + + t.Run("HandleDeleteScheduledRun", func(t *testing.T) { + t.Run("success", func(t *testing.T) { + existing := newSR("default", "sr-1", "0 */2 * * *") + handler, _, w := setupHandler(existing) + + req := httptest.NewRequest("DELETE", "/api/scheduledruns/default/sr-1", nil) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "sr-1"}) + req = setUser(req, "test-user") + handler.HandleDeleteScheduledRun(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + + // Verify it's actually deleted + getReq := httptest.NewRequest("GET", "/api/scheduledruns/default/sr-1", nil) + getReq = mux.SetURLVars(getReq, map[string]string{"namespace": "default", "name": "sr-1"}) + getReq = setUser(getReq, "test-user") + w2 := newMockErrorResponseWriter() 
+ handler.HandleGetScheduledRun(w2, getReq) + assert.Equal(t, http.StatusNotFound, w2.Code) + }) + + t.Run("not found", func(t *testing.T) { + handler, _, w := setupHandler() + + req := httptest.NewRequest("DELETE", "/api/scheduledruns/default/nonexistent", nil) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "nonexistent"}) + req = setUser(req, "test-user") + handler.HandleDeleteScheduledRun(w, req) + + assert.Equal(t, http.StatusNotFound, w.Code) + }) + }) + + t.Run("HandleTriggerScheduledRun", func(t *testing.T) { + t.Run("success", func(t *testing.T) { + existing := newSR("default", "sr-1", "0 */2 * * *") + handler, trigger, w := setupHandler(existing) + trigger.entry = &v1alpha2.RunHistoryEntry{DispatchStatus: v1alpha2.DispatchStatusDispatched} + + req := httptest.NewRequest("POST", "/api/scheduledruns/default/sr-1/trigger", nil) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "sr-1"}) + req = setUser(req, "test-user") + handler.HandleTriggerScheduledRun(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + require.Len(t, trigger.triggered, 1) + assert.Equal(t, types.NamespacedName{Namespace: "default", Name: "sr-1"}, trigger.triggered[0]) + }) + + t.Run("not found", func(t *testing.T) { + handler, _, w := setupHandler() + + req := httptest.NewRequest("POST", "/api/scheduledruns/default/nonexistent/trigger", nil) + req = mux.SetURLVars(req, map[string]string{"namespace": "default", "name": "nonexistent"}) + req = setUser(req, "test-user") + handler.HandleTriggerScheduledRun(w, req) + + assert.Equal(t, http.StatusNotFound, w.Code) + }) + }) +} + +func TestValidateSchedule(t *testing.T) { + tests := []struct { + name string + schedule string + timeZone string + wantErr bool + }{ + { + name: "valid - every 2 hours", + schedule: "0 */2 * * *", + wantErr: false, + }, + { + name: "valid - daily at midnight", + schedule: "0 0 * * *", + wantErr: false, + }, + { + name: "valid - exactly 1 hour", + schedule: "0 
* * * *", + wantErr: false, + }, + { + name: "valid - every 5 minutes (sub-hourly allowed)", + schedule: "*/5 * * * *", + wantErr: false, + }, + { + name: "valid - every 30 minutes (sub-hourly allowed)", + schedule: "*/30 * * * *", + wantErr: false, + }, + { + name: "invalid cron expression", + schedule: "not-a-cron", + wantErr: true, + }, + { + name: "valid - with time zone", + schedule: "0 9 * * *", + timeZone: "America/Los_Angeles", + wantErr: false, + }, + { + name: "invalid time zone", + schedule: "0 9 * * *", + timeZone: "Mars/Olympus_Mons", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := handlers.ValidateSchedule(tt.schedule, tt.timeZone) + if tt.wantErr { + assert.NotNil(t, err) + } else { + assert.Nil(t, err) + } + }) + } +} + +func TestScheduledRunsHandler_CreateDefaultsNamespace(t *testing.T) { + scheme := runtime.NewScheme() + require.NoError(t, v1alpha2.AddToScheme(scheme)) + + kubeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + trigger := &mockScheduledRunTrigger{} + base := &handlers.Base{ + KubeClient: kubeClient, + Authorizer: &auth.NoopAuthorizer{}, + } + handler := handlers.NewScheduledRunsHandler(base, trigger) + w := newMockErrorResponseWriter() + + sr := v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "no-namespace-sr", + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */2 * * *", + AgentRef: v1alpha2.AgentReference{Name: "agent", Namespace: "default"}, + Prompt: "test", + }, + } + body, _ := json.Marshal(sr) + + req := httptest.NewRequest("POST", "/api/scheduledruns", bytes.NewBuffer(body)) + req.Header.Set("Content-Type", "application/json") + req = setUser(req, "test-user") + handler.HandleCreateScheduledRun(w, req) + + // Should succeed — the handler defaults the namespace + assert.Equal(t, http.StatusCreated, w.Code) + + // Verify it was created with a namespace + var created v1alpha2.ScheduledRun + err := kubeClient.Get(context.Background(), 
types.NamespacedName{ + Name: "no-namespace-sr", + Namespace: "kagent", + }, &created) + // If namespace defaults to something else, just verify creation succeeded + if err != nil { + // Try empty namespace (depends on GetResourceNamespace()) + list := &v1alpha2.ScheduledRunList{} + err = kubeClient.List(context.Background(), list) + require.NoError(t, err) + require.Len(t, list.Items, 1) + assert.Equal(t, "no-namespace-sr", list.Items[0].Name) + } +} diff --git a/go/core/internal/httpserver/handlers/test_helpers_test.go b/go/core/internal/httpserver/handlers/test_helpers_test.go index d63aa5e95..da1565e40 100644 --- a/go/core/internal/httpserver/handlers/test_helpers_test.go +++ b/go/core/internal/httpserver/handlers/test_helpers_test.go @@ -27,6 +27,8 @@ func setupScheme() *runtime.Scheme { &v1alpha2.SandboxAgentList{}, &v1alpha2.AgentHarness{}, &v1alpha2.AgentHarnessList{}, + &v1alpha2.ScheduledRun{}, + &v1alpha2.ScheduledRunList{}, ) metav1.AddToGroupVersion(s, schema.GroupVersion{Group: "kagent.dev", Version: "v1alpha1"}) diff --git a/go/core/internal/httpserver/server.go b/go/core/internal/httpserver/server.go index aac7e831a..867bc1193 100644 --- a/go/core/internal/httpserver/server.go +++ b/go/core/internal/httpserver/server.go @@ -50,6 +50,7 @@ const ( APIPathLangGraph = "/api/langgraph" APIPathCrewAI = "/api/crewai" APIPathSandboxSSH = "/api/sandbox/ssh" + APIPathScheduledRuns = "/api/scheduledruns" ) var defaultModelConfig = types.NamespacedName{ @@ -59,18 +60,19 @@ var defaultModelConfig = types.NamespacedName{ // ServerConfig holds the configuration for the HTTP server type ServerConfig struct { - Router *mux.Router - BindAddr string - KubeClient ctrl_client.Client - A2AHandler a2a.A2AHandlerMux - MCPHandler *mcp.MCPHandler - WatchedNamespaces []string - DbClient dbpkg.Client - Authenticator auth.AuthProvider - Authorizer auth.Authorizer - ProxyURL string - Reconciler reconciler.KagentReconciler - SandboxBackend sandboxbackend.Backend + Router 
*mux.Router + BindAddr string + KubeClient ctrl_client.Client + A2AHandler a2a.A2AHandlerMux + MCPHandler *mcp.MCPHandler + WatchedNamespaces []string + DbClient dbpkg.Client + Authenticator auth.AuthProvider + Authorizer auth.Authorizer + ProxyURL string + Reconciler reconciler.KagentReconciler + SandboxBackend sandboxbackend.Backend + ScheduledRunTrigger handlers.ScheduledRunTrigger } // HTTPServer is the structure that manages the HTTP server @@ -89,7 +91,7 @@ func NewHTTPServer(config ServerConfig) (*HTTPServer, error) { return &HTTPServer{ config: config, router: config.Router, - handlers: handlers.NewHandlers(config.KubeClient, defaultModelConfig, config.DbClient, config.WatchedNamespaces, config.Authorizer, config.ProxyURL, config.Reconciler, config.SandboxBackend), + handlers: handlers.NewHandlers(config.KubeClient, defaultModelConfig, config.DbClient, config.WatchedNamespaces, config.Authorizer, config.ProxyURL, config.Reconciler, config.SandboxBackend, config.ScheduledRunTrigger), authenticator: config.Authenticator, }, nil } @@ -303,6 +305,16 @@ func (s *HTTPServer) setupRoutes() { // OpenShell sandbox PTY (browser WebSocket → gateway CONNECT → SSH). Authenticated like other /api routes. 
s.router.HandleFunc(APIPathSandboxSSH, adaptHandler(s.handlers.HandleSandboxSSHWebSocket)).Methods(http.MethodGet)
 
+	// ScheduledRuns
+	if s.handlers.ScheduledRuns != nil {
+		s.router.HandleFunc(APIPathScheduledRuns, adaptHandler(s.handlers.ScheduledRuns.HandleListScheduledRuns)).Methods(http.MethodGet)
+		s.router.HandleFunc(APIPathScheduledRuns, adaptHandler(s.handlers.ScheduledRuns.HandleCreateScheduledRun)).Methods(http.MethodPost)
+		s.router.HandleFunc(APIPathScheduledRuns+"/{namespace}/{name}", adaptHandler(s.handlers.ScheduledRuns.HandleGetScheduledRun)).Methods(http.MethodGet)
+		s.router.HandleFunc(APIPathScheduledRuns+"/{namespace}/{name}", adaptHandler(s.handlers.ScheduledRuns.HandleUpdateScheduledRun)).Methods(http.MethodPut)
+		s.router.HandleFunc(APIPathScheduledRuns+"/{namespace}/{name}", adaptHandler(s.handlers.ScheduledRuns.HandleDeleteScheduledRun)).Methods(http.MethodDelete)
+		s.router.HandleFunc(APIPathScheduledRuns+"/{namespace}/{name}/trigger", adaptHandler(s.handlers.ScheduledRuns.HandleTriggerScheduledRun)).Methods(http.MethodPost)
+	}
+
 	// A2A
 	s.router.PathPrefix(APIPathA2A + "/{namespace}/{name}").Handler(s.config.A2AHandler)
 	s.router.PathPrefix(APIPathA2ASandboxes + "/{namespace}/{name}").Handler(s.config.A2AHandler)
diff --git a/go/core/internal/metrics/scheduledrun.go b/go/core/internal/metrics/scheduledrun.go
new file mode 100644
index 000000000..5f5ca3c23
--- /dev/null
+++ b/go/core/internal/metrics/scheduledrun.go
@@ -0,0 +1,76 @@
+package metrics
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// Collectors for ScheduledRun dispatch and outcome tracking. Per-run series
+// carry namespace/name labels, so cardinality scales with the number of
+// ScheduledRun resources (operator-controlled and typically small).
+var (
+	// Dispatch attempts, partitioned by dispatch status.
+	scheduledRunDispatchTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "kagent_scheduledrun_dispatch_total",
+			Help: "Total number of ScheduledRun dispatch attempts, labelled by dispatch status.",
+		},
+		[]string{"namespace", "name", "status"},
+	)
+	// Resolved outcomes, partitioned by outcome value.
+	scheduledRunOutcomeTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "kagent_scheduledrun_outcome_total",
+			Help: "Total number of ScheduledRun resolved outcomes (post async session-state polling).",
+		},
+		[]string{"namespace", "name", "outcome"},
+	)
+	// Dispatch latency; buckets start at 0.1 and double across 10 steps.
+	scheduledRunDispatchDurationSeconds = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "kagent_scheduledrun_dispatch_duration_seconds",
+			Help:    "Duration of the synchronous A2A dispatch call.",
+			Buckets: prometheus.ExponentialBuckets(0.1, 2, 10),
+		},
+		[]string{"namespace", "name"},
+	)
+	// Number of cron entries currently scheduled.
+	scheduledRunActiveSchedules = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "kagent_scheduledrun_active_schedules",
+			Help: "Number of ScheduledRun cron entries currently scheduled (excludes suspended).",
+		},
+	)
+)
+
+// init registers all ScheduledRun collectors with the controller-runtime
+// metrics registry.
+func init() {
+	ctrlmetrics.Registry.MustRegister(
+		scheduledRunDispatchTotal,
+		scheduledRunOutcomeTotal,
+		scheduledRunDispatchDurationSeconds,
+		scheduledRunActiveSchedules,
+	)
+}
+
+// ObserveScheduledRunDispatch increments the dispatch counter for the given
+// status and records durationSeconds in the dispatch-duration histogram.
+func ObserveScheduledRunDispatch(namespace, name, status string, durationSeconds float64) {
+	dispatchCounter := scheduledRunDispatchTotal.WithLabelValues(namespace, name, status)
+	dispatchCounter.Inc()
+	dispatchLatency := scheduledRunDispatchDurationSeconds.WithLabelValues(namespace, name)
+	dispatchLatency.Observe(durationSeconds)
+}
+
+// ObserveScheduledRunOutcome increments the outcome counter for the given
+// resolved outcome.
+func ObserveScheduledRunOutcome(namespace, name, outcome string) {
+	outcomeCounter := scheduledRunOutcomeTotal.WithLabelValues(namespace, name, outcome)
+	outcomeCounter.Inc()
+}
+
+// SetActiveSchedules sets the active-schedules gauge to n.
+func SetActiveSchedules(n int) {
+	activeCount := float64(n)
+	scheduledRunActiveSchedules.Set(activeCount)
+}
diff --git a/go/core/pkg/app/app.go b/go/core/pkg/app/app.go
index d47ab55ad..490140d7e 100644
--- a/go/core/pkg/app/app.go
+++ b/go/core/pkg/app/app.go
@@ -606,6 +606,20 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne
 		os.Exit(1)
 	}
 
+	scheduledRunScheduler := controller.NewScheduledRunScheduler(mgr.GetClient(), dbClient)
+	if err := mgr.Add(scheduledRunScheduler); err != nil {
+		setupLog.Error(err, "unable to add scheduled run scheduler to manager")
+		os.Exit(1)
+	}
+	if err = (&controller.ScheduledRunController{
+		Scheme: mgr.GetScheme(),
+		Kube: mgr.GetClient(),
+		Scheduler: scheduledRunScheduler,
+	}).SetupWithManager(mgr); err != nil {
+		setupLog.Error(err, "unable to create controller", "controller", "ScheduledRun")
+		os.Exit(1)
+	}
+
 	if err := reconcilerutils.SetupOwnerIndexes(mgr, rcnclr.GetOwnedResourceTypes()); err != nil {
 		setupLog.Error(err, "failed to setup indexes for owned resources")
 		os.Exit(1)
@@ -684,6 +698,7 @@ func Start(getExtensionConfig GetExtensionConfig, migrationRunner MigrationRunne
 		ProxyURL: cfg.Proxy.URL,
 		Reconciler: rcnclr,
 		SandboxBackend: extensionCfg.SandboxBackend,
+		ScheduledRunTrigger: scheduledRunScheduler,
 	})
 	if err != nil {
 		setupLog.Error(err, "unable to create HTTP server")
diff --git a/go/core/test/e2e/scheduledrun_api_test.go b/go/core/test/e2e/scheduledrun_api_test.go
new file mode 100644
index 000000000..6f7bf46af
--- /dev/null
+++ b/go/core/test/e2e/scheduledrun_api_test.go
@@ -0,0 +1,175 @@
+package e2e_test
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	api "github.com/kagent-dev/kagent/go/api/httpapi"
+	"github.com/kagent-dev/kagent/go/api/v1alpha2"
+	"github.com/kagent-dev/kagent/go/core/internal/utils"
+)
+
+// 
TestScheduledRunAPI tests the ScheduledRun REST API lifecycle end-to-end. +// Requires a deployed kagent instance (set KAGENT_URL or default http://localhost:8083). +func TestScheduledRunAPI(t *testing.T) { + cli := setupK8sClient(t, false) + baseURL := kagentURL() + "/api/scheduledruns" + namespace := utils.GetResourceNamespace() + + // Create an Agent so the ScheduledRun controller can validate agentRef. + agent := &v1alpha2.Agent{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "e2e-sr-agent-", + Namespace: namespace, + }, + Spec: v1alpha2.AgentSpec{ + Type: v1alpha2.AgentType_Declarative, + Declarative: &v1alpha2.DeclarativeAgentSpec{ + ModelConfig: "default-model-config", + SystemMessage: "test agent for ScheduledRun E2E", + }, + Description: "agent for ScheduledRun E2E test", + }, + } + err := cli.Create(context.Background(), agent) + require.NoError(t, err) + cleanup(t, cli, agent) + + srName := "e2e-test-sr-" + time.Now().Format("150405") + + t.Run("create", func(t *testing.T) { + sr := v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: srName, + Namespace: namespace, + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */2 * * *", + AgentRef: v1alpha2.AgentReference{ + Name: agent.Name, + Namespace: namespace, + }, + Prompt: "run e2e test task", + MaxRunHistory: 5, + }, + } + body, _ := json.Marshal(sr) + + resp, respBody := doRequest(t, "POST", baseURL, body) + assert.Equal(t, http.StatusCreated, resp.StatusCode, "body: %s", respBody) + }) + + t.Run("list", func(t *testing.T) { + resp, body := doRequest(t, "GET", baseURL, nil) + require.Equal(t, http.StatusOK, resp.StatusCode) + assert.Contains(t, string(body), srName) + }) + + t.Run("get", func(t *testing.T) { + resp, body := doRequest(t, "GET", baseURL+"/"+namespace+"/"+srName, nil) + require.Equal(t, http.StatusOK, resp.StatusCode) + + var result api.StandardResponse[v1alpha2.ScheduledRun] + require.NoError(t, json.Unmarshal(body, &result)) + assert.Equal(t, "0 */2 * * *", 
result.Data.Spec.Schedule) + assert.Equal(t, agent.Name, result.Data.Spec.AgentRef.Name) + assert.Equal(t, "run e2e test task", result.Data.Spec.Prompt) + }) + + t.Run("update", func(t *testing.T) { + sr := v1alpha2.ScheduledRun{ + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "0 */3 * * *", + AgentRef: v1alpha2.AgentReference{ + Name: agent.Name, + Namespace: namespace, + }, + Prompt: "updated prompt", + MaxRunHistory: 10, + }, + } + body, _ := json.Marshal(sr) + + resp, respBody := doRequest(t, "PUT", baseURL+"/"+namespace+"/"+srName, body) + assert.Equal(t, http.StatusOK, resp.StatusCode, "body: %s", respBody) + + // Verify the update took effect + var result api.StandardResponse[v1alpha2.ScheduledRun] + _, getBody := doRequest(t, "GET", baseURL+"/"+namespace+"/"+srName, nil) + require.NoError(t, json.Unmarshal(getBody, &result)) + assert.Equal(t, "0 */3 * * *", result.Data.Spec.Schedule) + assert.Equal(t, "updated prompt", result.Data.Spec.Prompt) + }) + + t.Run("create_invalid_schedule", func(t *testing.T) { + sr := v1alpha2.ScheduledRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-schedule-sr", + Namespace: namespace, + }, + Spec: v1alpha2.ScheduledRunSpec{ + Schedule: "not-a-valid-cron", // syntactically invalid + AgentRef: v1alpha2.AgentReference{ + Name: agent.Name, + Namespace: namespace, + }, + Prompt: "should fail", + }, + } + body, _ := json.Marshal(sr) + + resp, _ := doRequest(t, "POST", baseURL, body) + assert.Equal(t, http.StatusBadRequest, resp.StatusCode) + }) + + t.Run("trigger", func(t *testing.T) { + resp, _ := doRequest(t, "POST", baseURL+"/"+namespace+"/"+srName+"/trigger", nil) + assert.Equal(t, http.StatusOK, resp.StatusCode) + }) + + t.Run("delete", func(t *testing.T) { + resp, _ := doRequest(t, "DELETE", baseURL+"/"+namespace+"/"+srName, nil) + assert.Equal(t, http.StatusOK, resp.StatusCode) + }) + + t.Run("get_after_delete", func(t *testing.T) { + resp, _ := doRequest(t, "GET", baseURL+"/"+namespace+"/"+srName, nil) + 
assert.Equal(t, http.StatusNotFound, resp.StatusCode) + }) +} + +// doRequest sends an HTTP request (with a JSON body when body is non-nil), fails the test on any error, and returns the response together with its fully read body. +func doRequest(t *testing.T, method, url string, body []byte) (*http.Response, []byte) { + t.Helper() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + var reqBody io.Reader // stays nil for body-less requests + if body != nil { + reqBody = bytes.NewReader(body) + } + req, err := http.NewRequestWithContext(ctx, method, url, reqBody) + require.NoError(t, err) // checked before req is touched: req is nil on error + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + + resp, err := http.DefaultClient.Do(req) + require.NoError(t, err) + defer resp.Body.Close() // runs even when a failed read aborts the test + + respBody, err := io.ReadAll(resp.Body) + require.NoError(t, err) + + return resp, respBody +} diff --git a/go/go.mod b/go/go.mod index e14ab9636..9ba3bb083 100644 --- a/go/go.mod +++ b/go/go.mod @@ -334,6 +334,7 @@ require ( github.com/quasilyte/stdinfo v0.0.0-20220114132959-f7386bf02567 // indirect github.com/raeperd/recvcheck v0.2.0 // indirect github.com/rivo/uniseg v0.4.7 // indirect + github.com/robfig/cron/v3 v3.0.1 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/ryancurrah/gomodguard v1.4.1 // indirect github.com/ryancurrah/gomodguard/v2 v2.1.3 // indirect diff --git a/go/go.sum b/go/go.sum index 739218517..db745239c 100644 --- a/go/go.sum +++ b/go/go.sum @@ -692,6 +692,8 @@ github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.14.1 
h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/helm/kagent-crds/templates/kagent.dev_scheduledruns.yaml b/helm/kagent-crds/templates/kagent.dev_scheduledruns.yaml new file mode 100644 index 000000000..7b6ff60c9 --- /dev/null +++ b/helm/kagent-crds/templates/kagent.dev_scheduledruns.yaml @@ -0,0 +1,222 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: scheduledruns.kagent.dev +spec: + group: kagent.dev + names: + kind: ScheduledRun + listKind: ScheduledRunList + plural: scheduledruns + singular: scheduledrun + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.schedule + name: Schedule + type: string + - jsonPath: .spec.suspend + name: Suspend + type: boolean + - jsonPath: .status.lastRunTime + name: Last Run + type: date + - jsonPath: .status.nextRunTime + name: Next Run + type: string + name: v1alpha2 + schema: + openAPIV3Schema: + description: ScheduledRun is the Schema for the scheduledruns API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ScheduledRunSpec defines the desired state of ScheduledRun. + properties: + agentRef: + description: |- + AgentRef is a reference to the Agent to execute. If Namespace is empty + it defaults to the ScheduledRun's namespace. + properties: + name: + type: string + namespace: + type: string + required: + - name + type: object + maxRunHistory: + default: 10 + description: MaxRunHistory is the maximum number of run history entries + to retain. + maximum: 100 + minimum: 1 + type: integer + prompt: + description: Prompt is the text prompt to send to the agent on each + run. + minLength: 1 + type: string + schedule: + description: |- + Schedule is a cron expression defining when to run the agent. Standard + 5-field cron syntax (minute hour day-of-month month day-of-week). + minLength: 1 + type: string + suspend: + default: false + description: |- + Suspend pauses cron-driven scheduling when set to true. Manual triggers + via the API still execute; Suspend only gates the cron tick path. + type: boolean + timeZone: + description: |- + TimeZone is an IANA time zone name (e.g. "America/Los_Angeles") used + to interpret Schedule. If empty, the controller process's local time + zone (typically UTC in-cluster) is used. + type: string + required: + - agentRef + - prompt + - schedule + type: object + status: + description: ScheduledRunStatus defines the observed state of ScheduledRun. + properties: + conditions: + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. 
+ format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + lastRunTime: + format: date-time + type: string + nextRunTime: + format: date-time + type: string + observedGeneration: + format: int64 + type: integer + runHistory: + items: + description: |- + RunHistoryEntry records one execution of a scheduled run. DispatchStatus + is set synchronously when the A2A call returns; Outcome is set + asynchronously by polling the session. 
+ properties: + completionTime: + format: date-time + type: string + dispatchMessage: + type: string + dispatchStatus: + description: |- + DispatchStatus reflects whether the A2A SendMessage call to the agent pod + succeeded. It says nothing about the LLM result — that lives in [RunOutcome]. + enum: + - Dispatched + - DispatchFailed + type: string + outcome: + description: |- + RunOutcome reflects the terminal state of the agent run, resolved + asynchronously by polling the session's task state after dispatch returns. + "Pending" means polling is still in progress (or was abandoned because the + controller restarted before the session terminated). + enum: + - Pending + - Succeeded + - Failed + - Timeout + type: string + outcomeMessage: + type: string + outcomeTime: + format: date-time + type: string + sessionId: + type: string + startTime: + format: date-time + type: string + required: + - dispatchStatus + - startTime + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/helm/kagent/templates/rbac/getter-role.yaml b/helm/kagent/templates/rbac/getter-role.yaml index f0ed9614f..1f63db655 100644 --- a/helm/kagent/templates/rbac/getter-role.yaml +++ b/helm/kagent/templates/rbac/getter-role.yaml @@ -11,6 +11,7 @@ - memories - remotemcpservers - mcpservers + - scheduledruns verbs: - get - list @@ -27,6 +28,7 @@ - memories/finalizers - remotemcpservers/finalizers - mcpservers/finalizers + - scheduledruns/finalizers verbs: - update - apiGroups: @@ -41,6 +43,7 @@ - memories/status - remotemcpservers/status - mcpservers/status + - scheduledruns/status verbs: - get - patch diff --git a/helm/kagent/templates/rbac/writer-role.yaml b/helm/kagent/templates/rbac/writer-role.yaml index b735e159b..b23848026 100644 --- a/helm/kagent/templates/rbac/writer-role.yaml +++ b/helm/kagent/templates/rbac/writer-role.yaml @@ -11,6 +11,7 @@ - memories - remotemcpservers - mcpservers + - scheduledruns verbs: - create 
- update @@ -28,6 +29,7 @@ - memories/finalizers - remotemcpservers/finalizers - mcpservers/finalizers + - scheduledruns/finalizers verbs: - update - apiGroups: diff --git a/ui/src/app/actions/scheduledRuns.ts b/ui/src/app/actions/scheduledRuns.ts new file mode 100644 index 000000000..7a60d25bd --- /dev/null +++ b/ui/src/app/actions/scheduledRuns.ts @@ -0,0 +1,122 @@ +"use server"; + +import { ScheduledRun, RunHistoryEntry, BaseResponse } from "@/types"; +import { revalidatePath } from "next/cache"; +import { fetchApi, createErrorResponse } from "./utils"; + +/** + * Gets all scheduled runs + * @returns A promise with all scheduled runs + */ +export async function getScheduledRuns(): Promise> { + try { + const response = await fetchApi>("/scheduledruns"); + return { message: "Successfully fetched scheduled runs", data: response.data }; + } catch (error) { + return createErrorResponse(error, "Error getting scheduled runs"); + } +} + +/** + * Gets a specific scheduled run + * @param name The scheduled run name + * @param namespace The scheduled run namespace + * @returns A promise with the scheduled run + */ +export async function getScheduledRun(name: string, namespace: string): Promise> { + try { + const response = await fetchApi>(`/scheduledruns/${namespace}/${name}`); + return { message: "Successfully fetched scheduled run", data: response.data }; + } catch (error) { + return createErrorResponse(error, "Error getting scheduled run"); + } +} + +/** + * Creates a new scheduled run + * @param sr The scheduled run to create + * @returns A promise with the created scheduled run + */ +export async function createScheduledRun(sr: ScheduledRun): Promise> { + try { + const response = await fetchApi>("/scheduledruns", { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify(sr), + }); + + revalidatePath("/schedules"); + return { message: "Successfully created scheduled run", data: response.data }; + } catch (error) { + return 
createErrorResponse(error, "Error creating scheduled run"); + } +} + +/** + * Updates an existing scheduled run + * @param sr The scheduled run to update + * @returns A promise with the updated scheduled run + */ +export async function updateScheduledRun(sr: ScheduledRun): Promise> { + try { + const namespace = sr.metadata.namespace || ""; + const name = sr.metadata.name; + const response = await fetchApi>(`/scheduledruns/${namespace}/${name}`, { + method: "PUT", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify(sr), + }); + + revalidatePath("/schedules"); + return { message: "Successfully updated scheduled run", data: response.data }; + } catch (error) { + return createErrorResponse(error, "Error updating scheduled run"); + } +} + +/** + * Deletes a scheduled run + * @param name The scheduled run name + * @param namespace The scheduled run namespace + * @returns A promise with the delete result + */ +export async function deleteScheduledRun(name: string, namespace: string): Promise> { + try { + await fetchApi(`/scheduledruns/${namespace}/${name}`, { + method: "DELETE", + headers: { + "Content-Type": "application/json", + }, + }); + + revalidatePath("/schedules"); + return { message: "Successfully deleted scheduled run" }; + } catch (error) { + return createErrorResponse(error, "Error deleting scheduled run"); + } +} + +/** + * Triggers a manual run of a scheduled run. The backend dispatches synchronously + * and returns the resulting RunHistoryEntry; callers should inspect + * `data.dispatchStatus` to know whether the dispatch actually succeeded. 
+ */ +export async function triggerScheduledRun(name: string, namespace: string): Promise> { + try { + const response = await fetchApi>(`/scheduledruns/${namespace}/${name}/trigger`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + }); + + revalidatePath("/schedules"); + return { message: "Successfully triggered scheduled run", data: response.data }; + } catch (error) { + return createErrorResponse(error, "Error triggering scheduled run"); + } +} diff --git a/ui/src/app/schedules/[namespace]/[name]/page.tsx b/ui/src/app/schedules/[namespace]/[name]/page.tsx new file mode 100644 index 000000000..e53a1a168 --- /dev/null +++ b/ui/src/app/schedules/[namespace]/[name]/page.tsx @@ -0,0 +1,305 @@ +"use client"; + +import React, { useState, useEffect, useCallback } from "react"; +import { useRouter, useParams } from "next/navigation"; +import { Button } from "@/components/ui/button"; +import { Badge } from "@/components/ui/badge"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { Pencil, Trash2, Play, Pause, PlayCircle, Loader2, Clock } from "lucide-react"; +import { ScheduledRun } from "@/types"; +import { + getScheduledRun, + deleteScheduledRun, + triggerScheduledRun, + updateScheduledRun, +} from "@/app/actions/scheduledRuns"; +import { RunHistoryTable } from "@/components/schedules/RunHistoryTable"; +import { LoadingState } from "@/components/LoadingState"; +import { ErrorState } from "@/components/ErrorState"; +import { formatDateTime } from "@/lib/formatDateTime"; +import { toast } from "sonner"; + +export default function ScheduledRunDetailPage() { + const router = useRouter(); + const params = useParams(); + const namespace = params.namespace as string; + const name = params.name as string; + + const [sr, setSr] = useState(null); + const [loading, setLoading] 
= useState(true); + const [error, setError] = useState(null); + const [showDeleteDialog, setShowDeleteDialog] = useState(false); + const [isTriggering, setIsTriggering] = useState(false); + const [isTogglingPause, setIsTogglingPause] = useState(false); + + const fetchData = useCallback(async () => { + try { + setLoading(true); + const response = await getScheduledRun(name, namespace); + if (response.error || !response.data) { + throw new Error(response.error || "Scheduled run not found"); + } + setSr(response.data); + } catch (err) { + const errorMessage = err instanceof Error ? err.message : "Failed to fetch scheduled run"; + setError(errorMessage); + } finally { + setLoading(false); + } + }, [name, namespace]); + + useEffect(() => { + // eslint-disable-next-line react-hooks/set-state-in-effect -- legitimate data fetch on mount/dependency change + fetchData(); + }, [fetchData]); + + const handleEdit = () => { + router.push(`/schedules/new?edit=true&name=${name}&namespace=${namespace}`); + }; + + const handleDelete = async () => { + try { + const response = await deleteScheduledRun(name, namespace); + if (response.error) { + throw new Error(response.error); + } + toast.success(`Scheduled run "${name}" deleted successfully`); + router.push("/schedules"); + } catch (err) { + const errorMessage = err instanceof Error ? err.message : "Failed to delete scheduled run"; + toast.error(errorMessage); + setShowDeleteDialog(false); + } + }; + + const handleTrigger = async () => { + setIsTriggering(true); + try { + const response = await triggerScheduledRun(name, namespace); + if (response.error) { + throw new Error(response.error); + } + // Manual trigger returns the dispatch result only — outcome resolves + // asynchronously via the background poller, so we surface dispatch + // status here and let the run-history table reflect the eventual + // outcome. 
+ if (response.data?.dispatchStatus === "DispatchFailed") { + toast.error(`Dispatch failed: ${response.data.dispatchMessage ?? "agent dispatch error"}`); + } else { + toast.success(`Run for "${name}" dispatched`); + } + await fetchData(); + } catch (err) { + const errorMessage = err instanceof Error ? err.message : "Failed to trigger scheduled run"; + toast.error(errorMessage); + } finally { + setIsTriggering(false); + } + }; + + const handleToggleSuspend = async () => { + if (!sr) return; + setIsTogglingPause(true); + try { + const updated: ScheduledRun = { + ...sr, + spec: { + ...sr.spec, + suspend: !sr.spec.suspend, + }, + }; + const response = await updateScheduledRun(updated); + if (response.error) { + throw new Error(response.error); + } + toast.success(sr.spec.suspend ? "Schedule resumed" : "Schedule suspended"); + await fetchData(); + } catch (err) { + const errorMessage = err instanceof Error ? err.message : "Failed to update scheduled run"; + toast.error(errorMessage); + } finally { + setIsTogglingPause(false); + } + }; + + if (loading) return ; + if (error) return ; + if (!sr) return ; + + const agentRef = sr.spec.agentRef; + const agentDisplay = agentRef.namespace + ? `${agentRef.namespace}/${agentRef.name}` + : agentRef.name; + + return ( +
+
+ {/* Header */} +
+
+

{sr.metadata.name}

+

+ {sr.metadata.namespace} +

+
+
+ + + + +
+
+ + {/* Details Card */} + + + + + Schedule Details + + + +
+
+

+ Schedule +

+

{sr.spec.schedule}

+
+
+

+ Time Zone +

+

{sr.spec.timeZone || "UTC"}

+
+
+

+ Agent +

+

{agentDisplay}

+
+
+

+ Status +

+ {sr.spec.suspend ? ( + Suspended + ) : ( + + Active + + )} +
+
+

+ Max Run History +

+

{sr.spec.maxRunHistory ?? 10}

+
+
+

+ Last Run +

+

{formatDateTime(sr.status?.lastRunTime)}

+
+
+

+ Next Run +

+

{formatDateTime(sr.status?.nextRunTime)}

+
+
+
+

+ Prompt +

+
+ {sr.spec.prompt} +
+
+
+
+ + {/* Run History */} + + + Run History + + + + + + + {/* Delete Dialog */} + + + + Delete Scheduled Run + + Are you sure you want to delete the scheduled run '{sr.metadata.name}'? + This action cannot be undone. + + + + + + + + +
+
+ ); +} diff --git a/ui/src/app/schedules/new/page.tsx b/ui/src/app/schedules/new/page.tsx new file mode 100644 index 000000000..fdce30b74 --- /dev/null +++ b/ui/src/app/schedules/new/page.tsx @@ -0,0 +1,525 @@ +"use client"; + +import React, { useState, useEffect } from "react"; +import { useRouter, useSearchParams } from "next/navigation"; +import { Input } from "@/components/ui/input"; +import { Button } from "@/components/ui/button"; +import { Textarea } from "@/components/ui/textarea"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Label } from "@/components/ui/label"; +import { Switch } from "@/components/ui/switch"; +import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select"; +import { Clock, Loader2 } from "lucide-react"; +import { NamespaceCombobox } from "@/components/NamespaceCombobox"; +import { ScheduledRun, AgentResponse } from "@/types"; +import { + createScheduledRun, + updateScheduledRun, + getScheduledRun, +} from "@/app/actions/scheduledRuns"; +import { getAgents } from "@/app/actions/agents"; +import { LoadingState } from "@/components/LoadingState"; +import { toast } from "sonner"; + +interface FormState { + name: string; + namespace: string; + schedule: string; + timeZone: string; + agentName: string; + agentNamespace: string; + prompt: string; + suspend: boolean; + maxRunHistory: number; + isSubmitting: boolean; + isLoading: boolean; +} + +interface ValidationErrors { + name?: string; + namespace?: string; + schedule?: string; + agent?: string; + prompt?: string; + maxRunHistory?: string; +} + +const RFC1123_REGEX = /^[a-z0-9]([a-z0-9-]{0,61}[a-z0-9])?$/; +const CRON_FIELD_COUNT = 5; + +function validateCronExpression(expr: string): string | undefined { + const trimmed = expr.trim(); + if (!trimmed) return "Schedule is required"; + const fields = trimmed.split(/\s+/); + if (fields.length !== CRON_FIELD_COUNT) { + return `Cron expression must have exactly 
${CRON_FIELD_COUNT} fields (minute hour day month weekday)`; + } + return undefined; +} + +function describeNextRuns(expr: string, count: number): string[] { + // Heuristic preview of a 5-field cron expression: returns at most `count` hint lines, or [] when the field count is wrong. + // No cron parser is used, so only plain wildcard/numeric shapes get a friendly label; everything else shows the raw expression. + const trimmed = expr.trim(); + const fields = trimmed.split(/\s+/); + if (fields.length !== CRON_FIELD_COUNT) return []; + + const descriptions: string[] = []; + const [minute, hour, dom, month, dow] = fields; + + // Build a human-readable hint, defaulting to the raw expression + let desc = `Cron: ${trimmed}`; + const isNum = (f: string) => /^\d+$/.test(f); + const everyDay = dom === "*" && month === "*" && dow === "*"; + if (everyDay && minute === "*" && hour === "*") { + desc = "Every minute"; + } else if (everyDay && hour === "*" && /^\*\/\d+$/.test(minute)) { + desc = `Every ${minute.slice(2)} minutes`; + } else if (everyDay && hour === "*" && isNum(minute)) { + desc = `At minute ${minute} of every hour`; + } else if (everyDay && isNum(hour) && isNum(minute)) { + desc = `Daily at ${hour.padStart(2, "0")}:${minute.padStart(2, "0")}`; + } else if (dow !== "*" && dom === "*" && month === "*" && isNum(hour) && isNum(minute)) { + const dayNames: Record = { "0": "Sun", "1": "Mon", "2": "Tue", "3": "Wed", "4": "Thu", "5": "Fri", "6": "Sat", "7": "Sun" }; + const days = dow.split(",").map((d) => dayNames[d] || d).join(", "); + desc = `At ${hour.padStart(2, "0")}:${minute.padStart(2, "0")} on ${days}`; + } + + descriptions.push(desc); + + // Add note about count + if (descriptions.length > 0 && count > 1) { + descriptions.push(`(${CRON_FIELD_COUNT}-field cron: min hour dom month dow)`); + } + + return descriptions.slice(0, count); +} + +function ScheduledRunFormContent() { + const router = useRouter(); + const searchParams = useSearchParams(); + const isEditMode = searchParams.get("edit") === "true"; + const editName = searchParams.get("name"); + const editNamespace = searchParams.get("namespace"); + + const [agents, setAgents] = useState([]); + const [state, setState] = useState({ + name: "", + namespace: "default", + schedule: "", + timeZone: "", + agentName: "", + agentNamespace: "", + 
prompt: "", + suspend: false, + maxRunHistory: 10, + isSubmitting: false, + isLoading: isEditMode, + }); + const [errors, setErrors] = useState({}); + + // Fetch agents list + useEffect(() => { + const loadAgents = async () => { + try { + const response = await getAgents(); + if (response.error) { + toast.error(`Failed to load agents: ${response.error}`); + return; + } + if (response.data) { + setAgents(response.data); + } + } catch (err) { + const msg = err instanceof Error ? err.message : "Failed to load agents"; + toast.error(msg); + } + }; + loadAgents(); + }, []); + + // Fetch existing data in edit mode + useEffect(() => { + const fetchExisting = async () => { + if (isEditMode && editName && editNamespace) { + try { + setState((prev) => ({ ...prev, isLoading: true })); + const response = await getScheduledRun(editName, editNamespace); + if (!response.data) { + toast.error("Scheduled run not found"); + setState((prev) => ({ ...prev, isLoading: false })); + return; + } + const sr = response.data; + setState((prev) => ({ + ...prev, + name: sr.metadata.name, + namespace: sr.metadata.namespace || "", + schedule: sr.spec.schedule, + timeZone: sr.spec.timeZone || "", + agentName: sr.spec.agentRef.name, + agentNamespace: sr.spec.agentRef.namespace || "", + prompt: sr.spec.prompt, + suspend: sr.spec.suspend ?? false, + maxRunHistory: sr.spec.maxRunHistory ?? 
10, + isLoading: false, + })); + } catch (err) { + console.error("Error fetching scheduled run:", err); + toast.error("Failed to load scheduled run data"); + setState((prev) => ({ ...prev, isLoading: false })); + } + } + }; + fetchExisting(); + }, [isEditMode, editName, editNamespace]); + + const validateForm = (): boolean => { + const newErrors: ValidationErrors = {}; + + if (!state.name.trim()) { + newErrors.name = "Name is required"; + } else if (!RFC1123_REGEX.test(state.name)) { + newErrors.name = "Name must be a valid RFC 1123 label (lowercase alphanumeric and hyphens, max 63 chars)"; + } + + if (!state.namespace.trim()) { + newErrors.namespace = "Namespace is required"; + } + + const cronError = validateCronExpression(state.schedule); + if (cronError) { + newErrors.schedule = cronError; + } + + if (!state.agentName) { + newErrors.agent = "Agent is required"; + } + + if (!state.prompt.trim()) { + newErrors.prompt = "Prompt is required"; + } + + if (state.maxRunHistory < 1 || state.maxRunHistory > 100) { + newErrors.maxRunHistory = "Must be between 1 and 100"; + } + + setErrors(newErrors); + return Object.keys(newErrors).length === 0; + }; + + const handleSubmit = async () => { + if (!validateForm()) return; + + setState((prev) => ({ ...prev, isSubmitting: true })); + + try { + const sr: ScheduledRun = { + apiVersion: "kagent.dev/v1alpha2", + kind: "ScheduledRun", + metadata: { + name: state.name, + namespace: state.namespace, + }, + spec: { + schedule: state.schedule.trim(), + timeZone: state.timeZone.trim() || undefined, + agentRef: { + name: state.agentName, + namespace: state.agentNamespace || undefined, + }, + prompt: state.prompt, + suspend: state.suspend, + maxRunHistory: state.maxRunHistory, + }, + }; + + const response = isEditMode + ? await updateScheduledRun(sr) + : await createScheduledRun(sr); + + if (response.error) { + throw new Error(response.error); + } + + toast.success( + isEditMode + ? 
"Scheduled run updated successfully" + : "Scheduled run created successfully" + ); + router.push("/schedules"); + } catch (err) { + const errorMessage = + err instanceof Error + ? err.message + : `Failed to ${isEditMode ? "update" : "create"} scheduled run`; + toast.error(errorMessage); + setState((prev) => ({ ...prev, isSubmitting: false })); + } + }; + + const isFormDisabled = state.isSubmitting || state.isLoading; + const cronPreview = state.schedule.trim() ? describeNextRuns(state.schedule, 3) : []; + + if (state.isSubmitting) { + return ; + } + + return ( +
+
+

+ {isEditMode ? "Edit Scheduled Run" : "Create Scheduled Run"} +

+ +
+ {/* Basic Information */} + + + + + Basic Information + + + +
+ +

+ Unique identifier for this scheduled run (RFC 1123 compliant). +

+ + setState((prev) => ({ ...prev, name: e.target.value })) + } + placeholder="e.g. daily-report" + disabled={isFormDisabled || isEditMode} + className={errors.name ? "border-red-500" : ""} + /> + {errors.name && ( +

{errors.name}

+ )} +
+ +
+ +

+ Kubernetes namespace for this scheduled run. +

+ + setState((prev) => ({ ...prev, namespace: value })) + } + disabled={isFormDisabled || isEditMode} + /> + {errors.namespace && ( +

{errors.namespace}

+ )} +
+
+
+ + {/* Schedule Configuration */} + + + + + Schedule Configuration + + + +
+ + + setState((prev) => ({ ...prev, schedule: e.target.value })) + } + placeholder="e.g. 0 9 * * 1-5" + className={`font-mono ${errors.schedule ? "border-red-500" : ""}`} + disabled={isFormDisabled} + /> + {errors.schedule && ( +

{errors.schedule}

+ )} + {cronPreview.length > 0 && !errors.schedule && ( +
+ {cronPreview.map((line, i) => ( +

{line}

+ ))} +
+ )} +
+ +
+ +

+ Optional IANA time-zone name (e.g. America/Los_Angeles, Asia/Shanghai). Leave blank to interpret the schedule in UTC. +

+ + setState((prev) => ({ ...prev, timeZone: e.target.value })) + } + placeholder="UTC" + className="font-mono" + disabled={isFormDisabled} + /> +
+ +
+ +

+ Select the agent to run on this schedule. +

+ + {errors.agent && ( +

{errors.agent}

+ )} +
+ +
+ +

+ The prompt message sent to the agent on each scheduled run. +

+