From e99bee5fe11ee8cdbe16ddd06a308c89d8f36d18 Mon Sep 17 00:00:00 2001
From: Mahmoud Mabrouk <mahmoud@agenta.ai>
Date: Thu, 11 Jun 2026 11:32:01 +0200
Subject: [PATCH] fix(sdk): sandbox custom-code evaluators by default
 (RestrictedPython runner)

Custom-code evaluators ran user Python with raw exec() in the services
process, sandboxed only if an operator opted into Daytona. Add a
RestrictedPython runner and make it the default; keep raw exec() as an
explicit 'local' opt-in. The new sandbox uses a guarded __import__ limited
to a pure-stdlib allowlist and safer_getattr to block the class-gadget
escape, closing the two holes the previous RestrictedPython sandbox had.
---
 api/oss/src/utils/env.py                      |   2 +-
 .../07-custom-evaluator.mdx                   |   4 +
 docs/docs/self-host/02-configuration.mdx      |  10 +
 hosting/docker-compose/ee/env.ee.dev.example  |   4 +-
 hosting/docker-compose/ee/env.ee.gh.example   |   4 +-
 .../docker-compose/oss/env.oss.dev.example    |   4 +-
 hosting/docker-compose/oss/env.oss.gh.example |   4 +-
 hosting/kubernetes/ee/values.ee.example.yaml  |   4 +-
 .../kubernetes/oss/values.oss.example.yaml    |   4 +-
 .../sdk/engines/running/runners/registry.py   |  34 ++-
 .../sdk/engines/running/runners/restricted.py | 182 ++++++++++++++
 .../agenta/sdk/engines/running/sandbox.py     |  17 --
 .../oss/tests/pytest/utils/test_code_v0.py    |  18 +-
 .../pytest/utils/test_restricted_runner.py    | 223 ++++++++++++++++++
 sdks/python/pyproject.toml                    |   1 +
 sdks/python/uv.lock                           |  11 +
 16 files changed, 481 insertions(+), 45 deletions(-)
 create mode 100644 sdks/python/agenta/sdk/engines/running/runners/restricted.py
 create mode 100644 sdks/python/oss/tests/pytest/utils/test_restricted_runner.py

diff --git a/api/oss/src/utils/env.py b/api/oss/src/utils/env.py
index 88f6cb55b3..a91357a995 100644
--- a/api/oss/src/utils/env.py
+++ b/api/oss/src/utils/env.py
@@ -300,7 +300,7 @@ class ServicesCodeConfig(BaseModel):
     sandbox_runner: str = (
         os.getenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER")
         or os.getenv("AGENTA_SERVICES_SANDBOX_RUNNER")
-        or "local"
+        or "restricted"
     )
 
     model_config = ConfigDict(extra="ignore")
diff --git a/docs/docs/evaluation/configure-evaluators/07-custom-evaluator.mdx b/docs/docs/evaluation/configure-evaluators/07-custom-evaluator.mdx
index e3b4206304..d17217e9f4 100644
--- a/docs/docs/evaluation/configure-evaluators/07-custom-evaluator.mdx
+++ b/docs/docs/evaluation/configure-evaluators/07-custom-evaluator.mdx
@@ -5,6 +5,10 @@ description: "Write custom evaluators in Python, JavaScript, or TypeScript with
 
 Custom code evaluators let you write your own evaluation logic in Python, JavaScript, or TypeScript. Your code has access to the application inputs, outputs, and the full execution trace (spans, latency, token usage, costs).
 
+:::warning Self-hosted deployments only
+On self-hosted Agenta, custom evaluator code runs server-side. By default it runs in a restricted, in-process Python sandbox that blocks filesystem, network, and host-reaching imports; the sandbox itself applies no CPU, memory, or time limits. Operators can change the runner with the `AGENTA_SERVICES_CODE_SANDBOX_RUNNER` environment variable: `local` runs code with no sandbox (trusted authors only), `daytona` runs it in an isolated remote sandbox. The `restricted` and `local` runners execute Python only; JavaScript and TypeScript evaluators require `daytona`. See [environment configuration](/self-host/configuration). Agenta Cloud is unaffected — it isolates evaluator execution.
+:::
+
 ## Function signature
 
 Your code must define an `evaluate` function with the following signature:
diff --git a/docs/docs/self-host/02-configuration.mdx b/docs/docs/self-host/02-configuration.mdx
index 4525c66f18..9a87d78f6e 100644
--- a/docs/docs/self-host/02-configuration.mdx
+++ b/docs/docs/self-host/02-configuration.mdx
@@ -109,6 +109,16 @@ This key has no env-var or `env.py` equivalent.
 | `AGENTA_SERVICES_HOOK_ALLOW_INSECURE` | `agenta.services.hook.allow_insecure` | `agenta.services.hook.allowInsecure` |
 | `AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED` | `agenta.services.middleware.caching_enabled` | `agenta.services.middleware.cachingEnabled` |
 
+:::warning Custom-code evaluator runner
+`AGENTA_SERVICES_CODE_SANDBOX_RUNNER` selects how [custom-code evaluators](/evaluation/configure-evaluators/custom-evaluator) execute:
+
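+- `restricted` (default) — in-process Python sandbox with limited builtins and an allowlist of pure-standard-library imports (`math`, `statistics`, `datetime`, `json`, `re`, `random`, `string`, `typing`, `collections`, `itertools`, `functools`). Blocks filesystem, network, and host-reaching imports, but shares the services process and sets no CPU, memory, or time limits. Python only.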
+- `local` — raw execution in the services process with **no sandbox**. Any author who can create a custom-code evaluator can run arbitrary code on the host. Use only for trusted, single-tenant deployments.
+- `daytona` — isolated remote sandbox (strongest). Recommended when evaluator authors are not fully trusted. Requires the [daytona](#daytona) credentials below.
+
+The legacy `AGENTA_SERVICES_SANDBOX_RUNNER` is still accepted as a fallback.
+:::
+
 ## Agenta — webhooks
 
 | Env var | env.py path | values.yaml path |
diff --git a/hosting/docker-compose/ee/env.ee.dev.example b/hosting/docker-compose/ee/env.ee.dev.example
index d4aa299188..cd11534e8c 100644
--- a/hosting/docker-compose/ee/env.ee.dev.example
+++ b/hosting/docker-compose/ee/env.ee.dev.example
@@ -78,7 +78,9 @@ AGENTA_CRYPT_KEY=replace-me
 # ================================================================== #
 # Agenta - Services (code/hook/middleware)
 # ================================================================== #
-# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local
+# Custom-code evaluator runner: restricted (default, in-process sandbox) | local
+# (no sandbox, raw exec — trusted/single-tenant only) | daytona (isolated remote sandbox)
+# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted
 # AGENTA_SERVICES_HOOK_ALLOW_INSECURE=true
 # AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=true
 # AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED=true
diff --git a/hosting/docker-compose/ee/env.ee.gh.example b/hosting/docker-compose/ee/env.ee.gh.example
index 0310e956b0..4b7d467262 100644
--- a/hosting/docker-compose/ee/env.ee.gh.example
+++ b/hosting/docker-compose/ee/env.ee.gh.example
@@ -78,7 +78,9 @@ AGENTA_CRYPT_KEY=replace-me
 # ================================================================== #
 # Agenta - Services (code/hook/middleware)
 # ================================================================== #
-# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local
+# Custom-code evaluator runner: restricted (default, in-process sandbox) | local
+# (no sandbox, raw exec — trusted/single-tenant only) | daytona (isolated remote sandbox)
+# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted
 # AGENTA_SERVICES_HOOK_ALLOW_INSECURE=true
 # AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=true
 # AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED=true
diff --git a/hosting/docker-compose/oss/env.oss.dev.example b/hosting/docker-compose/oss/env.oss.dev.example
index 53aca58e84..b8eef07dbf 100644
--- a/hosting/docker-compose/oss/env.oss.dev.example
+++ b/hosting/docker-compose/oss/env.oss.dev.example
@@ -78,7 +78,9 @@ AGENTA_CRYPT_KEY=replace-me
 # ================================================================== #
 # Agenta - Services (code/hook/middleware)
 # ================================================================== #
-# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local
+# Custom-code evaluator runner: restricted (default, in-process sandbox) | local
+# (no sandbox, raw exec — trusted/single-tenant only) | daytona (isolated remote sandbox)
+# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted
 # AGENTA_SERVICES_HOOK_ALLOW_INSECURE=true
 # AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=true
 # AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED=true
diff --git a/hosting/docker-compose/oss/env.oss.gh.example b/hosting/docker-compose/oss/env.oss.gh.example
index 4fc6b1244e..aac6f133a1 100644
--- a/hosting/docker-compose/oss/env.oss.gh.example
+++ b/hosting/docker-compose/oss/env.oss.gh.example
@@ -78,7 +78,9 @@ AGENTA_CRYPT_KEY=replace-me
 # ================================================================== #
 # Agenta - Services (code/hook/middleware)
 # ================================================================== #
-# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local
+# Custom-code evaluator runner: restricted (default, in-process sandbox) | local
+# (no sandbox, raw exec — trusted/single-tenant only) | daytona (isolated remote sandbox)
+# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted
 # AGENTA_SERVICES_HOOK_ALLOW_INSECURE=true
 # AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=true
 # AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED=true
diff --git a/hosting/kubernetes/ee/values.ee.example.yaml b/hosting/kubernetes/ee/values.ee.example.yaml
index e78ee154fa..5a220ecb65 100644
--- a/hosting/kubernetes/ee/values.ee.example.yaml
+++ b/hosting/kubernetes/ee/values.ee.example.yaml
@@ -113,7 +113,7 @@ global:
 # === agenta.services ===
 # Consumed by the agenta SDK running inside services pods:
 #   - hook.allowInsecure   → AGENTA_SERVICES_HOOK_ALLOW_INSECURE
-#   - code.sandboxRunner   → AGENTA_SERVICES_CODE_SANDBOX_RUNNER (local|daytona)
+#   - code.sandboxRunner   → AGENTA_SERVICES_CODE_SANDBOX_RUNNER (restricted|local|daytona; default restricted. local = no sandbox, trusted only)
 #   - middleware.authEnabled    → AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED
 #   - middleware.cachingEnabled → AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED
 # ================================================================== #
@@ -122,7 +122,7 @@ global:
 #     hook:
 #       allowInsecure: false
 #     code:
-#       sandboxRunner: local
+#       sandboxRunner: restricted
 #     middleware:
 #       authEnabled: true
 #       cachingEnabled: true
diff --git a/hosting/kubernetes/oss/values.oss.example.yaml b/hosting/kubernetes/oss/values.oss.example.yaml
index c9e0f35f75..602d2a65bb 100644
--- a/hosting/kubernetes/oss/values.oss.example.yaml
+++ b/hosting/kubernetes/oss/values.oss.example.yaml
@@ -100,7 +100,7 @@ agenta:
 # === agenta.services ===
 # Consumed by the agenta SDK running inside services pods:
 #   - hook.allowInsecure   → AGENTA_SERVICES_HOOK_ALLOW_INSECURE
-#   - code.sandboxRunner   → AGENTA_SERVICES_CODE_SANDBOX_RUNNER (local|daytona)
+#   - code.sandboxRunner   → AGENTA_SERVICES_CODE_SANDBOX_RUNNER (restricted|local|daytona; default restricted. local = no sandbox, trusted only)
 #   - middleware.authEnabled    → AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED
 #   - middleware.cachingEnabled → AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED
 # ================================================================== #
@@ -109,7 +109,7 @@ agenta:
 #     hook:
 #       allowInsecure: false
 #     code:
-#       sandboxRunner: local
+#       sandboxRunner: restricted
 #     middleware:
 #       authEnabled: true
 #       cachingEnabled: true
diff --git a/sdks/python/agenta/sdk/engines/running/runners/registry.py b/sdks/python/agenta/sdk/engines/running/runners/registry.py
index 4c0530c5d9..dc582c8947 100644
--- a/sdks/python/agenta/sdk/engines/running/runners/registry.py
+++ b/sdks/python/agenta/sdk/engines/running/runners/registry.py
@@ -3,10 +3,14 @@
 
 from agenta.sdk.engines.running.runners.base import CodeRunner
 from agenta.sdk.engines.running.runners.local import LocalRunner
+from agenta.sdk.engines.running.runners.restricted import RestrictedRunner
+from agenta.sdk.utils.logging import get_module_logger
 
 if TYPE_CHECKING:
     from agenta.sdk.engines.running.runners.daytona import DaytonaRunner
 
+log = get_module_logger(__name__)
+
 
 def _get_daytona_runner() -> "DaytonaRunner":
     from agenta.sdk.engines.running.runners.daytona import DaytonaRunner
@@ -20,34 +24,44 @@ def get_runner() -> CodeRunner:
 
     Reads AGENTA_SERVICES_CODE_SANDBOX_RUNNER (canonical, v0.100.3+) with a
     fallback to the legacy AGENTA_SERVICES_SANDBOX_RUNNER.
-    - "local" (default): Uses current container for local execution
-    - "daytona": Uses Daytona remote sandbox
+    - "restricted" (default): In-process RestrictedPython sandbox (allowlisted imports).
+    - "local": Raw exec in the current process — no sandbox. Trusted deployments only.
+    - "daytona": Remote Daytona sandbox (strongest isolation).
 
     Returns:
-        CodeRunner: An instance of LocalRunner or DaytonaRunner
+        CodeRunner: An instance of RestrictedRunner, LocalRunner, or DaytonaRunner
 
     Raises:
-        ValueError: If Daytona runner is selected but required environment variables are missing
+        ValueError: If an unknown runner is selected, or Daytona is selected but its
+            required environment variables are missing.
     """
     runner_type = (
         os.getenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER")
         or os.getenv("AGENTA_SERVICES_SANDBOX_RUNNER")
-        or "local"
+        or "restricted"
     ).lower()
 
-    if runner_type == "daytona":
+    if runner_type == "restricted":
+        return RestrictedRunner()
+    elif runner_type == "local":
+        log.warning(
+            "Custom-code evaluators are using the 'local' runner: user code runs with "
+            "raw exec() and no sandbox in this process. Use it only for trusted/"
+            "single-tenant deployments. Set AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted "
+            "(default) or =daytona for untrusted evaluator authors."
+        )
+        return LocalRunner()
+    elif runner_type == "daytona":
         try:
             return _get_daytona_runner()
         except ImportError as exc:
             raise ValueError(
                 "Daytona runner requires the 'daytona' package. "
                 "Install optional dependencies or set "
-                "AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local."
+                "AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted."
             ) from exc
-    elif runner_type == "local":
-        return LocalRunner()
     else:
         raise ValueError(
             f"Unknown AGENTA_SERVICES_CODE_SANDBOX_RUNNER value: {runner_type}. "
-            f"Supported values: 'local', 'daytona'"
+            f"Supported values: 'restricted', 'local', 'daytona'"
         )
diff --git a/sdks/python/agenta/sdk/engines/running/runners/restricted.py b/sdks/python/agenta/sdk/engines/running/runners/restricted.py
new file mode 100644
index 0000000000..3e4eeb4d0f
--- /dev/null
+++ b/sdks/python/agenta/sdk/engines/running/runners/restricted.py
@@ -0,0 +1,278 @@
+import builtins as _py_builtins
+import operator
+import types
+from typing import Any, Dict, Union, Optional
+
+from RestrictedPython import compile_restricted, safe_builtins, PrintCollector
+from RestrictedPython.Eval import default_guarded_getiter, default_guarded_getitem
+from RestrictedPython.Guards import (
+    safer_getattr,
+    guarded_iter_unpack_sequence,
+    full_write_guard,
+)
+
+from agenta.sdk.engines.running.runners.base import CodeRunner
+
+
+# Pure data/iteration builtins that RestrictedPython's safe_builtins omits but
+# evaluators routinely need. All operate on data only — none reach the host or
+# the class graph, so adding them does not widen the sandbox (escapes go through
+# attribute access, which safer_getattr blocks).
+_SAFE_EXTRA_BUILTINS = (
+    "dict",
+    "list",
+    "set",
+    "frozenset",
+    "min",
+    "max",
+    "sum",
+    "enumerate",
+    "map",
+    "filter",
+    "reversed",
+    "all",
+    "any",
+)
+
+
+# Pure-computation stdlib modules only: no filesystem, network, or process reach.
+# Deliberately strict — anything that can touch the host (os, subprocess, sys,
+# pathlib, socket, importlib, io, shutil, ...) or the network (httpx, urllib,
+# requests, ...) is excluded. Operators who need unrestricted execution must opt
+# into the `local` runner; hostile multi-tenant should use `daytona`.
+_ALLOWED_IMPORTS = frozenset(
+    {
+        "math",
+        "statistics",
+        "datetime",
+        "json",
+        "re",
+        "random",
+        "string",
+        "typing",
+        "collections",
+        "itertools",
+        "functools",
+    }
+)
+
+
+# Attributes of allowlisted modules that must stay unreachable. ``string.Formatter``
+# brings back the format-string attribute traversal that ``safer_getattr`` already
+# refuses on ``str.format`` / ``str.format_map``.
+_DENIED_MODULE_ATTRS = {"string": frozenset({"Formatter"})}
+
+
+# Operator strings RestrictedPython hands to ``_inplacevar_`` after rewriting
+# ``name <op>= value`` into ``name = _inplacevar_("<op>=", name, value)``.
+_INPLACE_OPS = {
+    "+=": operator.iadd,
+    "-=": operator.isub,
+    "*=": operator.imul,
+    "/=": operator.itruediv,
+    "//=": operator.ifloordiv,
+    "%=": operator.imod,
+    "**=": operator.ipow,
+    "<<=": operator.ilshift,
+    ">>=": operator.irshift,
+    "&=": operator.iand,
+    "|=": operator.ior,
+    "^=": operator.ixor,
+    "@=": operator.imatmul,
+}
+
+
+def _leaks_host_access(owner: Any, attr: str, value: Any) -> bool:
+    """Return True when ``owner.attr`` must not be handed to evaluator code.
+
+    Allowlisted modules re-export modules they import themselves (for example
+    ``statistics.sys`` or ``json.codecs``), which would otherwise lead straight
+    back to the host. Any module object outside the allowlist is refused, as is
+    every name listed in ``_DENIED_MODULE_ATTRS``.
+    """
+    if isinstance(value, types.ModuleType):
+        root = (getattr(value, "__name__", None) or "").split(".")[0]
+        if root not in _ALLOWED_IMPORTS:
+            return True
+    if isinstance(owner, types.ModuleType):
+        owner_name = getattr(owner, "__name__", None) or ""
+        return attr in _DENIED_MODULE_ATTRS.get(owner_name, ())
+    return False
+
+
+def _guarded_getattr(obj, name, default=None):
+    """``_getattr_`` hook: ``safer_getattr`` plus the module-leak check."""
+    value = safer_getattr(obj, name, default)
+    if _leaks_host_access(obj, name, value):
+        raise AttributeError(
+            f"Access to '{name}' is not allowed in the restricted evaluator sandbox."
+        )
+    return value
+
+
+def _inplacevar(op, target, operand):
+    """``_inplacevar_`` hook: apply augmented assignment ``op`` to a plain name."""
+    try:
+        apply = _INPLACE_OPS[op]
+    except KeyError:
+        raise NotImplementedError(
+            f"Augmented assignment '{op}' is not supported in the restricted sandbox."
+        ) from None
+    return apply(target, operand)
+
+
+def _safe_import(name, globals=None, locals=None, fromlist=(), level=0):
+    """Guarded ``__import__`` that only permits the pure-stdlib allowlist.
+
+    Replaces the real ``__import__`` inside the sandbox so user evaluator code
+    cannot import host-reaching modules. Relative imports (``level != 0``) are
+    rejected outright, and every name in ``fromlist`` is vetted as well.
+
+    Raises:
+        ImportError: If the module, or a name requested from it, is not allowed.
+    """
+    root = name.split(".")[0]
+    if level != 0 or root not in _ALLOWED_IMPORTS:
+        raise ImportError(
+            f"Import of '{name}' is not allowed in the restricted evaluator sandbox. "
+            f"Allowed modules: {', '.join(sorted(_ALLOWED_IMPORTS))}. "
+            "To run unrestricted evaluator code set "
+            "AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local (trusted deployments only)."
+        )
+    module = __import__(name, globals, locals, fromlist, level)
+    # ``from X import Y`` binds Y with a plain attribute lookup that never passes
+    # through ``_getattr_``, so each requested name has to be checked here.
+    for attr in fromlist or ():
+        value = getattr(module, attr, None)
+        if attr == "*" or _leaks_host_access(module, attr, value):
+            raise ImportError(
+                f"Import of '{attr}' from '{name}' is not allowed in the restricted "
+                "evaluator sandbox."
+            )
+    return module
+
+
+def _build_restricted_globals() -> Dict[str, Any]:
+    """Build the execution globals for RestrictedPython.
+
+    Closes the two holes the previous sandbox had:
+    1. it injected the real ``__import__`` (here: a guarded allowlist import), and
+    2. it never set ``_getattr_`` (here: a ``safer_getattr``-based guard blocks
+       dunder/underscore attribute access, which defeats the
+       ``().__class__.__bases__`` gadget escape).
+
+    ``_inplacevar_`` is provided too; without it ``total += x`` on a plain name
+    fails at run time with ``NameError``.
+    """
+    builtins = dict(safe_builtins)
+    for name in _SAFE_EXTRA_BUILTINS:
+        builtins[name] = getattr(_py_builtins, name)
+    builtins["__import__"] = _safe_import
+    # Re-point any attribute-lookup entry safe_builtins ships at the same guard so
+    # a ``getattr(module, "sys")`` call cannot sidestep the module-leak check.
+    for name in ("getattr", "_getattr_"):
+        if name in builtins:
+            builtins[name] = _guarded_getattr
+
+    return {
+        "__builtins__": builtins,
+        "_getattr_": _guarded_getattr,
+        "_getitem_": default_guarded_getitem,
+        "_getiter_": default_guarded_getiter,
+        "_iter_unpack_sequence_": guarded_iter_unpack_sequence,
+        "_inplacevar_": _inplacevar,
+        "_write_": full_write_guard,
+        # print() goes through PrintCollector (captured, not real stdout).
+        "_print_": PrintCollector,
+    }
+
+
+class RestrictedRunner(CodeRunner):
+    """Default code runner: executes evaluator code in an in-process RestrictedPython sandbox."""
+
+    def run(
+        self,
+        code: str,
+        app_params: Dict[str, Any],
+        inputs: Dict[str, Any],
+        output: Union[dict, str],
+        correct_answer: Any,
+        runtime: Optional[str] = None,
+        templates: Optional[Dict[str, str]] = None,
+        *,
+        version: str = "1",
+        trace: Optional[Dict[str, Any]] = None,
+    ) -> Union[float, None]:
+        """
+        Execute provided Python code in a RestrictedPython sandbox.
+
+        Args:
+            code: The Python code to be executed
+            app_params: The parameters of the app variant (v1 only)
+            inputs: Inputs to be used during code execution
+            output: The output of the app variant after being called
+            correct_answer: The correct answer (or target) for comparison (v1 only)
+            runtime: Runtime environment (only "python" is supported)
+            templates: Wrapper templates keyed by runtime (unused for in-process runners).
+            version: Evaluator interface version ("1" = legacy, "2" = new)
+            trace: Full trace data (v2 only)
+
+        Returns:
+            The value returned by ``evaluate`` converted to float. Failures raise;
+            this runner never returns None and does not clamp the score range.
+
+        Raises:
+            ValueError: If ``runtime`` is not "python".
+            SyntaxError: If the sandbox policy rejects the code at compile time.
+            KeyError: If ``evaluate`` is not defined (or the code raises KeyError).
+            RuntimeError: For any other error raised while running the code.
+        """
+        # Normalize runtime: None means python
+        runtime = runtime or "python"
+
+        # The restricted sandbox runs in-process and only supports Python.
+        if runtime != "python":
+            raise ValueError(
+                f"RestrictedRunner only supports 'python' runtime, got: {runtime}. "
+                "Use the Daytona runner for javascript/typescript."
+            )
+
+        try:
+            byte_code = compile_restricted(code, filename="<inline>", mode="exec")
+        except SyntaxError as e:
+            raise SyntaxError(f"Syntax error in provided code: {e}") from e
+
+        environment = _build_restricted_globals()
+
+        try:
+            exec(byte_code, environment)
+
+            fn = environment["evaluate"]
+
+            if version == "2":
+                result = fn(inputs, output, trace)
+            else:
+                result = fn(app_params, inputs, output, correct_answer)
+
+            # Attempt to convert result to float
+            if isinstance(result, (float, int, str)):
+                try:
+                    result = float(result)
+                except ValueError as e:
+                    raise ValueError(f"Result cannot be converted to float: {e}") from e
+
+            if not isinstance(result, float):
+                raise TypeError(
+                    f"Result is not a float after conversion: {type(result)}"
+                )
+
+            return result
+
+        except KeyError as e:
+            raise KeyError(f"Missing expected key in environment: {e}") from e
+
+        except SyntaxError as e:
+            raise SyntaxError(f"Syntax error in provided code: {e}") from e
+
+        except Exception as e:
+            raise RuntimeError(f"Error during code execution: {e}") from e
diff --git a/sdks/python/agenta/sdk/engines/running/sandbox.py b/sdks/python/agenta/sdk/engines/running/sandbox.py
index 2e013b5f3f..74a9ae219a 100644
--- a/sdks/python/agenta/sdk/engines/running/sandbox.py
+++ b/sdks/python/agenta/sdk/engines/running/sandbox.py
@@ -6,23 +6,6 @@
 _runner = None
 
 
-def is_import_safe(python_code: Text) -> bool:
-    """Checks if the imports in the python code contains a system-level import.
-
-    Args:
-        python_code (str): The Python code to be executed
-
-    Returns:
-        bool - module is secured or not
-    """
-
-    disallowed_imports = ["os", "subprocess", "threading", "multiprocessing"]
-    for import_ in disallowed_imports:
-        if import_ in python_code:
-            return False
-    return True
-
-
 def execute_code_safely(
     app_params: Dict[str, Any],
     inputs: Dict[str, Any],
diff --git a/sdks/python/oss/tests/pytest/utils/test_code_v0.py b/sdks/python/oss/tests/pytest/utils/test_code_v0.py
index b4cf854099..3437e58bf9 100644
--- a/sdks/python/oss/tests/pytest/utils/test_code_v0.py
+++ b/sdks/python/oss/tests/pytest/utils/test_code_v0.py
@@ -29,17 +29,17 @@
 
 _code_v0 = code_v0.__wrapped__
 
-# Mirrors registry.get_runner() precedence: local unless explicitly overridden.
+# Mirrors registry.get_runner() precedence: restricted unless explicitly overridden.
 _SANDBOX_RUNNER = (
     os.getenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER")
     or os.getenv("AGENTA_SERVICES_SANDBOX_RUNNER")
-    or "local"
+    or "restricted"
 ).lower()
 
-# Python execution works on both the local and daytona runners, so those tests
-# are not gated. JS/TS runtimes are only supported by the daytona runner (the
-# local runner rejects them), and daytona requires live sandbox credentials, so
-# JS/TS tests only run when daytona is selected.
+# Python execution works on the restricted, local, and daytona runners, so those
+# tests are not gated. JS/TS runtimes are only supported by the daytona runner
+# (the in-process runners reject them), and daytona requires live sandbox
+# credentials, so JS/TS tests only run when daytona is selected.
 daytona_only = pytest.mark.skipif(
     _SANDBOX_RUNNER != "daytona",
     reason=f"Daytona-only test (JS/TS runtime); AGENTA_SERVICES_CODE_SANDBOX_RUNNER={_SANDBOX_RUNNER}",
@@ -109,7 +109,7 @@ def test_unsupported_runtime_lua_raises(self):
 
 
 # ---------------------------------------------------------------------------
-# 2. Return-type normalisation (float path via LocalRunner)
+# 2. Return-type normalisation (float path via the in-process runner)
 # ---------------------------------------------------------------------------
 
 
@@ -133,7 +133,7 @@ def test_returns_integer_zero(self):
         assert r["success"] is False
 
     def test_boolean_true_becomes_score_one(self):
-        # LocalRunner converts bool via float(); True → 1.0
+        # the runner converts bool via float(); True → 1.0
         r = call(evaluate("return True"))
         assert r["score"] == pytest.approx(1.0)
         assert r["success"] is True
@@ -273,7 +273,7 @@ def test_empty_code_raises_code_error(self):
 
 
 # ---------------------------------------------------------------------------
-# 6. Runtime parameter (Python only for LocalRunner)
+# 6. Runtime parameter (Python only for the in-process runner)
 # ---------------------------------------------------------------------------
 
 
diff --git a/sdks/python/oss/tests/pytest/utils/test_restricted_runner.py b/sdks/python/oss/tests/pytest/utils/test_restricted_runner.py
new file mode 100644
index 0000000000..7fe2ab4e85
--- /dev/null
+++ b/sdks/python/oss/tests/pytest/utils/test_restricted_runner.py
@@ -0,0 +1,223 @@
+"""
+Unit tests for the RestrictedRunner (default custom-code evaluator sandbox) and
+the runner registry.
+
+The RestrictedRunner executes evaluator code in-process via RestrictedPython.
+These tests assert two things:
+
+1. Functionality — normal evaluators run and the v1/v2 interfaces both work,
+   allowlisted pure-stdlib imports (e.g. math) are available.
+2. Security — host-reaching imports and the classic attribute-gadget escape are
+   blocked (they raise instead of executing), which is the whole point of making
+   this the default.
+
+The runner re-raises failures as SyntaxError (compile-time rejection), KeyError, or
+RuntimeError (anything else raised while running, including ImportError), so
+"blocked" is asserted against the BLOCKED union below rather than a single type;
+BLOCKED also lists the unwrapped types so the tests hold whichever path fires.
+"""
+
+import pytest
+
+from agenta.sdk.engines.running.runners.local import LocalRunner
+from agenta.sdk.engines.running.runners.registry import get_runner
+from agenta.sdk.engines.running.runners.restricted import RestrictedRunner
+
+
+# Errors that all count as "the sandbox blocked this".
+BLOCKED = (SyntaxError, ImportError, RuntimeError, NameError, AttributeError, KeyError)
+
+
+def v1(body: str) -> str:
+    """Wrap a one-liner body in a legacy v1 evaluate() function."""
+    return f"def evaluate(app_params, inputs, output, correct_answer):\n    {body}\n"
+
+
+def v2(body: str) -> str:
+    """Wrap a one-liner body in a v2 evaluate() function."""
+    return f"def evaluate(inputs, output, trace):\n    {body}\n"
+
+
+def run_v2(runner, code, *, inputs=None, output="", trace=None):
+    return runner.run(
+        code, {}, inputs or {}, output, None, "python", None, version="2", trace=trace
+    )
+
+
+def run_v1(runner, code, *, inputs=None, output="", correct_answer=None):
+    return runner.run(
+        code, {}, inputs or {}, output, correct_answer, "python", None, version="1"
+    )
+
+
+# ---------------------------------------------------------------------------
+# 1. Functionality
+# ---------------------------------------------------------------------------
+
+
+class TestRestrictedRunnerFunctionality:
+    def setup_method(self):
+        self.runner = RestrictedRunner()
+
+    def test_v2_returns_float(self):
+        assert run_v2(self.runner, v2("return 0.5")) == 0.5
+
+    def test_v1_returns_float(self):
+        assert run_v1(self.runner, v1("return 1.0")) == 1.0
+
+    def test_int_coerced_to_float(self):
+        assert run_v2(self.runner, v2("return 1")) == 1.0
+
+    def test_bool_coerced_to_float(self):
+        assert run_v2(self.runner, v2("return True")) == 1.0
+
+    def test_inputs_accessible(self):
+        code = v2("return 1.0 if inputs.get('x') == 'hi' else 0.0")
+        assert run_v2(self.runner, code, inputs={"x": "hi"}) == 1.0
+
+    def test_output_and_builtins_accessible(self):
+        code = v2("return float(len(output)) / 10.0")
+        assert run_v2(self.runner, code, output="abcde") == 0.5
+
+    def test_trace_accessible(self):
+        code = v2("return 1.0 if (trace or {}).get('latency') == 42 else 0.0")
+        assert run_v2(self.runner, code, trace={"latency": 42}) == 1.0
+
+    def test_allowed_import_math(self):
+        code = (
+            "import math\n"
+            "def evaluate(inputs, output, trace):\n"
+            "    return math.sqrt(0.25)\n"
+        )
+        assert run_v2(self.runner, code) == 0.5
+
+    def test_allowed_import_json(self):
+        code = (
+            "import json\n"
+            "def evaluate(inputs, output, trace):\n"
+            "    return float(json.loads('1'))\n"
+        )
+        assert run_v2(self.runner, code) == 1.0
+
+    def test_non_python_runtime_rejected(self):
+        with pytest.raises(ValueError):
+            self.runner.run(
+                v2("return 1.0"),
+                {},
+                {},
+                "",
+                None,
+                "javascript",
+                None,
+                version="2",
+                trace={},
+            )
+
+    def test_missing_evaluate_raises(self):
+        code = "def not_evaluate(inputs, output, trace):\n    return 1.0\n"
+        with pytest.raises(BLOCKED):
+            run_v2(self.runner, code)
+
+
+# ---------------------------------------------------------------------------
+# 2. Security — escapes must be blocked
+# ---------------------------------------------------------------------------
+
+
+class TestRestrictedRunnerSecurity:
+    def setup_method(self):
+        self.runner = RestrictedRunner()
+
+    def test_blocks_import_os(self):
+        code = (
+            "import os\n"
+            "def evaluate(inputs, output, trace):\n"
+            "    os.system('echo pwned')\n"
+            "    return 1.0\n"
+        )
+        with pytest.raises(BLOCKED):
+            run_v2(self.runner, code)
+
+    def test_blocks_import_subprocess(self):
+        code = (
+            "import subprocess\ndef evaluate(inputs, output, trace):\n    return 1.0\n"
+        )
+        with pytest.raises(BLOCKED):
+            run_v2(self.runner, code)
+
+    def test_blocks_import_httpx_no_network(self):
+        # httpx is deliberately excluded: no outbound network from the sandbox.
+        code = "import httpx\ndef evaluate(inputs, output, trace):\n    return 1.0\n"
+        with pytest.raises(BLOCKED):
+            run_v2(self.runner, code)
+
+    def test_blocks_dunder_import_call(self):
+        code = v2("return __import__('os').system('echo pwned')")
+        with pytest.raises(BLOCKED):
+            run_v2(self.runner, code)
+
+    def test_blocks_open_builtin(self):
+        code = v2("return float(open('/etc/passwd').read()[:0] == '')")
+        with pytest.raises(BLOCKED):
+            run_v2(self.runner, code)
+
+    def test_blocks_class_gadget_escape(self):
+        # The classic RestrictedPython escape: reach a host module via the object
+        # graph. Blocked because dunder attribute access is denied.
+        code = v2("return float(len(().__class__.__bases__[0].__subclasses__()) > 0)")
+        with pytest.raises(BLOCKED):
+            run_v2(self.runner, code)
+
+    def test_blocks_eval_builtin(self):
+        code = v2("return float(eval('1+1'))")
+        with pytest.raises(BLOCKED):
+            run_v2(self.runner, code)
+
+
+# ---------------------------------------------------------------------------
+# 3. Registry selection
+# ---------------------------------------------------------------------------
+
+
+class TestGetRunner:
+    def _clear(self, monkeypatch):
+        monkeypatch.delenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER", raising=False)
+        monkeypatch.delenv("AGENTA_SERVICES_SANDBOX_RUNNER", raising=False)
+
+    def test_default_is_restricted(self, monkeypatch):
+        self._clear(monkeypatch)
+        assert isinstance(get_runner(), RestrictedRunner)
+
+    def test_explicit_restricted(self, monkeypatch):
+        self._clear(monkeypatch)
+        monkeypatch.setenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER", "restricted")
+        assert isinstance(get_runner(), RestrictedRunner)
+
+    def test_local_opt_in(self, monkeypatch):
+        self._clear(monkeypatch)
+        monkeypatch.setenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER", "local")
+        assert isinstance(get_runner(), LocalRunner)
+
+    def test_legacy_var_still_works(self, monkeypatch):
+        self._clear(monkeypatch)
+        monkeypatch.setenv("AGENTA_SERVICES_SANDBOX_RUNNER", "local")
+        assert isinstance(get_runner(), LocalRunner)
+
+    def test_canonical_overrides_legacy(self, monkeypatch):
+        self._clear(monkeypatch)
+        monkeypatch.setenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER", "restricted")
+        monkeypatch.setenv("AGENTA_SERVICES_SANDBOX_RUNNER", "local")
+        assert isinstance(get_runner(), RestrictedRunner)
+
+    def test_daytona_without_key_raises(self, monkeypatch):
+        self._clear(monkeypatch)
+        monkeypatch.setenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER", "daytona")
+        monkeypatch.delenv("DAYTONA_API_KEY", raising=False)
+        with pytest.raises(ValueError):
+            get_runner()
+
+    def test_unknown_value_raises(self, monkeypatch):
+        self._clear(monkeypatch)
+        monkeypatch.setenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER", "nope")
+        with pytest.raises(ValueError):
+            get_runner()
diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml
index e880ae41cf..e5e998d584 100644
--- a/sdks/python/pyproject.toml
+++ b/sdks/python/pyproject.toml
@@ -35,6 +35,7 @@ dependencies = [
     "orjson>=3,<4",
     "python-jsonpath>=2,<3",
     "daytona>=0.184,<0.185",
+    "RestrictedPython>=8,<9",
 ]
 
 [project.urls]
diff --git a/sdks/python/uv.lock b/sdks/python/uv.lock
index dd4f380fe2..7fd4b79113 100644
--- a/sdks/python/uv.lock
+++ b/sdks/python/uv.lock
@@ -23,6 +23,7 @@ dependencies = [
     { name = "pydantic" },
     { name = "python-jsonpath" },
     { name = "pyyaml" },
+    { name = "restrictedpython" },
     { name = "structlog" },
 ]
 
@@ -61,6 +62,7 @@ requires-dist = [
     { name = "pydantic", specifier = ">=2,<3" },
     { name = "python-jsonpath", specifier = ">=2,<3" },
     { name = "pyyaml", specifier = ">=6,<7" },
+    { name = "restrictedpython", specifier = ">=8,<9" },
     { name = "structlog", specifier = ">=25,<26" },
 ]
 
@@ -1866,6 +1868,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a0/f4/c67b0b3f1b9245e8d266f0f112c500d50e5b4e83cb6f3b71b6528104182a/requests-2.34.2-py3-none-any.whl", hash = "sha256:2a0d60c172f83ac6ab31e4554906c0f3b3588d37b5cb939b1c061f4907e278e0", size = 73075, upload-time = "2026-05-14T19:25:26.443Z" },
 ]
 
+[[package]]
+name = "restrictedpython"
+version = "8.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/97/04/7314639eab9edd57b5c8f6e158a558f5a5a65453c37a51fb8f9ecc1d112e/restrictedpython-8.2.tar.gz", hash = "sha256:b835c2ff87d71b76f6d8b129096cc5a1e893bf01839ab802ba47a4129f1bdfe4", size = 449323, upload-time = "2026-05-29T06:27:57.285Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/35/59/b2778bfafcc37fdfd87c02702bbfc5753c58c28c7eb82999b7b49ec6e0e7/restrictedpython-8.2-py3-none-any.whl", hash = "sha256:6ff69e2fe06674bfe88943f17f2ce8353a9bc743ee37ea7bec38ff8b4851afd1", size = 27176, upload-time = "2026-05-29T06:27:55.651Z" },
+]
+
 [[package]]
 name = "rich"
 version = "15.0.0"