Agenta-AI · mmabrouk · Jun 11, 2026
diff --git a/api/oss/src/utils/env.py b/api/oss/src/utils/env.py
@@ -300,7 +300,7 @@ class ServicesCodeConfig(BaseModel):
     sandbox_runner: str = (
         os.getenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER")
         or os.getenv("AGENTA_SERVICES_SANDBOX_RUNNER")
-        or "local"
+        or "restricted"
     )
 
     model_config = ConfigDict(extra="ignore")

diff --git a/docs/docs/evaluation/configure-evaluators/07-custom-evaluator.mdx b/docs/docs/evaluation/configure-evaluators/07-custom-evaluator.mdx
@@ -5,6 +5,10 @@ description: "Write custom evaluators in Python, JavaScript, or TypeScript with
 
 Custom code evaluators let you write your own evaluation logic in Python, JavaScript, or TypeScript. Your code has access to the application inputs, outputs, and the full execution trace (spans, latency, token usage, costs).
 
+:::warning Self-hosted deployments only
+On self-hosted Agenta, custom evaluator code runs server-side. By default it runs in a restricted, in-process Python sandbox that is designed to block filesystem, network, and host access. This default runner executes Python evaluators only and rejects JavaScript/TypeScript evaluators. Operators can change the runner with the `AGENTA_SERVICES_CODE_SANDBOX_RUNNER` environment variable: `local` runs code with no sandbox (trusted authors only), `daytona` runs it in an isolated remote sandbox (use it for JavaScript/TypeScript evaluators and for untrusted authors). See [environment configuration](/self-host/configuration). Agenta Cloud is unaffected — it isolates evaluator execution.
+:::
+
 ## Function signature
 
 Your code must define an `evaluate` function with the following signature:

diff --git a/docs/docs/self-host/02-configuration.mdx b/docs/docs/self-host/02-configuration.mdx
@@ -109,6 +109,16 @@ This key has no env-var or `env.py` equivalent.
 | `AGENTA_SERVICES_HOOK_ALLOW_INSECURE` | `agenta.services.hook.allow_insecure` | `agenta.services.hook.allowInsecure` |
 | `AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED` | `agenta.services.middleware.caching_enabled` | `agenta.services.middleware.cachingEnabled` |
 
+:::warning Custom-code evaluator runner
+`AGENTA_SERVICES_CODE_SANDBOX_RUNNER` selects how [custom-code evaluators](/evaluation/configure-evaluators/custom-evaluator) execute:
+
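+- `restricted` (default) — in-process Python sandbox with limited builtins and an allowlist of pure-standard-library imports, designed to block filesystem, network, and host access. Python evaluators only; JavaScript/TypeScript evaluators are rejected (use `daytona` for those).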
+- `local` — raw execution in the services process with **no sandbox**. Any author who can create a custom-code evaluator can run arbitrary code on the host. Use only for trusted, single-tenant deployments.
+- `daytona` — isolated remote sandbox (strongest). Recommended when evaluator authors are not fully trusted. Requires the [daytona](#daytona) credentials below.
+
+The legacy `AGENTA_SERVICES_SANDBOX_RUNNER` is still accepted as a fallback.
+:::
+
 ## Agenta — webhooks
 
 | Env var | env.py path | values.yaml path |

diff --git a/hosting/docker-compose/ee/env.ee.dev.example b/hosting/docker-compose/ee/env.ee.dev.example
@@ -78,7 +78,9 @@ AGENTA_CRYPT_KEY=replace-me
 # ================================================================== #
 # Agenta - Services (code/hook/middleware)
 # ================================================================== #
-# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local
+# Custom-code evaluator runner: restricted (default, in-process sandbox) | local
+# (no sandbox, raw exec — trusted/single-tenant only) | daytona (isolated remote sandbox)
+# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted
 # AGENTA_SERVICES_HOOK_ALLOW_INSECURE=true
 # AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=true
 # AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED=true

diff --git a/hosting/docker-compose/ee/env.ee.gh.example b/hosting/docker-compose/ee/env.ee.gh.example
@@ -78,7 +78,9 @@ AGENTA_CRYPT_KEY=replace-me
 # ================================================================== #
 # Agenta - Services (code/hook/middleware)
 # ================================================================== #
-# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local
+# Custom-code evaluator runner: restricted (default, in-process sandbox) | local
+# (no sandbox, raw exec — trusted/single-tenant only) | daytona (isolated remote sandbox)
+# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted
 # AGENTA_SERVICES_HOOK_ALLOW_INSECURE=true
 # AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=true
 # AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED=true

diff --git a/hosting/docker-compose/oss/env.oss.dev.example b/hosting/docker-compose/oss/env.oss.dev.example
@@ -78,7 +78,9 @@ AGENTA_CRYPT_KEY=replace-me
 # ================================================================== #
 # Agenta - Services (code/hook/middleware)
 # ================================================================== #
-# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local
+# Custom-code evaluator runner: restricted (default, in-process sandbox) | local
+# (no sandbox, raw exec — trusted/single-tenant only) | daytona (isolated remote sandbox)
+# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted
 # AGENTA_SERVICES_HOOK_ALLOW_INSECURE=true
 # AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=true
 # AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED=true

diff --git a/hosting/docker-compose/oss/env.oss.gh.example b/hosting/docker-compose/oss/env.oss.gh.example
@@ -78,7 +78,9 @@ AGENTA_CRYPT_KEY=replace-me
 # ================================================================== #
 # Agenta - Services (code/hook/middleware)
 # ================================================================== #
-# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local
+# Custom-code evaluator runner: restricted (default, in-process sandbox) | local
+# (no sandbox, raw exec — trusted/single-tenant only) | daytona (isolated remote sandbox)
+# AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted
 # AGENTA_SERVICES_HOOK_ALLOW_INSECURE=true
 # AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED=true
 # AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED=true

diff --git a/hosting/kubernetes/ee/values.ee.example.yaml b/hosting/kubernetes/ee/values.ee.example.yaml
@@ -113,7 +113,7 @@ global:
 # === agenta.services ===
 # Consumed by the agenta SDK running inside services pods:
 #   - hook.allowInsecure   → AGENTA_SERVICES_HOOK_ALLOW_INSECURE
-#   - code.sandboxRunner   → AGENTA_SERVICES_CODE_SANDBOX_RUNNER (local|daytona)
+#   - code.sandboxRunner   → AGENTA_SERVICES_CODE_SANDBOX_RUNNER (restricted|local|daytona; default restricted. local = no sandbox, trusted only)
 #   - middleware.authEnabled    → AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED
 #   - middleware.cachingEnabled → AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED
 # ================================================================== #
@@ -122,7 +122,7 @@ global:
 #     hook:
 #       allowInsecure: false
 #     code:
-#       sandboxRunner: local
+#       sandboxRunner: restricted
 #     middleware:
 #       authEnabled: true
 #       cachingEnabled: true

diff --git a/hosting/kubernetes/oss/values.oss.example.yaml b/hosting/kubernetes/oss/values.oss.example.yaml
@@ -100,7 +100,7 @@ agenta:
 # === agenta.services ===
 # Consumed by the agenta SDK running inside services pods:
 #   - hook.allowInsecure   → AGENTA_SERVICES_HOOK_ALLOW_INSECURE
-#   - code.sandboxRunner   → AGENTA_SERVICES_CODE_SANDBOX_RUNNER (local|daytona)
+#   - code.sandboxRunner   → AGENTA_SERVICES_CODE_SANDBOX_RUNNER (restricted|local|daytona; default restricted. local = no sandbox, trusted only)
 #   - middleware.authEnabled    → AGENTA_SERVICES_MIDDLEWARE_AUTH_ENABLED
 #   - middleware.cachingEnabled → AGENTA_SERVICES_MIDDLEWARE_CACHING_ENABLED
 # ================================================================== #
@@ -109,7 +109,7 @@ agenta:
 #     hook:
 #       allowInsecure: false
 #     code:
-#       sandboxRunner: local
+#       sandboxRunner: restricted
 #     middleware:
 #       authEnabled: true
 #       cachingEnabled: true

diff --git a/sdks/python/agenta/sdk/engines/running/runners/registry.py b/sdks/python/agenta/sdk/engines/running/runners/registry.py
@@ -3,10 +3,14 @@
 
 from agenta.sdk.engines.running.runners.base import CodeRunner
 from agenta.sdk.engines.running.runners.local import LocalRunner
+from agenta.sdk.engines.running.runners.restricted import RestrictedRunner
+from agenta.sdk.utils.logging import get_module_logger
 
 if TYPE_CHECKING:
     from agenta.sdk.engines.running.runners.daytona import DaytonaRunner
 
+log = get_module_logger(__name__)
+
 
 def _get_daytona_runner() -> "DaytonaRunner":
     from agenta.sdk.engines.running.runners.daytona import DaytonaRunner
@@ -20,34 +24,44 @@ def get_runner() -> CodeRunner:
 
     Reads AGENTA_SERVICES_CODE_SANDBOX_RUNNER (canonical, v0.100.3+) with a
     fallback to the legacy AGENTA_SERVICES_SANDBOX_RUNNER.
-    - "local" (default): Uses current container for local execution
-    - "daytona": Uses Daytona remote sandbox
+    - "restricted" (default): In-process RestrictedPython sandbox (allowlisted imports).
+    - "local": Raw exec in the current process — no sandbox. Trusted deployments only.
+    - "daytona": Remote Daytona sandbox (strongest isolation).
 
     Returns:
-        CodeRunner: An instance of LocalRunner or DaytonaRunner
+        CodeRunner: An instance of RestrictedRunner, LocalRunner, or DaytonaRunner
 
     Raises:
-        ValueError: If Daytona runner is selected but required environment variables are missing
+        ValueError: If an unknown runner is selected, or Daytona is selected but its
+            required environment variables are missing.
     """
     runner_type = (
         os.getenv("AGENTA_SERVICES_CODE_SANDBOX_RUNNER")
         or os.getenv("AGENTA_SERVICES_SANDBOX_RUNNER")
-        or "local"
+        or "restricted"
     ).lower()
 
-    if runner_type == "daytona":
+    if runner_type == "restricted":
+        return RestrictedRunner()
+    elif runner_type == "local":
+        log.warning(
+            "Custom-code evaluators are using the 'local' runner: user code runs with "
+            "raw exec() and no sandbox in this process. Use it only for trusted/"
+            "single-tenant deployments. Set AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted "
+            "(default) or =daytona for untrusted evaluator authors."
+        )
+        return LocalRunner()
+    elif runner_type == "daytona":
         try:
             return _get_daytona_runner()
         except ImportError as exc:
             raise ValueError(
                 "Daytona runner requires the 'daytona' package. "
                 "Install optional dependencies or set "
-                "AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local."
+                "AGENTA_SERVICES_CODE_SANDBOX_RUNNER=restricted."
             ) from exc
-    elif runner_type == "local":
-        return LocalRunner()
     else:
         raise ValueError(
             f"Unknown AGENTA_SERVICES_CODE_SANDBOX_RUNNER value: {runner_type}. "
-            f"Supported values: 'local', 'daytona'"
+            f"Supported values: 'restricted', 'local', 'daytona'"
         )
diff --git a/sdks/python/agenta/sdk/engines/running/runners/restricted.py b/sdks/python/agenta/sdk/engines/running/runners/restricted.py
@@ -0,0 +1,182 @@
+import builtins as _py_builtins
+import operator
+from types import ModuleType, SimpleNamespace
+from typing import Any, Dict, Union, Optional
+
+from RestrictedPython import compile_restricted, safe_builtins, PrintCollector
+from RestrictedPython.Eval import default_guarded_getiter, default_guarded_getitem
+from RestrictedPython.Guards import (
+    safer_getattr,
+    guarded_iter_unpack_sequence,
+    guarded_unpack_sequence,
+    full_write_guard,
+)
+
+from agenta.sdk.engines.running.runners.base import CodeRunner
+
+
+# Names copied from the real ``builtins`` module on top of RestrictedPython's
+# ``safe_builtins``, which leaves them out even though evaluator code uses them
+# all the time. Each one only builds or walks plain data; the rationale for
+# exposing them is that sandbox escapes rely on attribute access (guarded by
+# ``safer_getattr``), not on these callables.
+_SAFE_EXTRA_BUILTINS = (
+    # Container constructors.
+    "dict",
+    "list",
+    "set",
+    "frozenset",
+    # Reductions over an iterable.
+    "min",
+    "max",
+    "sum",
+    # Lazy iteration helpers.
+    "enumerate",
+    "map",
+    "filter",
+    "reversed",
+    # Boolean folds.
+    "all",
+    "any",
+)
+
+
+# Allowlist of importable top-level modules: pure-computation stdlib only.
+# Deliberately strict — anything that can touch the host (os, subprocess, sys,
+# pathlib, socket, importlib, io, shutil, ...) or the network (httpx, urllib,
+# requests, ...) is excluded. Operators who need unrestricted execution must opt
+# into the `local` runner; hostile multi-tenant should use `daytona`.
+#
+# Being on this list is not enough by itself: these modules bind the modules
+# *they* import as ordinary public attributes, so `_safe_import` never hands the
+# real module object to evaluator code (see `_public_view`).
+_ALLOWED_IMPORTS = frozenset(
+    {
+        "math",
+        "statistics",
+        "datetime",
+        "json",
+        "re",
+        "random",
+        "string",
+        "typing",
+        "collections",
+        "itertools",
+        "functools",
+    }
+)
+
+
+def _public_view(module, root, seen):
+    """Build a namespace exposing only the sandbox-facing surface of ``module``.
+
+    ``safer_getattr`` rejects underscore-prefixed names only, so a real module
+    object leaks everything it imported under a public name: ``typing.sys`` and
+    ``json.codecs`` are two examples, and ``sys.modules`` / ``codecs.open`` give
+    host and filesystem access. The view therefore copies:
+
+    * public (non-underscore) attributes that are not modules, and
+    * module-valued attributes that belong to the same allowlisted package
+      (``collections.abc``, ``json.decoder``, ...), wrapped recursively.
+
+    Attributes served lazily through a module-level ``__getattr__`` are not in
+    ``vars(module)`` and are therefore not exposed.
+
+    NOTE(review): this narrows the import surface; it does not by itself make
+    in-process execution safe. Callables that evaluate or traverse on the
+    caller's behalf (``typing.get_type_hints`` on string annotations,
+    ``string.Formatter.get_field`` on dotted field names) look like remaining
+    gadgets — confirm and consider dropping them from the view.
+
+    Args:
+        module: Real module object returned by ``__import__``.
+        root: Top-level allowlisted package the import was requested under.
+        seen: Memo of views already built in this import, keyed by module
+            name; breaks cycles between sibling submodules.
+
+    Returns:
+        A ``SimpleNamespace`` holding the filtered attributes.
+    """
+    cached = seen.get(module.__name__)
+    if cached is not None:
+        return cached
+
+    view = seen[module.__name__] = SimpleNamespace()
+    # Snapshot the items: another thread importing a submodule may add
+    # attributes to the module dict while we iterate.
+    for attr, value in list(vars(module).items()):
+        if attr.startswith("_"):
+            continue
+        if isinstance(value, ModuleType):
+            owner = value.__name__
+            if owner != root and not owner.startswith(root + "."):
+                # Foreign module that is only here because of `import x`.
+                continue
+            value = _public_view(value, root, seen)
+        setattr(view, attr, value)
+    return view
+
+
+def _safe_import(name, globals=None, locals=None, fromlist=(), level=0):
+    """Guarded ``__import__`` that only permits the pure-stdlib allowlist.
+
+    Replaces the real ``__import__`` inside the sandbox so user evaluator code
+    cannot import host-reaching modules. Relative imports (``level != 0``) are
+    rejected outright.
+
+    The real import is still performed, but the caller receives a filtered
+    namespace view (see ``_public_view``) instead of the module object, so
+    modules re-exported by an allowlisted module are not reachable.
+
+    Args:
+        name: Dotted module name as written in the ``import`` statement.
+        globals: Passed through to ``__import__``.
+        locals: Passed through to ``__import__``.
+        fromlist: Names requested by ``from name import ...``; passed through.
+        level: Relative-import level; anything other than ``0`` is refused.
+
+    Returns:
+        A namespace view of the module ``__import__`` would have returned (the
+        top-level package for ``import a.b``, the leaf module when ``fromlist``
+        is non-empty).
+
+    Raises:
+        ImportError: If the import is relative or its top-level package is not
+            in ``_ALLOWED_IMPORTS``.
+    """
+    root = name.split(".")[0]
+    if level != 0 or root not in _ALLOWED_IMPORTS:
+        raise ImportError(
+            f"Import of '{name}' is not allowed in the restricted evaluator sandbox. "
+            f"Allowed modules: {', '.join(sorted(_ALLOWED_IMPORTS))}. "
+            "To run unrestricted evaluator code set "
+            "AGENTA_SERVICES_CODE_SANDBOX_RUNNER=local (trusted deployments only)."
+        )
+    module = __import__(name, globals, locals, fromlist, level)
+    # A fresh memo per import keeps views from being shared between evaluations.
+    return _public_view(module, root, {})
+
+
+# Augmented-assignment operators, keyed by the operator string that
+# RestrictedPython passes as the first argument of ``_inplacevar_``.
+_INPLACE_OPERATORS = {
+    "+=": operator.iadd,
+    "-=": operator.isub,
+    "*=": operator.imul,
+    "/=": operator.itruediv,
+    "//=": operator.ifloordiv,
+    "%=": operator.imod,
+    "**=": operator.ipow,
+    "@=": operator.imatmul,
+    "<<=": operator.ilshift,
+    ">>=": operator.irshift,
+    "&=": operator.iand,
+    "|=": operator.ior,
+    "^=": operator.ixor,
+}
+
+
+def _inplacevar(op, target, operand):
+    """Evaluate ``target <op> operand`` for sandboxed augmented assignment.
+
+    RestrictedPython rewrites ``n += 1`` into ``n = _inplacevar_("+=", n, 1)``
+    and expects the policy to supply the hook.
+
+    Raises:
+        ValueError: If ``op`` is not a known augmented-assignment operator.
+    """
+    try:
+        apply = _INPLACE_OPERATORS[op]
+    except KeyError:
+        raise ValueError(f"Unsupported augmented assignment operator: {op}") from None
+    return apply(target, operand)
+
+
+def _build_restricted_globals() -> Dict[str, Any]:
+    """Build the execution globals for RestrictedPython.
+
+    Closes the two holes the previous sandbox had:
+    1. it injected the real ``__import__`` (here: a guarded allowlist import), and
+    2. it never set ``_getattr_`` (here: ``safer_getattr`` blocks dunder/underscore
+       attribute access, which defeats the ``().__class__.__bases__`` gadget escape).
+
+    Also registers the hooks RestrictedPython's transformer emits for everyday
+    statements, so plain evaluator code does not fail with ``NameError``:
+    ``_inplacevar_`` for ``x += 1`` and ``_unpack_sequence_`` for ``a, b = pair``.
+
+    Returns:
+        A fresh globals dict; a new one is built per call so state never leaks
+        between evaluator runs.
+    """
+    allowed_builtins = dict(safe_builtins)
+    for name in _SAFE_EXTRA_BUILTINS:
+        allowed_builtins[name] = getattr(_py_builtins, name)
+    allowed_builtins["__import__"] = _safe_import
+
+    return {
+        "__builtins__": allowed_builtins,
+        "_getattr_": safer_getattr,
+        "_getitem_": default_guarded_getitem,
+        "_getiter_": default_guarded_getiter,
+        "_iter_unpack_sequence_": guarded_iter_unpack_sequence,
+        # Tuple unpacking in plain assignments (`a, b = pair`).
+        "_unpack_sequence_": guarded_unpack_sequence,
+        # Augmented assignment on names (`total += 1`).
+        "_inplacevar_": _inplacevar,
+        "_write_": full_write_guard,
+        # print() goes through PrintCollector (captured, not real stdout).
+        "_print_": PrintCollector,
+    }
+
+
+class RestrictedRunner(CodeRunner):
+    """Default code runner: executes evaluator code in an in-process RestrictedPython sandbox.
+
+    Only the Python runtime is supported. ``run`` itself applies no CPU-time or
+    memory limit to the evaluator code it executes.
+    """
+
+    def run(
+        self,
+        code: str,
+        app_params: Dict[str, Any],
+        inputs: Dict[str, Any],
+        output: Union[dict, str],
+        correct_answer: Any,
+        runtime: Optional[str] = None,
+        templates: Optional[Dict[str, str]] = None,
+        *,
+        version: str = "1",
+        trace: Optional[Dict[str, Any]] = None,
+    ) -> Union[float, None]:
+        """
+        Execute provided Python code in a RestrictedPython sandbox.
+
+        The code must define ``evaluate``. It is called as
+        ``evaluate(inputs, output, trace)`` when ``version == "2"`` and as
+        ``evaluate(app_params, inputs, output, correct_answer)`` otherwise.
+
+        Args:
+            code: The Python code to be executed
+            app_params: The parameters of the app variant (v1 only)
+            inputs: Inputs to be used during code execution
+            output: The output of the app variant after being called
+            correct_answer: The correct answer (or target) for comparison (v1 only)
+            runtime: Runtime environment (only "python" is supported; None means python)
+            templates: Wrapper templates keyed by runtime (unused for in-process runners).
+            version: Evaluator interface version ("1" = legacy, "2" = new)
+            trace: Full trace data (v2 only)
+
+        Returns:
+            The evaluator's result converted to float. The value range is not
+            checked here. Failures are raised, never returned as None.
+
+        Raises:
+            ValueError: If ``runtime`` is anything other than python.
+            SyntaxError: If the code does not compile under RestrictedPython.
+            KeyError: If ``evaluate`` is not defined, or the evaluator code itself
+                raises KeyError (both take the same handler).
+            RuntimeError: For any other failure while executing the code,
+                including a result that cannot be converted to float.
+        """
+        # Normalize runtime: None means python
+        runtime = runtime or "python"
+
+        # The restricted sandbox runs in-process and only supports Python.
+        if runtime != "python":
+            raise ValueError(
+                f"RestrictedRunner only supports 'python' runtime, got: {runtime}. "
+                "Use the Daytona runner for javascript/typescript."
+            )
+
+        try:
+            byte_code = compile_restricted(code, filename="<inline>", mode="exec")
+        except SyntaxError as e:
+            raise SyntaxError(f"Syntax error in provided code: {e}") from e
+
+        environment = _build_restricted_globals()
+
+        try:
+            exec(byte_code, environment)
+
+            fn = environment["evaluate"]
+
+            if version == "2":
+                result = fn(inputs, output, trace)
+            else:
+                result = fn(app_params, inputs, output, correct_answer)
+
+            # Attempt to convert result to float
+            if isinstance(result, (float, int, str)):
+                try:
+                    result = float(result)
+                except ValueError as e:
+                    raise ValueError(
+                        f"Result cannot be converted to float: {e}"
+                    ) from e
+
+            if not isinstance(result, float):
+                raise TypeError(
+                    f"Result is not a float after conversion: {type(result)}"
+                )
+
+            return result
+
+        # NOTE: the ValueError/TypeError raised above fall through to the
+        # catch-all below and reach the caller as RuntimeError.
+        except KeyError as e:
+            raise KeyError(f"Missing expected key in environment: {e}") from e
+
+        except SyntaxError as e:
+            raise SyntaxError(f"Syntax error in provided code: {e}") from e
+
+        except Exception as e:
+            raise RuntimeError(f"Error during code execution: {e}") from e
diff --git a/sdks/python/agenta/sdk/engines/running/sandbox.py b/sdks/python/agenta/sdk/engines/running/sandbox.py
@@ -6,23 +6,6 @@
 _runner = None
 
 
-def is_import_safe(python_code: Text) -> bool:
-    """Checks if the imports in the python code contains a system-level import.
-
-    Args:
-        python_code (str): The Python code to be executed
-
-    Returns:
-        bool - module is secured or not
-    """
-
-    disallowed_imports = ["os", "subprocess", "threading", "multiprocessing"]
-    for import_ in disallowed_imports:
-        if import_ in python_code:
-            return False
-    return True
-
-
 def execute_code_safely(
     app_params: Dict[str, Any],
     inputs: Dict[str, Any],