From 54297678120beb19def0030323826025359af1a9 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 13 May 2026 16:31:19 +0200
Subject: [PATCH 1/6] vllm/sglang: add semantic degradation support

---
 doc/workloads/sglang.rst                      | 34 ++++++++
 doc/workloads/vllm.rst                        | 29 +++++++
 src/cloudai/workloads/common/llm_serving.py   | 79 ++++++++++++++++---
 src/cloudai/workloads/sglang/__init__.py      |  4 +
 .../sglang/report_generation_strategy.py      | 12 ++-
 src/cloudai/workloads/sglang/sglang.py        | 33 ++++++++
 .../sglang/slurm_command_gen_strategy.py      | 19 +++++
 src/cloudai/workloads/vllm/__init__.py        | 14 +++-
 .../vllm/report_generation_strategy.py        |  5 +-
 .../vllm/slurm_command_gen_strategy.py        | 19 +++++
 src/cloudai/workloads/vllm/vllm.py            | 48 +++++++++++
 .../sglang/test_command_gen_strategy_slurm.py | 72 ++++++++++++++++-
 .../test_job_status_retrieval_strategy.py     | 60 +++++++++++++-
 .../sglang/test_report_gen_strategy.py        | 41 +++++++++-
 .../vllm/test_command_gen_strategy_slurm.py   | 49 ++++++++++++
 .../test_job_status_retrieval_strategy.py     | 40 +++++++++-
 .../vllm/test_report_gen_strategy.py          | 38 ++++++++-
 17 files changed, 579 insertions(+), 17 deletions(-)

diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst
index 77121715b..d0561c773 100644
--- a/doc/workloads/sglang.rst
+++ b/doc/workloads/sglang.rst
@@ -28,6 +28,10 @@ Test + Scenario example
    max_concurrency = 16
    num_prompts = 30
 
+   [semantic_eval_cmd_args]
+   module = "sglang.test.run_eval"
+   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+
 
 .. code-block:: toml
    :caption: scenario.toml (scenario with one test)
@@ -68,6 +72,29 @@ Test-in-Scenario example
    num_prompts = 30
 
 
+Semantic Validation
+-------------------
+To run GSM8K semantic validation after the serving benchmark, add ``semantic_eval_cmd_args``. CloudAI reports
+``accuracy`` from the eval output and enforces no accuracy threshold, but fails the run if no accuracy is found.
+
+.. code-block:: toml
+   :caption: test.toml (semantic validation)
+
+   [semantic_eval_cmd_args]
+   module = "sglang.test.run_eval"
+   args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+
+For images that still use the legacy SGLang GSM8K runner, override the module and raw arguments:
+
+.. code-block:: toml
+
+   [semantic_eval_cmd_args]
+   module = "sglang.test.few_shot_gsm8k"
+   args = "--num-questions 200"
+
+The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+
+
 Control number of GPUs
 ----------------------
 The number of GPUs can be controlled using the options below, listed from lowest to highest priority:
@@ -130,6 +157,13 @@ Benchmark Command Arguments
    :members:
    :show-inheritance:
 
+Semantic Eval Command Arguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autopydantic_model:: cloudai.workloads.sglang.sglang.SglangSemanticEvalCmdArgs
+   :members:
+   :show-inheritance:
+
 Test Definition
 ~~~~~~~~~~~~~~~
 
diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst
index 7285c5858..930bcf11b 100644
--- a/doc/workloads/vllm.rst
+++ b/doc/workloads/vllm.rst
@@ -28,6 +28,10 @@ Test and Scenario Examples
    max_concurrency = 16
    num_prompts = 30
 
+   [semantic_eval_cmd_args]
+   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+
 
 .. code-block:: toml
    :caption: scenario.toml (scenario with one test)
@@ -68,6 +72,24 @@ Test-in-Scenario example
    num_prompts = 30
 
 
+Semantic Validation
+-------------------
+To run GSM8K semantic validation after the serving benchmark, add ``semantic_eval_cmd_args``. CloudAI reports
+``accuracy`` from the eval output and enforces no accuracy threshold, but fails the run if no accuracy is found.
+
+.. code-block:: toml
+   :caption: test.toml (semantic validation)
+
+   [semantic_eval_cmd_args]
+   script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+   args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+
+If the runtime image does not contain the eval script, mount a vLLM repository with existing ``git_repos`` support and
+point ``script`` at the mounted path.
+
+The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders.
+
+
 Controlling the Number of GPUs
 -------------------------------
 The number of GPUs can be controlled using the options below, listed from lowest to highest priority:
@@ -154,6 +176,13 @@ Benchmark Command Arguments
    :members:
    :show-inheritance:
 
+Semantic Eval Command Arguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: cloudai.workloads.vllm.vllm.VllmSemanticEvalCmdArgs
+   :members:
+   :show-inheritance:
+
 Test Definition
 ~~~~~~~~~~~~~~~
 
diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py
index a023f2443..f83b3a97a 100644
--- a/src/cloudai/workloads/common/llm_serving.py
+++ b/src/cloudai/workloads/common/llm_serving.py
@@ -217,6 +217,7 @@ class LLMServingReportGenerationStrategy(ReportGenerationStrategy, Generic[TestD
         "throughput",
         "tps-per-user",
         "tps-per-gpu",
+        "accuracy",
     ]
 
     @property
@@ -240,6 +241,10 @@ def all_gpu_ids(self, tdef: TestDefT, gpus_per_node: int | None) -> list[int]:
     def parse_results(self) -> ReportT | None:
         return self.parse_output(self.test_run.output_path / self.result_file_name)
 
+    def parse_semantic_accuracy(self) -> float | None:
+        """Return the semantic validation accuracy; workloads without support keep this None default."""
+        return None
+
     def can_handle_directory(self) -> bool:
         return self.parse_results() is not None
 
@@ -261,6 +266,13 @@ def get_metric(self, metric: str) -> MetricValue:
         if metric not in self.metrics:
             return METRIC_ERROR
 
+        if metric == "accuracy":
+            tdef = cast(TestDefT, self.test_run.test)
+            if getattr(tdef, "semantic_eval_cmd_args", None) is None:
+                return METRIC_ERROR
+            accuracy = self.parse_semantic_accuracy()
+            return accuracy if accuracy is not None else METRIC_ERROR
+
         results = self.parse_results()
         if results is None:
             return METRIC_ERROR
@@ -286,14 +298,31 @@ def generate_report(self) -> None:
         table.add_column("TPOT Mean, ms", justify="right")
         table.add_column("TPOT Median, ms", justify="right")
         table.add_column("TPOT P99, ms", justify="right")
+        accuracy = self.get_metric("accuracy")
+        if accuracy != METRIC_ERROR:
+            table.add_column("Accuracy", justify="right")
+            row = [
+                f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})",
+                f"{results.mean_ttft_ms:.4f}",
+                f"{results.median_ttft_ms:.4f}",
+                f"{results.p99_ttft_ms:.4f}",
+                f"{results.mean_tpot_ms:.4f}",
+                f"{results.median_tpot_ms:.4f}",
+                f"{results.p99_tpot_ms:.4f}",
+                f"{accuracy:.4f}",
+            ]
+        else:
+            row = [
+                f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})",
+                f"{results.mean_ttft_ms:.4f}",
+                f"{results.median_ttft_ms:.4f}",
+                f"{results.p99_ttft_ms:.4f}",
+                f"{results.mean_tpot_ms:.4f}",
+                f"{results.median_tpot_ms:.4f}",
+                f"{results.p99_tpot_ms:.4f}",
+            ]
         table.add_row(
-            f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})",
-            f"{results.mean_ttft_ms:.4f}",
-            f"{results.median_ttft_ms:.4f}",
-            f"{results.p99_ttft_ms:.4f}",
-            f"{results.mean_tpot_ms:.4f}",
-            f"{results.median_tpot_ms:.4f}",
-            f"{results.p99_tpot_ms:.4f}",
+            *row,
         )
         console.print(table)
 
@@ -522,6 +551,11 @@ def bench_log_file(self) -> str:
         """Benchmark log file name."""
         return f"{self.workload_slug}-bench.log"
 
+    @property
+    def semantic_eval_log_file(self) -> str:
+        """Log file name for semantic validation output, built from ``workload_slug``."""
+        return f"{self.workload_slug}-semantic-eval.log"
+
     @property
     def serve_pid_var(self) -> str:
         """Shell variable holding the aggregated serve PID."""
@@ -558,6 +592,33 @@ def get_bench_command(self) -> list[str]:
     def get_helper_command(self) -> list[str]:
         """Return the helper process command for disaggregated mode."""
 
+    def get_semantic_eval_command(self) -> list[str] | None:
+        """Return the semantic validation command parts; this default returns None (no command)."""
+        return None
+
+    def _expand_semantic_eval_args(self, args: str, *, host: str) -> str:
+        """Replace the {model}, {host}, {port} and {output_path} placeholders found in ``args``."""
+        model, port = self.tdef.cmd_args.model, str(self.serve_port)
+        output_dir = str(self.test_run.output_path.absolute())
+        values = {"{model}": model, "{host}": host, "{port}": port, "{output_path}": output_dir}
+
+        # Plain text substitution, applied in the order the placeholders are listed above.
+        for token, text in values.items():
+            args = args.replace(token, text)
+        return args
+
+    def _gen_semantic_eval_block(self, srun_prefix: str) -> str:
+        """Return the script lines that run semantic validation, or an empty string without a command."""
+        semantic_cmd = self.get_semantic_eval_command()
+        if not semantic_cmd:
+            return ""
+        return f"""\
+
+echo "Running semantic validation..."
+{srun_prefix} \\
+    --output={(self.test_run.output_path / self.semantic_eval_log_file).absolute()} \\
+    {" ".join(semantic_cmd)}"""
+
     def _gen_srun_command(self) -> str:
         serve_commands = self.get_serve_commands()
         srun_command = self._gen_llm_serving_srun_command(serve_commands)
@@ -593,7 +654,7 @@ def _gen_aggregated_script(self, serve_cmd: list[str], bench_cmd: str) -> str:
 echo "Running benchmark..."
 {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\
     --output={(self.test_run.output_path / self.bench_log_file).absolute()} \\
-    {bench_cmd}"""
+    {bench_cmd}{self._gen_semantic_eval_block(f"{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1")}"""
 
     def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd: str) -> str:
         prefill_cmd, decode_cmd = serve_commands
@@ -651,4 +712,4 @@ def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd:
 echo "Running benchmark..."
 {prefill_srun_prefix} \\
     --output={(self.test_run.output_path / self.bench_log_file).absolute()} \\
-    {bench_cmd}"""
+    {bench_cmd}{self._gen_semantic_eval_block(prefill_srun_prefix)}"""
diff --git a/src/cloudai/workloads/sglang/__init__.py b/src/cloudai/workloads/sglang/__init__.py
index 7b1ebf778..578eef27b 100644
--- a/src/cloudai/workloads/sglang/__init__.py
+++ b/src/cloudai/workloads/sglang/__init__.py
@@ -18,10 +18,12 @@
 from .sglang import (
     SGLANG_BENCH_JSONL_FILE,
     SGLANG_BENCH_LOG_FILE,
+    SGLANG_SEMANTIC_EVAL_LOG_FILE,
     SglangArgs,
     SglangBenchCmdArgs,
     SGLangBenchReport,
     SglangCmdArgs,
+    SglangSemanticEvalCmdArgs,
     SglangTestDefinition,
 )
 from .slurm_command_gen_strategy import SglangSlurmCommandGenStrategy
@@ -29,11 +31,13 @@
 __all__ = [
     "SGLANG_BENCH_JSONL_FILE",
     "SGLANG_BENCH_LOG_FILE",
+    "SGLANG_SEMANTIC_EVAL_LOG_FILE",
     "SGLangBenchReport",
     "SGLangBenchReportGenerationStrategy",
     "SglangArgs",
     "SglangBenchCmdArgs",
     "SglangCmdArgs",
+    "SglangSemanticEvalCmdArgs",
     "SglangSlurmCommandGenStrategy",
     "SglangTestDefinition",
 ]
diff --git a/src/cloudai/workloads/sglang/report_generation_strategy.py b/src/cloudai/workloads/sglang/report_generation_strategy.py
index b535647bc..00c58d83f 100644
--- a/src/cloudai/workloads/sglang/report_generation_strategy.py
+++ b/src/cloudai/workloads/sglang/report_generation_strategy.py
@@ -18,7 +18,14 @@
 
 from cloudai.workloads.common.llm_serving import LLMServingReportGenerationStrategy, all_gpu_ids
 
-from .sglang import SGLANG_BENCH_JSONL_FILE, SGLangBenchReport, SglangTestDefinition, parse_sglang_bench_output
+from .sglang import (
+    SGLANG_BENCH_JSONL_FILE,
+    SGLANG_SEMANTIC_EVAL_LOG_FILE,
+    SGLangBenchReport,
+    SglangTestDefinition,
+    parse_sglang_bench_output,
+    parse_sglang_semantic_accuracy,
+)
 
 
 class SGLangBenchReportGenerationStrategy(LLMServingReportGenerationStrategy[SglangTestDefinition, SGLangBenchReport]):
@@ -35,5 +42,8 @@ def report_title(self) -> str:
     def parse_output(self, path: Path) -> SGLangBenchReport | None:
         return parse_sglang_bench_output(path)
 
+    def parse_semantic_accuracy(self) -> float | None:
+        return parse_sglang_semantic_accuracy(self.test_run.output_path / SGLANG_SEMANTIC_EVAL_LOG_FILE)
+
     def all_gpu_ids(self, tdef: SglangTestDefinition, gpus_per_node: int | None) -> list[int]:
         return all_gpu_ids(tdef, gpus_per_node)
diff --git a/src/cloudai/workloads/sglang/sglang.py b/src/cloudai/workloads/sglang/sglang.py
index dbb48e92d..41428ccc8 100644
--- a/src/cloudai/workloads/sglang/sglang.py
+++ b/src/cloudai/workloads/sglang/sglang.py
@@ -17,6 +17,7 @@
 from __future__ import annotations
 
 import logging
+import re
 from functools import cache
 from pathlib import Path
 
@@ -34,6 +35,7 @@
 SGLANG_SERVE_LOG_FILE = "sglang-serve.log"
 SGLANG_BENCH_LOG_FILE = "sglang-bench.log"
 SGLANG_BENCH_JSONL_FILE = "sglang-bench.jsonl"
+SGLANG_SEMANTIC_EVAL_LOG_FILE = "sglang-semantic-eval.log"
 
 
 class SglangArgs(LLMServingArgs):
@@ -85,14 +87,29 @@ class SglangBenchCmdArgs(CmdArgs):
     output_details: bool = True
 
 
+class SglangSemanticEvalCmdArgs(CmdArgs):
+    """Python module and raw argument string used to build the SGLang semantic validation command."""
+
+    module: str = "sglang.test.run_eval"
+    args: str = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+
+
 class SglangTestDefinition(LLMServingTestDefinition[SglangCmdArgs]):
     """Test object for SGLang."""
 
     bench_cmd_args: SglangBenchCmdArgs = SglangBenchCmdArgs()
+    semantic_eval_cmd_args: SglangSemanticEvalCmdArgs | None = None
 
     def was_run_successful(self, tr: TestRun) -> JobStatusResult:
         res = parse_sglang_bench_output(tr.output_path / SGLANG_BENCH_JSONL_FILE)
         if res and res.completed > 0:
+            if self.semantic_eval_cmd_args is not None:
+                accuracy = parse_sglang_semantic_accuracy(tr.output_path / SGLANG_SEMANTIC_EVAL_LOG_FILE)
+                if accuracy is None:
+                    return JobStatusResult(
+                        is_successful=False,
+                        error_message=f"SGLang semantic accuracy not found in {tr.output_path}.",
+                    )
             return JobStatusResult(is_successful=True)
 
         return JobStatusResult(
@@ -139,3 +156,19 @@ def parse_sglang_bench_output(jsonl_file: Path) -> SGLangBenchReport | None:
                 continue
 
     return None
+
+
+@cache
+def parse_sglang_semantic_accuracy(log_file: Path) -> float | None:
+    """Return the first ``Score:``/``Accuracy:`` value found in ``log_file``, or None if there is none."""
+    if not log_file.is_file():
+        return None
+
+    score_re = re.compile(r"\b(?:Score|Accuracy):\s*([0-9]*\.?[0-9]+)")
+    with log_file.open(encoding="utf-8", errors="ignore") as log:
+        # Lazily scan the log and stop at the first line that carries a value.
+        hits = (found for found in map(score_re.search, log) if found)
+        first = next(hits, None)
+
+    # group(1) is the numeric text captured after the label.
+    return float(first.group(1)) if first else None
diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
index fd2548b18..f1c7c741c 100644
--- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
@@ -23,6 +23,7 @@
     SglangArgs,
     SglangBenchCmdArgs,
     SglangCmdArgs,
+    SglangSemanticEvalCmdArgs,
     SglangTestDefinition,
 )
 
@@ -123,5 +124,23 @@ def get_bench_command(self) -> list[str]:
 
         return command
 
+    def get_semantic_eval_command(self) -> list[str] | None:
+        """Build the SGLang semantic eval command, or return None when it is not configured."""
+        eval_args: SglangSemanticEvalCmdArgs | None = self.tdef.semantic_eval_cmd_args
+        if eval_args is None:
+            return None
+
+        target_host = self.bench_host
+        # Host and port always come first; the user-supplied args follow as one expanded string.
+        launcher = [
+            "python3",
+            "-m",
+            eval_args.module,
+            f"--host {target_host}",
+            f"--port {self.serve_port}",
+        ]
+        extra = self._expand_semantic_eval_args(eval_args.args, host=target_host)
+        return [*launcher, extra] if extra else launcher
+
     def aggregated_serve_env(self) -> dict[str, str]:
         return {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in self.gpu_ids)}
diff --git a/src/cloudai/workloads/vllm/__init__.py b/src/cloudai/workloads/vllm/__init__.py
index 5dbcf72ce..9d0da4e6f 100644
--- a/src/cloudai/workloads/vllm/__init__.py
+++ b/src/cloudai/workloads/vllm/__init__.py
@@ -16,15 +16,27 @@
 
 from .report_generation_strategy import VLLMBenchReport, VLLMBenchReportGenerationStrategy
 from .slurm_command_gen_strategy import VllmSlurmCommandGenStrategy
-from .vllm import VLLM_BENCH_LOG_FILE, VllmArgs, VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition
+from .vllm import (
+    VLLM_BENCH_LOG_FILE,
+    VLLM_GSM8K_JSON_FILE,
+    VLLM_SEMANTIC_EVAL_LOG_FILE,
+    VllmArgs,
+    VllmBenchCmdArgs,
+    VllmCmdArgs,
+    VllmSemanticEvalCmdArgs,
+    VllmTestDefinition,
+)
 
 __all__ = [
     "VLLM_BENCH_LOG_FILE",
+    "VLLM_GSM8K_JSON_FILE",
+    "VLLM_SEMANTIC_EVAL_LOG_FILE",
     "VLLMBenchReport",
     "VLLMBenchReportGenerationStrategy",
     "VllmArgs",
     "VllmBenchCmdArgs",
     "VllmCmdArgs",
+    "VllmSemanticEvalCmdArgs",
     "VllmSlurmCommandGenStrategy",
     "VllmTestDefinition",
 ]
diff --git a/src/cloudai/workloads/vllm/report_generation_strategy.py b/src/cloudai/workloads/vllm/report_generation_strategy.py
index 0cfbae5e8..fcecabfba 100644
--- a/src/cloudai/workloads/vllm/report_generation_strategy.py
+++ b/src/cloudai/workloads/vllm/report_generation_strategy.py
@@ -21,7 +21,7 @@
 
 from cloudai.workloads.common.llm_serving import LLMServingBenchReport, LLMServingReportGenerationStrategy, all_gpu_ids
 
-from .vllm import VLLM_BENCH_JSON_FILE, VllmTestDefinition
+from .vllm import VLLM_BENCH_JSON_FILE, VllmTestDefinition, parse_vllm_semantic_accuracy
 
 
 class VLLMBenchReport(LLMServingBenchReport):
@@ -62,5 +62,8 @@ def report_title(self) -> str:
     def parse_output(self, path: Path) -> VLLMBenchReport | None:
         return parse_vllm_bench_output(path)
 
+    def parse_semantic_accuracy(self) -> float | None:
+        return parse_vllm_semantic_accuracy(self.test_run.output_path)
+
     def all_gpu_ids(self, tdef: VllmTestDefinition, gpus_per_node: int | None) -> list[int]:
         return all_gpu_ids(tdef, gpus_per_node)
diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
index 1fb40d83d..ea41cddf6 100644
--- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
@@ -22,6 +22,7 @@
 from .vllm import (
     VLLM_BENCH_JSON_FILE,
     VllmCmdArgs,
+    VllmSemanticEvalCmdArgs,
     VllmTestDefinition,
 )
 
@@ -121,3 +122,21 @@ def get_bench_command(self) -> list[str]:
             "--save-result",
             *extras,
         ]
+
+    def get_semantic_eval_command(self) -> list[str] | None:
+        """Build the vLLM semantic eval command, or return None when it is not configured."""
+        eval_args: VllmSemanticEvalCmdArgs | None = self.tdef.semantic_eval_cmd_args
+        if eval_args is None:
+            return None
+
+        host = self.bench_host
+        # Pass the host as a URL: keep an existing scheme, otherwise default to plain http.
+        http_host = host if host.startswith(("http://", "https://")) else f"http://{host}"
+        command = [
+            "python3",
+            eval_args.script,
+            f"--host {http_host}",
+            f"--port {self.serve_port}",
+        ]
+        args = self._expand_semantic_eval_args(eval_args.args, host=http_host)
+        return [*command, args] if args else command
diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py
index 9f8933b8c..7eb829971 100644
--- a/src/cloudai/workloads/vllm/vllm.py
+++ b/src/cloudai/workloads/vllm/vllm.py
@@ -16,7 +16,11 @@
 
 from __future__ import annotations
 
+import json
 import logging
+import re
+from functools import cache
+from pathlib import Path
 from typing import Optional, cast
 
 from pydantic import ConfigDict, Field
@@ -35,6 +39,8 @@
 VLLM_SERVE_LOG_FILE = "vllm-serve.log"
 VLLM_BENCH_LOG_FILE = "vllm-bench.log"
 VLLM_BENCH_JSON_FILE = "vllm-bench.json"
+VLLM_GSM8K_JSON_FILE = "vllm-gsm8k.json"
+VLLM_SEMANTIC_EVAL_LOG_FILE = "vllm-semantic-eval.log"
 
 
 class VllmArgs(LLMServingArgs):
@@ -81,10 +87,18 @@ class VllmBenchCmdArgs(CmdArgs):
     num_prompts: int = 30
 
 
+class VllmSemanticEvalCmdArgs(CmdArgs):
+    """Eval script path and raw argument string used to build the vLLM semantic validation command."""
+
+    script: str = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py"
+    args: str = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+
+
 class VllmTestDefinition(LLMServingTestDefinition[VllmCmdArgs]):
     """Test object for vLLM."""
 
     bench_cmd_args: VllmBenchCmdArgs = VllmBenchCmdArgs()
+    semantic_eval_cmd_args: VllmSemanticEvalCmdArgs | None = None
     proxy_script_repo: GitRepo | None = None
 
     @property
@@ -165,6 +179,13 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
                     try:
                         num_successful_requests = int(line.split()[2])
                         if num_successful_requests > 0:
+                            if self.semantic_eval_cmd_args is not None:
+                                accuracy = parse_vllm_semantic_accuracy(tr.output_path)
+                                if accuracy is None:
+                                    return JobStatusResult(
+                                        is_successful=False,
+                                        error_message=f"vLLM semantic accuracy not found in {tr.output_path}.",
+                                    )
                             return JobStatusResult(is_successful=True)
                     except Exception as e:
                         logging.debug(f"Error parsing number of successful requests: {e}")
@@ -172,3 +193,30 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
         return JobStatusResult(
             is_successful=False, error_message=f"vLLM bench log does not contain benchmark result in {tr.output_path}."
         )
+
+
+@cache
+def parse_vllm_semantic_accuracy(output_path: Path) -> float | None:
+    """Return the vLLM semantic accuracy from the GSM8K JSON results, falling back to the eval log."""
+    json_path = output_path / VLLM_GSM8K_JSON_FILE
+    if json_path.is_file():
+        try:
+            data = json.loads(json_path.read_text(encoding="utf-8"))
+            accuracy = data.get("accuracy") if isinstance(data, dict) else None
+            if isinstance(accuracy, (int, float)):
+                return float(accuracy)
+        except Exception as e:
+            logging.debug(f"Error parsing vLLM semantic JSON output: {e}")
+    # Reached when the JSON file is absent, unreadable, or has no numeric "accuracy": try the eval log.
+    log_path = output_path / VLLM_SEMANTIC_EVAL_LOG_FILE
+    if not log_path.is_file():
+        return None
+
+    pattern = re.compile(r"\bAccuracy:\s*([0-9]*\.?[0-9]+)")
+    with log_path.open(encoding="utf-8", errors="ignore") as f:
+        for line in f:
+            match = pattern.search(line)
+            if match:
+                return float(match.group(1))
+
+    return None
diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py
index 18c9d739b..84cdab162 100644
--- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py
@@ -15,12 +15,19 @@
 # limitations under the License.
 
 from pathlib import Path
+from typing import cast
 
 import pytest
 
 from cloudai.core import TestRun
 from cloudai.systems.slurm import SlurmSystem
-from cloudai.workloads.sglang import SglangArgs, SglangCmdArgs, SglangSlurmCommandGenStrategy, SglangTestDefinition
+from cloudai.workloads.sglang import (
+    SglangArgs,
+    SglangCmdArgs,
+    SglangSemanticEvalCmdArgs,
+    SglangSlurmCommandGenStrategy,
+    SglangTestDefinition,
+)
 from cloudai.workloads.sglang.sglang import SGLANG_BENCH_JSONL_FILE, SGLANG_BENCH_LOG_FILE
 
 
@@ -136,6 +143,69 @@ def test_get_sglang_bench_command_writes_jsonl(
     assert output_file_args[0].endswith(f"/{SGLANG_BENCH_JSONL_FILE}")
 
 
+def test_get_sglang_semantic_eval_command_defaults(sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy):
+    sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test)
+    sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs()
+
+    command = sglang_cmd_gen_strategy.get_semantic_eval_command()
+
+    assert command == [
+        "python3",
+        "-m",
+        "sglang.test.run_eval",
+        "--host ${NODE}",
+        "--port 8000",
+        "--eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B",
+    ]
+
+
+def test_get_sglang_semantic_eval_command_supports_custom_module_and_args(
+    sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy,
+):
+    sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test)
+    sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs(
+        module="sglang.test.few_shot_gsm8k",
+        args="--num-questions 200 --data-path {output_path}/gsm8k.jsonl --seen {host}:{port}",
+    )
+
+    command = sglang_cmd_gen_strategy.get_semantic_eval_command()
+
+    assert command is not None
+    assert command[2] == "sglang.test.few_shot_gsm8k"
+    assert command[-1] == (
+        f"--num-questions 200 --data-path {sglang_cmd_gen_strategy.test_run.output_path.absolute()}/gsm8k.jsonl "
+        "--seen ${NODE}:8000"
+    )
+
+
+def test_gen_srun_command_contains_sglang_semantic_eval(sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy):
+    sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test)
+    sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs()
+
+    srun_command = sglang_cmd_gen_strategy._gen_srun_command()
+
+    assert "Running benchmark..." in srun_command
+    assert "Running semantic validation..." in srun_command
+    assert (
+        "--output=" + str((sglang_cmd_gen_strategy.test_run.output_path / "sglang-semantic-eval.log").absolute())
+        in srun_command
+    )
+    assert "python3 -m sglang.test.run_eval --host ${NODE} --port 8000" in srun_command
+
+
+def test_gen_srun_command_contains_sglang_semantic_eval_in_disagg(
+    sglang_disagg_tr: TestRun, slurm_system: SlurmSystem
+) -> None:
+    sglang_test = cast(SglangTestDefinition, sglang_disagg_tr.test)
+    sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs()
+    strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr)
+
+    srun_command = strategy._gen_srun_command()
+
+    assert "Running semantic validation..." in srun_command
+    assert "python3 -m sglang.test.run_eval --host ${PREFILL_NODE} --port 8000" in srun_command
+
+
 def test_gen_srun_command_contains_expected_flow(sglang_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None:
     strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr)
 
diff --git a/tests/workloads/sglang/test_job_status_retrieval_strategy.py b/tests/workloads/sglang/test_job_status_retrieval_strategy.py
index 71335711b..099e9f329 100644
--- a/tests/workloads/sglang/test_job_status_retrieval_strategy.py
+++ b/tests/workloads/sglang/test_job_status_retrieval_strategy.py
@@ -17,7 +17,13 @@
 import json
 
 from cloudai.core import TestRun
-from cloudai.workloads.sglang import SGLANG_BENCH_JSONL_FILE, SglangCmdArgs, SglangTestDefinition
+from cloudai.workloads.sglang import (
+    SGLANG_BENCH_JSONL_FILE,
+    SGLANG_SEMANTIC_EVAL_LOG_FILE,
+    SglangCmdArgs,
+    SglangSemanticEvalCmdArgs,
+    SglangTestDefinition,
+)
 
 
 class TestSglangSuccessCheck:
@@ -89,3 +95,55 @@ def test_failed_job_no_successful_requests(self, base_tr: TestRun) -> None:
         assert (
             result.error_message == f"SGLang bench jsonl does not contain successful requests in {base_tr.output_path}."
         )
+
+    def test_semantic_eval_successful_with_low_accuracy(self, base_tr: TestRun) -> None:
+        self.sglang_tdef.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs()
+        base_tr.output_path.mkdir(parents=True, exist_ok=True)
+        (base_tr.output_path / SGLANG_BENCH_JSONL_FILE).write_text(
+            json.dumps(
+                {
+                    "completed": 3,
+                    "num_prompts": 3,
+                    "request_throughput": 1.0,
+                    "max_concurrency": 16,
+                    "mean_ttft_ms": 1.0,
+                    "median_ttft_ms": 1.0,
+                    "p99_ttft_ms": 1.0,
+                    "mean_tpot_ms": 1.0,
+                    "median_tpot_ms": 1.0,
+                    "p99_tpot_ms": 1.0,
+                }
+            )
+            + "\n"
+        )
+        (base_tr.output_path / SGLANG_SEMANTIC_EVAL_LOG_FILE).write_text("Score: 0.000\n")
+
+        result = self.sglang_tdef.was_run_successful(base_tr)
+
+        assert result.is_successful
+
+    def test_semantic_eval_requires_parseable_accuracy(self, base_tr: TestRun) -> None:
+        self.sglang_tdef.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs()
+        base_tr.output_path.mkdir(parents=True, exist_ok=True)
+        (base_tr.output_path / SGLANG_BENCH_JSONL_FILE).write_text(
+            json.dumps(
+                {
+                    "completed": 3,
+                    "num_prompts": 3,
+                    "request_throughput": 1.0,
+                    "max_concurrency": 16,
+                    "mean_ttft_ms": 1.0,
+                    "median_ttft_ms": 1.0,
+                    "p99_ttft_ms": 1.0,
+                    "mean_tpot_ms": 1.0,
+                    "median_tpot_ms": 1.0,
+                    "p99_tpot_ms": 1.0,
+                }
+            )
+            + "\n"
+        )
+
+        result = self.sglang_tdef.was_run_successful(base_tr)
+
+        assert not result.is_successful
+        assert result.error_message == f"SGLang semantic accuracy not found in {base_tr.output_path}."
diff --git a/tests/workloads/sglang/test_report_gen_strategy.py b/tests/workloads/sglang/test_report_gen_strategy.py
index d1597f8c6..86d5ed9c3 100644
--- a/tests/workloads/sglang/test_report_gen_strategy.py
+++ b/tests/workloads/sglang/test_report_gen_strategy.py
@@ -16,6 +16,7 @@
 
 import json
 from pathlib import Path
+from typing import cast
 
 import pytest
 
@@ -26,9 +27,15 @@
     SGLangBenchReport,
     SGLangBenchReportGenerationStrategy,
     SglangCmdArgs,
+    SglangSemanticEvalCmdArgs,
     SglangTestDefinition,
 )
-from cloudai.workloads.sglang.sglang import SGLANG_BENCH_JSONL_FILE, parse_sglang_bench_output
+from cloudai.workloads.sglang.sglang import (
+    SGLANG_BENCH_JSONL_FILE,
+    SGLANG_SEMANTIC_EVAL_LOG_FILE,
+    parse_sglang_bench_output,
+    parse_sglang_semantic_accuracy,
+)
 
 BENCH_RECORD = {
     "num_prompts": 30,
@@ -131,6 +138,38 @@ def test_sglang_tps_per_gpu(slurm_system: SlurmSystem, sglang_tr: TestRun) -> No
     assert metric == 600.0
 
 
+def test_sglang_accuracy_metric(slurm_system: SlurmSystem, sglang_tr: TestRun) -> None:
+    sglang_test = cast(SglangTestDefinition, sglang_tr.test)
+    sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs()
+    (sglang_tr.output_path / SGLANG_SEMANTIC_EVAL_LOG_FILE).write_text("Score: 0.945\n", encoding="utf-8")
+
+    strategy = SGLangBenchReportGenerationStrategy(slurm_system, sglang_tr)
+
+    assert strategy.get_metric("accuracy") == 0.945
+
+
+def test_parse_sglang_semantic_accuracy_from_score(tmp_path: Path) -> None:
+    log_path = tmp_path / "score.log"
+    log_path.write_text("Total latency: 1.000 s\nScore: 0.812\n", encoding="utf-8")
+
+    assert parse_sglang_semantic_accuracy(log_path) == 0.812
+
+
+def test_parse_sglang_semantic_accuracy_from_legacy_accuracy(tmp_path: Path) -> None:
+    log_path = tmp_path / "accuracy.log"
+    log_path.write_text("Accuracy: 0.945\nInvalid: 0.000\n", encoding="utf-8")
+
+    assert parse_sglang_semantic_accuracy(log_path) == 0.945
+
+
+def test_parse_sglang_semantic_accuracy_missing_or_invalid(tmp_path: Path) -> None:
+    log_path = tmp_path / "invalid.log"
+    log_path.write_text("no score here\n", encoding="utf-8")
+
+    assert parse_sglang_semantic_accuracy(tmp_path / "missing.log") is None
+    assert parse_sglang_semantic_accuracy(log_path) is None
+
+
 def test_sglang_tps_per_user__concurrency_is_zero() -> None:
     bench_report = SGLangBenchReport.model_validate({**BENCH_RECORD, "max_concurrency": 0})
     assert bench_report.tps_per_user is None
diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py
index 0585e9b91..c7163b689 100644
--- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py
@@ -25,6 +25,7 @@
     VllmArgs,
     VllmBenchCmdArgs,
     VllmCmdArgs,
+    VllmSemanticEvalCmdArgs,
     VllmSlurmCommandGenStrategy,
     VllmTestDefinition,
 )
@@ -184,6 +185,54 @@ def test_get_vllm_bench_command_with_extra_args(
         assert "--extra-3 3" in cmd
 
 
+class TestVllmSemanticEvalCommand:
+    def test_get_vllm_semantic_eval_command_defaults(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None:
+        vllm_test = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test)
+        vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs()
+
+        command = vllm_cmd_gen_strategy.get_semantic_eval_command()
+
+        assert command == [
+            "python3",
+            "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py",
+            "--host http://${NODE}",
+            "--port 8000",
+            "--num-questions 200 --save-results "
+            f"{vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json",
+        ]
+
+    def test_gen_srun_command_contains_vllm_semantic_eval(
+        self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy
+    ) -> None:
+        vllm_test = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test)
+        vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs()
+
+        srun_command = vllm_cmd_gen_strategy._gen_srun_command()
+
+        assert "Running benchmark..." in srun_command
+        assert "Running semantic validation..." in srun_command
+        assert (
+            "--output=" + str((vllm_cmd_gen_strategy.test_run.output_path / "vllm-semantic-eval.log").absolute())
+            in srun_command
+        )
+        assert "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py --host http://${NODE} --port 8000" in srun_command
+
+    def test_gen_srun_command_contains_vllm_semantic_eval_in_disagg(
+        self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem
+    ) -> None:
+        vllm_disagg_test = cast(VllmTestDefinition, vllm_disagg_tr.test)
+        vllm_disagg_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs()
+        strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr)
+
+        srun_command = strategy._gen_srun_command()
+
+        assert "Running semantic validation..." in srun_command
+        assert (
+            "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py --host http://${PREFILL_NODE} --port 8000"
+            in srun_command
+        )
+
+
 class TestVllmAggregatedMode:
     """Tests for vLLM non-disaggregated mode with 1 GPU."""
 
diff --git a/tests/workloads/vllm/test_job_status_retrieval_strategy.py b/tests/workloads/vllm/test_job_status_retrieval_strategy.py
index ec7678199..830e64ce9 100644
--- a/tests/workloads/vllm/test_job_status_retrieval_strategy.py
+++ b/tests/workloads/vllm/test_job_status_retrieval_strategy.py
@@ -15,7 +15,13 @@
 # limitations under the License.
 
 from cloudai.core import TestRun
-from cloudai.workloads.vllm import VLLM_BENCH_LOG_FILE, VllmCmdArgs, VllmTestDefinition
+from cloudai.workloads.vllm import (
+    VLLM_BENCH_LOG_FILE,
+    VLLM_GSM8K_JSON_FILE,
+    VllmCmdArgs,
+    VllmSemanticEvalCmdArgs,
+    VllmTestDefinition,
+)
 
 
 class TestVllmSuccessCheck:
@@ -72,3 +78,35 @@ def test_no_successful_requests(self, base_tr: TestRun) -> None:
         result = self.vllm_tdef.was_run_successful(base_tr)
         assert not result.is_successful
         assert result.error_message == f"vLLM bench log does not contain benchmark result in {base_tr.output_path}."
+
+    def test_semantic_eval_successful_with_low_accuracy(self, base_tr: TestRun) -> None:
+        self.vllm_tdef.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs()
+        base_tr.output_path.mkdir(parents=True, exist_ok=True)
+        log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE
+        log_file.write_text(
+            """
+============ Serving Benchmark Result ============
+Successful requests:                     1
+"""
+        )
+        (base_tr.output_path / VLLM_GSM8K_JSON_FILE).write_text('{"accuracy": 0.0}')
+
+        result = self.vllm_tdef.was_run_successful(base_tr)
+
+        assert result.is_successful
+
+    def test_semantic_eval_requires_parseable_accuracy(self, base_tr: TestRun) -> None:
+        self.vllm_tdef.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs()
+        base_tr.output_path.mkdir(parents=True, exist_ok=True)
+        log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE
+        log_file.write_text(
+            """
+============ Serving Benchmark Result ============
+Successful requests:                     1
+"""
+        )
+
+        result = self.vllm_tdef.was_run_successful(base_tr)
+
+        assert not result.is_successful
+        assert result.error_message == f"vLLM semantic accuracy not found in {base_tr.output_path}."
diff --git a/tests/workloads/vllm/test_report_gen_strategy.py b/tests/workloads/vllm/test_report_gen_strategy.py
index c1ae400cf..7e1821067 100644
--- a/tests/workloads/vllm/test_report_gen_strategy.py
+++ b/tests/workloads/vllm/test_report_gen_strategy.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 
 from pathlib import Path
+from typing import cast
 
 import pytest
 
@@ -25,10 +26,16 @@
     VLLMBenchReport,
     VLLMBenchReportGenerationStrategy,
     VllmCmdArgs,
+    VllmSemanticEvalCmdArgs,
     VllmTestDefinition,
 )
 from cloudai.workloads.vllm.report_generation_strategy import parse_vllm_bench_output
-from cloudai.workloads.vllm.vllm import VLLM_BENCH_JSON_FILE
+from cloudai.workloads.vllm.vllm import (
+    VLLM_BENCH_JSON_FILE,
+    VLLM_GSM8K_JSON_FILE,
+    VLLM_SEMANTIC_EVAL_LOG_FILE,
+    parse_vllm_semantic_accuracy,
+)
 
 BENCH_DATA = VLLMBenchReport(
     num_prompts=30,
@@ -115,3 +122,32 @@ def test_vllm_tps_per_gpu(slurm_system: SlurmSystem, vllm_tr: TestRun, ngpus: in
     metric = strategy.get_metric("tps-per-gpu")
 
     assert metric == BENCH_DATA.throughput / ngpus
+
+
+def test_vllm_accuracy_metric(slurm_system: SlurmSystem, vllm_tr: TestRun) -> None:
+    vllm_test = cast(VllmTestDefinition, vllm_tr.test)
+    vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs()
+    (vllm_tr.output_path / VLLM_GSM8K_JSON_FILE).write_text('{"accuracy": 0.875}', encoding="utf-8")
+
+    strategy = VLLMBenchReportGenerationStrategy(slurm_system, vllm_tr)
+
+    assert strategy.get_metric("accuracy") == 0.875
+
+
+def test_parse_vllm_semantic_accuracy_from_json(tmp_path: Path) -> None:
+    (tmp_path / VLLM_GSM8K_JSON_FILE).write_text('{"accuracy": 0.91}', encoding="utf-8")
+
+    assert parse_vllm_semantic_accuracy(tmp_path) == 0.91
+
+
+def test_parse_vllm_semantic_accuracy_falls_back_to_log(tmp_path: Path) -> None:
+    (tmp_path / VLLM_GSM8K_JSON_FILE).write_text("{invalid", encoding="utf-8")
+    (tmp_path / VLLM_SEMANTIC_EVAL_LOG_FILE).write_text("Accuracy: 0.742\n", encoding="utf-8")
+
+    assert parse_vllm_semantic_accuracy(tmp_path) == 0.742
+
+
+def test_parse_vllm_semantic_accuracy_missing_or_invalid(tmp_path: Path) -> None:
+    (tmp_path / VLLM_SEMANTIC_EVAL_LOG_FILE).write_text("no accuracy here\n", encoding="utf-8")
+
+    assert parse_vllm_semantic_accuracy(tmp_path) is None

From 7f1b7a7c22ccc6385a5e95d89a0d966a4c0fd81d Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 20 May 2026 22:05:38 +0200
Subject: [PATCH 2/6] update readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 65f42b4dc..a96814570 100644
--- a/README.md
+++ b/README.md
@@ -43,10 +43,12 @@ These schemas enable CloudAI to be flexible and compatible with different system
 |NIXL benchmark|✅|❌|❌|❌|
 |NIXL kvbench|✅|❌|❌|❌|
 |NIXL CTPerf|✅|❌|❌|❌|
+|SGLang|✅|❌|❌|❌|
 |Sleep|✅|✅|❌|✅|
 |SlurmContainer|✅|❌|❌|❌|
 |Triton Inference|✅|❌|❌|❌|
 |UCC|✅|❌|❌|❌|
+|vLLM|✅|❌|❌|❌|
 
 Note: Deprecated means that a workload support exists, but we are not maintaining it actively anymore and newer configurations might not work.
 

From b0e2425f4e7c3e4eaf83050f7bf9d9d95853ef3d Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 20 May 2026 22:43:43 +0200
Subject: [PATCH 3/6] vllm/sglang configs in the repo

---
 conf/experimental/sglang/test/sglang.toml     | 30 +++++++
 .../sglang/test_scenario/sglang.toml          | 81 +++++++++++++++++++
 conf/experimental/vllm/test/vllm.toml         | 35 ++++++++
 .../experimental/vllm/test_scenario/vllm.toml | 51 ++++++++++++
 src/cloudai/workloads/common/llm_serving.py   | 37 ++++-----
 5 files changed, 211 insertions(+), 23 deletions(-)
 create mode 100644 conf/experimental/sglang/test/sglang.toml
 create mode 100644 conf/experimental/sglang/test_scenario/sglang.toml
 create mode 100644 conf/experimental/vllm/test/vllm.toml
 create mode 100644 conf/experimental/vllm/test_scenario/vllm.toml

diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml
new file mode 100644
index 000000000..e6d2c09b4
--- /dev/null
+++ b/conf/experimental/sglang/test/sglang.toml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "sglang"
+description = "SGLang test"
+test_template_name = "sglang"
+
+[cmd_args]
+docker_image_url = "lmsysorg/sglang:dev-cu13"
+
+[semantic_eval_cmd_args]
+module = "sglang.test.run_eval"
+args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}"
+
+[extra_env_vars]
+UCX_NET_DEVICES = "all"
+UCX_TLS = "^gdr_copy,cuda_ipc"
diff --git a/conf/experimental/sglang/test_scenario/sglang.toml b/conf/experimental/sglang/test_scenario/sglang.toml
new file mode 100644
index 000000000..be1bb7bb5
--- /dev/null
+++ b/conf/experimental/sglang/test_scenario/sglang.toml
@@ -0,0 +1,81 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "sglang"
+
+[[Tests]]
+id = "sglang.agg.2nodes"
+test_name = "sglang"
+num_nodes = 2
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.decode]
+  mem_fraction_static = 0.75
+
+[[Tests]]
+id = "sglang.agg.1node"
+test_name = "sglang"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.decode]
+  mem_fraction_static = 0.75
+
+[[Tests]]
+id = "sglang.disagg.sync"
+test_name = "sglang"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.prefill]
+  gpu_ids = "0,1"
+  tensor_parallel_size = 4
+  mem_fraction_static = 0.75
+
+  [Tests.cmd_args.decode]
+  gpu_ids = "2,3"
+  tensor_parallel_size = 4
+  mem_fraction_static = 0.75
+
+[[Tests]]
+id = "sglang.disagg.async"
+test_name = "sglang"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.prefill]
+  gpu_ids = "0,1"
+  tensor_parallel_size = 4
+  mem_fraction_static = 0.75
+
+  [Tests.cmd_args.decode]
+  gpu_ids = "2,3"
+  tensor_parallel_size = 2
+  mem_fraction_static = 0.75
+
+[[Tests]]
+id = "sglang.disagg.2nodes"
+test_name = "sglang"
+num_nodes = 2
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.prefill]
+  tensor_parallel_size = 8
+  mem_fraction_static = 0.75
+
+  [Tests.cmd_args.decode]
+  tensor_parallel_size = 8
+  mem_fraction_static = 0.75
diff --git a/conf/experimental/vllm/test/vllm.toml b/conf/experimental/vllm/test/vllm.toml
new file mode 100644
index 000000000..891023201
--- /dev/null
+++ b/conf/experimental/vllm/test/vllm.toml
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "vllm"
+description = "vLLM test"
+test_template_name = "vllm"
+
+[[git_repos]]
+url = "https://github.com/vllm-project/vllm.git"
+commit = "main"
+mount_as = "/vllm_repo"
+
+[cmd_args]
+docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1"
+
+[semantic_eval_cmd_args]
+script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py"
+args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json"
+
+[extra_env_vars]
+UCX_NET_DEVICES = "all"
+UCX_TLS = "^gdr_copy,cuda_ipc"
diff --git a/conf/experimental/vllm/test_scenario/vllm.toml b/conf/experimental/vllm/test_scenario/vllm.toml
new file mode 100644
index 000000000..ecacf4276
--- /dev/null
+++ b/conf/experimental/vllm/test_scenario/vllm.toml
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "vllm"
+
+[[Tests]]
+id = "vllm.disagg.sync"
+test_name = "vllm"
+num_nodes = 2
+time_limit = "00:30:00"
+
+  [Tests.cmd_args.prefill]
+  enforce_eager = ""
+  tensor_parallel_size = 2
+  max_num_batched_tokens = 1024
+
+  [Tests.cmd_args.decode]
+  enforce_eager = ""
+  tensor_parallel_size = 2
+  max_num_batched_tokens = 1024
+
+[[Tests]]
+id = "vllm.disagg.async"
+test_name = "vllm"
+num_nodes = 1
+time_limit = "00:10:00"
+
+  [Tests.cmd_args.prefill]
+  gpu_ids = "0,1"
+  enforce_eager = ""
+  tensor_parallel_size = 4
+  max_num_batched_tokens = 1024
+
+  [Tests.cmd_args.decode]
+  gpu_ids = "2,3"
+  enforce_eager = ""
+  tensor_parallel_size = 2
+  max_num_batched_tokens = 1024
diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py
index 4ab2718a6..75e0dca5e 100644
--- a/src/cloudai/workloads/common/llm_serving.py
+++ b/src/cloudai/workloads/common/llm_serving.py
@@ -313,32 +313,23 @@ def generate_report(self) -> None:
         table.add_column("TPOT Mean, ms", justify="right")
         table.add_column("TPOT Median, ms", justify="right")
         table.add_column("TPOT P99, ms", justify="right")
+
+        row = [
+            f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})",
+            f"{results.mean_ttft_ms:.4f}",
+            f"{results.median_ttft_ms:.4f}",
+            f"{results.p99_ttft_ms:.4f}",
+            f"{results.mean_tpot_ms:.4f}",
+            f"{results.median_tpot_ms:.4f}",
+            f"{results.p99_tpot_ms:.4f}",
+        ]
+
         accuracy = self.get_metric("accuracy")
         if accuracy != METRIC_ERROR:
             table.add_column("Accuracy", justify="right")
-            row = [
-                f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})",
-                f"{results.mean_ttft_ms:.4f}",
-                f"{results.median_ttft_ms:.4f}",
-                f"{results.p99_ttft_ms:.4f}",
-                f"{results.mean_tpot_ms:.4f}",
-                f"{results.median_tpot_ms:.4f}",
-                f"{results.p99_tpot_ms:.4f}",
-                f"{accuracy:.4f}",
-            ]
-        else:
-            row = [
-                f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})",
-                f"{results.mean_ttft_ms:.4f}",
-                f"{results.median_ttft_ms:.4f}",
-                f"{results.p99_ttft_ms:.4f}",
-                f"{results.mean_tpot_ms:.4f}",
-                f"{results.median_tpot_ms:.4f}",
-                f"{results.p99_tpot_ms:.4f}",
-            ]
-        table.add_row(
-            *row,
-        )
+            row.append(f"{accuracy:.4f}")
+
+        table.add_row(*row)
         console.print(table)
 
 

From b32782affac9362f201ab6711e818562757a573c Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Wed, 20 May 2026 23:39:13 +0200
Subject: [PATCH 4/6] adjust configs

---
 conf/experimental/sglang/test/sglang.toml          |  2 +-
 conf/experimental/sglang/test_scenario/sglang.toml | 10 +++++-----
 conf/experimental/vllm/test_scenario/vllm.toml     |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml
index e6d2c09b4..ec420d0fd 100644
--- a/conf/experimental/sglang/test/sglang.toml
+++ b/conf/experimental/sglang/test/sglang.toml
@@ -19,7 +19,7 @@ description = "SGLang test"
 test_template_name = "sglang"
 
 [cmd_args]
-docker_image_url = "lmsysorg/sglang:dev-cu13"
+docker_image_url = "nvcr.io#nvidia/ai-dynamo/sglang-runtime:1.1.1"
 
 [semantic_eval_cmd_args]
 module = "sglang.test.run_eval"
diff --git a/conf/experimental/sglang/test_scenario/sglang.toml b/conf/experimental/sglang/test_scenario/sglang.toml
index be1bb7bb5..b6f96f4e2 100644
--- a/conf/experimental/sglang/test_scenario/sglang.toml
+++ b/conf/experimental/sglang/test_scenario/sglang.toml
@@ -42,12 +42,12 @@ time_limit = "00:10:00"
 
   [Tests.cmd_args.prefill]
   gpu_ids = "0,1"
-  tensor_parallel_size = 4
+  tensor_parallel_size = 2
   mem_fraction_static = 0.75
 
   [Tests.cmd_args.decode]
   gpu_ids = "2,3"
-  tensor_parallel_size = 4
+  tensor_parallel_size = 2
   mem_fraction_static = 0.75
 
 [[Tests]]
@@ -58,7 +58,7 @@ time_limit = "00:10:00"
 
   [Tests.cmd_args.prefill]
   gpu_ids = "0,1"
-  tensor_parallel_size = 4
+  tensor_parallel_size = 2
   mem_fraction_static = 0.75
 
   [Tests.cmd_args.decode]
@@ -73,9 +73,9 @@ num_nodes = 2
 time_limit = "00:10:00"
 
   [Tests.cmd_args.prefill]
-  tensor_parallel_size = 8
+  tensor_parallel_size = 4
   mem_fraction_static = 0.75
 
   [Tests.cmd_args.decode]
-  tensor_parallel_size = 8
+  tensor_parallel_size = 4
   mem_fraction_static = 0.75
diff --git a/conf/experimental/vllm/test_scenario/vllm.toml b/conf/experimental/vllm/test_scenario/vllm.toml
index ecacf4276..8e1207221 100644
--- a/conf/experimental/vllm/test_scenario/vllm.toml
+++ b/conf/experimental/vllm/test_scenario/vllm.toml
@@ -41,7 +41,7 @@ time_limit = "00:10:00"
   [Tests.cmd_args.prefill]
   gpu_ids = "0,1"
   enforce_eager = ""
-  tensor_parallel_size = 4
+  tensor_parallel_size = 1
   max_num_batched_tokens = 1024
 
   [Tests.cmd_args.decode]

From a73815863f0eefaf32c7cd1d5f12cf11e712820d Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Thu, 21 May 2026 11:06:48 +0200
Subject: [PATCH 5/6] support custom bash for semantic eval

---
 src/cloudai/workloads/common/llm_serving.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py
index 75e0dca5e..87ad7b3a3 100644
--- a/src/cloudai/workloads/common/llm_serving.py
+++ b/src/cloudai/workloads/common/llm_serving.py
@@ -634,13 +634,14 @@ def _gen_semantic_eval_block(self, srun_prefix: str) -> str:
         semantic_cmd = self.get_semantic_eval_command()
         if not semantic_cmd:
             return ""
+        semantic_cmd_full = self._with_custom_bash(" ".join(semantic_cmd))
 
         return f"""\
 
 echo "Running semantic validation..."
 {srun_prefix} \\
     --output={(self.test_run.output_path / self.semantic_eval_log_file).absolute()} \\
-    {" ".join(semantic_cmd)}"""
+    {semantic_cmd_full}"""
 
     def _gen_srun_command(self) -> str:
         serve_commands = self.get_serve_commands()

From 7cb8c64b8a5c2baf4809d23023b8aa6cb7b4850e Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Thu, 21 May 2026 11:16:54 +0200
Subject: [PATCH 6/6] update sglang image

---
 conf/experimental/sglang/test/sglang.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml
index ec420d0fd..e6d2c09b4 100644
--- a/conf/experimental/sglang/test/sglang.toml
+++ b/conf/experimental/sglang/test/sglang.toml
@@ -19,7 +19,7 @@ description = "SGLang test"
 test_template_name = "sglang"
 
 [cmd_args]
-docker_image_url = "nvcr.io#nvidia/ai-dynamo/sglang-runtime:1.1.1"
+docker_image_url = "lmsysorg/sglang:dev-cu13"
 
 [semantic_eval_cmd_args]
 module = "sglang.test.run_eval"