From 54297678120beb19def0030323826025359af1a9 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 13 May 2026 16:31:19 +0200 Subject: [PATCH 1/6] vllm/sglang: add semantic degradation support --- doc/workloads/sglang.rst | 34 ++++++++ doc/workloads/vllm.rst | 29 +++++++ src/cloudai/workloads/common/llm_serving.py | 79 ++++++++++++++++--- src/cloudai/workloads/sglang/__init__.py | 4 + .../sglang/report_generation_strategy.py | 12 ++- src/cloudai/workloads/sglang/sglang.py | 33 ++++++++ .../sglang/slurm_command_gen_strategy.py | 19 +++++ src/cloudai/workloads/vllm/__init__.py | 14 +++- .../vllm/report_generation_strategy.py | 5 +- .../vllm/slurm_command_gen_strategy.py | 19 +++++ src/cloudai/workloads/vllm/vllm.py | 48 +++++++++++ .../sglang/test_command_gen_strategy_slurm.py | 72 ++++++++++++++++- .../test_job_status_retrieval_strategy.py | 60 +++++++++++++- .../sglang/test_report_gen_strategy.py | 41 +++++++++- .../vllm/test_command_gen_strategy_slurm.py | 49 ++++++++++++ .../test_job_status_retrieval_strategy.py | 40 +++++++++- .../vllm/test_report_gen_strategy.py | 38 ++++++++- 17 files changed, 579 insertions(+), 17 deletions(-) diff --git a/doc/workloads/sglang.rst b/doc/workloads/sglang.rst index 77121715b..d0561c773 100644 --- a/doc/workloads/sglang.rst +++ b/doc/workloads/sglang.rst @@ -28,6 +28,10 @@ Test + Scenario example max_concurrency = 16 num_prompts = 30 + [semantic_eval_cmd_args] + module = "sglang.test.run_eval" + args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + .. code-block:: toml :caption: scenario.toml (scenario with one test) @@ -68,6 +72,29 @@ Test-in-Scenario example num_prompts = 30 +Semantic Validation +------------------- +To run GSM8K semantic validation after the serving benchmark, add ``semantic_eval_cmd_args``. CloudAI reports +``accuracy`` from the eval output, but does not enforce an accuracy threshold. + +.. code-block:: toml + :caption: test.toml (semantic validation) + + [semantic_eval_cmd_args] + module = "sglang.test.run_eval" + args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + +For images that still use the legacy SGLang GSM8K runner, override the module and raw arguments: + +.. code-block:: toml + + [semantic_eval_cmd_args] + module = "sglang.test.few_shot_gsm8k" + args = "--num-questions 200" + +The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders. + + Control number of GPUs ---------------------- The number of GPUs can be controlled using the options below, listed from lowest to highest priority: @@ -130,6 +157,13 @@ Benchmark Command Arguments :members: :show-inheritance: +Semantic Eval Command Arguments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autopydantic_model:: cloudai.workloads.sglang.sglang.SglangSemanticEvalCmdArgs + :members: + :show-inheritance: + Test Definition ~~~~~~~~~~~~~~~ diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index 7285c5858..930bcf11b 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -28,6 +28,10 @@ Test and Scenario Examples max_concurrency = 16 num_prompts = 30 + [semantic_eval_cmd_args] + script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + .. code-block:: toml :caption: scenario.toml (scenario with one test) @@ -68,6 +72,24 @@ Test-in-Scenario example num_prompts = 30 +Semantic Validation +------------------- +To run GSM8K semantic validation after the serving benchmark, add ``semantic_eval_cmd_args``. CloudAI reports +``accuracy`` from the eval output, but does not enforce an accuracy threshold. + +.. code-block:: toml + :caption: test.toml (semantic validation) + + [semantic_eval_cmd_args] + script = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + +If the runtime image does not contain the eval script, mount a vLLM repository with existing ``git_repos`` support and +point ``script`` at the mounted path. + +The ``args`` string supports ``{model}``, ``{host}``, ``{port}``, and ``{output_path}`` placeholders. + + Controlling the Number of GPUs ------------------------------- The number of GPUs can be controlled using the options below, listed from lowest to highest priority: @@ -154,6 +176,13 @@ Benchmark Command Arguments :members: :show-inheritance: +Semantic Eval Command Arguments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: cloudai.workloads.vllm.vllm.VllmSemanticEvalCmdArgs + :members: + :show-inheritance: + Test Definition ~~~~~~~~~~~~~~~ diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index a023f2443..f83b3a97a 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -217,6 +217,7 @@ class LLMServingReportGenerationStrategy(ReportGenerationStrategy, Generic[TestD "throughput", "tps-per-user", "tps-per-gpu", + "accuracy", ] @property @@ -240,6 +241,10 @@ def all_gpu_ids(self, tdef: TestDefT, gpus_per_node: int | None) -> list[int]: def parse_results(self) -> ReportT | None: return self.parse_output(self.test_run.output_path / self.result_file_name) + def parse_semantic_accuracy(self) -> float | None: + """Parse semantic validation accuracy, if supported by the workload.""" + return None + def can_handle_directory(self) -> bool: return self.parse_results() is not None @@ -261,6 +266,13 @@ def get_metric(self, metric: str) -> MetricValue: if metric not in self.metrics: return METRIC_ERROR + if metric == "accuracy": + tdef = cast(TestDefT, self.test_run.test) + if getattr(tdef, "semantic_eval_cmd_args", None) is None: + return METRIC_ERROR + accuracy = self.parse_semantic_accuracy() + return accuracy if accuracy is not None else METRIC_ERROR + results = self.parse_results() if results is None: return METRIC_ERROR @@ -286,14 +298,31 @@ def generate_report(self) -> None: table.add_column("TPOT Mean, ms", justify="right") table.add_column("TPOT Median, ms", justify="right") table.add_column("TPOT P99, ms", justify="right") + accuracy = self.get_metric("accuracy") + if accuracy != METRIC_ERROR: + table.add_column("Accuracy", justify="right") + row = [ + f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})", + f"{results.mean_ttft_ms:.4f}", + f"{results.median_ttft_ms:.4f}", + f"{results.p99_ttft_ms:.4f}", + f"{results.mean_tpot_ms:.4f}", + f"{results.median_tpot_ms:.4f}", + f"{results.p99_tpot_ms:.4f}", + f"{accuracy:.4f}", + ] + else: + row = [ + f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})", + f"{results.mean_ttft_ms:.4f}", + f"{results.median_ttft_ms:.4f}", + f"{results.p99_ttft_ms:.4f}", + f"{results.mean_tpot_ms:.4f}", + f"{results.median_tpot_ms:.4f}", + f"{results.p99_tpot_ms:.4f}", + ] table.add_row( - f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})", - f"{results.mean_ttft_ms:.4f}", - f"{results.median_ttft_ms:.4f}", - f"{results.p99_ttft_ms:.4f}", - f"{results.mean_tpot_ms:.4f}", - f"{results.median_tpot_ms:.4f}", - f"{results.p99_tpot_ms:.4f}", + *row, ) console.print(table) @@ -522,6 +551,11 @@ def bench_log_file(self) -> str: """Benchmark log file name.""" return f"{self.workload_slug}-bench.log" + @property + def semantic_eval_log_file(self) -> str: + """Semantic validation log file name.""" + return f"{self.workload_slug}-semantic-eval.log" + @property def serve_pid_var(self) -> str: """Shell variable holding the aggregated serve PID.""" @@ -558,6 +592,33 @@ def get_bench_command(self) -> list[str]: def get_helper_command(self) -> list[str]: """Return the helper process command for disaggregated mode.""" + def get_semantic_eval_command(self) -> list[str] | None: + """Return the optional semantic validation command.""" + return None + + def _expand_semantic_eval_args(self, args: str, *, host: str) -> str: + replacements = { + "{model}": self.tdef.cmd_args.model, + "{host}": host, + "{port}": str(self.serve_port), + "{output_path}": str(self.test_run.output_path.absolute()), + } + for placeholder, value in replacements.items(): + args = args.replace(placeholder, value) + return args + + def _gen_semantic_eval_block(self, srun_prefix: str) -> str: + semantic_cmd = self.get_semantic_eval_command() + if not semantic_cmd: + return "" + + return f"""\ + +echo "Running semantic validation..." +{srun_prefix} \\ + --output={(self.test_run.output_path / self.semantic_eval_log_file).absolute()} \\ + {" ".join(semantic_cmd)}""" + def _gen_srun_command(self) -> str: serve_commands = self.get_serve_commands() srun_command = self._gen_llm_serving_srun_command(serve_commands) @@ -593,7 +654,7 @@ def _gen_aggregated_script(self, serve_cmd: list[str], bench_cmd: str) -> str: echo "Running benchmark..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={(self.test_run.output_path / self.bench_log_file).absolute()} \\ - {bench_cmd}""" + {bench_cmd}{self._gen_semantic_eval_block(f"{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1")}""" def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd: str) -> str: prefill_cmd, decode_cmd = serve_commands @@ -651,4 +712,4 @@ def _gen_disaggregated_script(self, serve_commands: list[list[str]], bench_cmd: echo "Running benchmark..." {prefill_srun_prefix} \\ --output={(self.test_run.output_path / self.bench_log_file).absolute()} \\ - {bench_cmd}""" + {bench_cmd}{self._gen_semantic_eval_block(prefill_srun_prefix)}""" diff --git a/src/cloudai/workloads/sglang/__init__.py b/src/cloudai/workloads/sglang/__init__.py index 7b1ebf778..578eef27b 100644 --- a/src/cloudai/workloads/sglang/__init__.py +++ b/src/cloudai/workloads/sglang/__init__.py @@ -18,10 +18,12 @@ from .sglang import ( SGLANG_BENCH_JSONL_FILE, SGLANG_BENCH_LOG_FILE, + SGLANG_SEMANTIC_EVAL_LOG_FILE, SglangArgs, SglangBenchCmdArgs, SGLangBenchReport, SglangCmdArgs, + SglangSemanticEvalCmdArgs, SglangTestDefinition, ) from .slurm_command_gen_strategy import SglangSlurmCommandGenStrategy @@ -29,11 +31,13 @@ __all__ = [ "SGLANG_BENCH_JSONL_FILE", "SGLANG_BENCH_LOG_FILE", + "SGLANG_SEMANTIC_EVAL_LOG_FILE", "SGLangBenchReport", "SGLangBenchReportGenerationStrategy", "SglangArgs", "SglangBenchCmdArgs", "SglangCmdArgs", + "SglangSemanticEvalCmdArgs", "SglangSlurmCommandGenStrategy", "SglangTestDefinition", ] diff --git a/src/cloudai/workloads/sglang/report_generation_strategy.py b/src/cloudai/workloads/sglang/report_generation_strategy.py index b535647bc..00c58d83f 100644 --- a/src/cloudai/workloads/sglang/report_generation_strategy.py +++ b/src/cloudai/workloads/sglang/report_generation_strategy.py @@ -18,7 +18,14 @@ from cloudai.workloads.common.llm_serving import LLMServingReportGenerationStrategy, all_gpu_ids -from .sglang import SGLANG_BENCH_JSONL_FILE, SGLangBenchReport, SglangTestDefinition, parse_sglang_bench_output +from .sglang import ( + SGLANG_BENCH_JSONL_FILE, + SGLANG_SEMANTIC_EVAL_LOG_FILE, + SGLangBenchReport, + SglangTestDefinition, + parse_sglang_bench_output, + parse_sglang_semantic_accuracy, +) class SGLangBenchReportGenerationStrategy(LLMServingReportGenerationStrategy[SglangTestDefinition, SGLangBenchReport]): @@ -35,5 +42,8 @@ def report_title(self) -> str: def parse_output(self, path: Path) -> SGLangBenchReport | None: return parse_sglang_bench_output(path) + def parse_semantic_accuracy(self) -> float | None: + return parse_sglang_semantic_accuracy(self.test_run.output_path / SGLANG_SEMANTIC_EVAL_LOG_FILE) + def all_gpu_ids(self, tdef: SglangTestDefinition, gpus_per_node: int | None) -> list[int]: return all_gpu_ids(tdef, gpus_per_node) diff --git a/src/cloudai/workloads/sglang/sglang.py b/src/cloudai/workloads/sglang/sglang.py index dbb48e92d..41428ccc8 100644 --- a/src/cloudai/workloads/sglang/sglang.py +++ b/src/cloudai/workloads/sglang/sglang.py @@ -17,6 +17,7 @@ from __future__ import annotations import logging +import re from functools import cache from pathlib import Path @@ -34,6 +35,7 @@ SGLANG_SERVE_LOG_FILE = "sglang-serve.log" SGLANG_BENCH_LOG_FILE = "sglang-bench.log" SGLANG_BENCH_JSONL_FILE = "sglang-bench.jsonl" +SGLANG_SEMANTIC_EVAL_LOG_FILE = "sglang-semantic-eval.log" class SglangArgs(LLMServingArgs): @@ -85,14 +87,29 @@ class SglangBenchCmdArgs(CmdArgs): output_details: bool = True +class SglangSemanticEvalCmdArgs(CmdArgs): + """SGLang semantic validation command arguments.""" + + module: str = "sglang.test.run_eval" + args: str = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + + class SglangTestDefinition(LLMServingTestDefinition[SglangCmdArgs]): """Test object for SGLang.""" bench_cmd_args: SglangBenchCmdArgs = SglangBenchCmdArgs() + semantic_eval_cmd_args: SglangSemanticEvalCmdArgs | None = None def was_run_successful(self, tr: TestRun) -> JobStatusResult: res = parse_sglang_bench_output(tr.output_path / SGLANG_BENCH_JSONL_FILE) if res and res.completed > 0: + if self.semantic_eval_cmd_args is not None: + accuracy = parse_sglang_semantic_accuracy(tr.output_path / SGLANG_SEMANTIC_EVAL_LOG_FILE) + if accuracy is None: + return JobStatusResult( + is_successful=False, + error_message=f"SGLang semantic accuracy not found in {tr.output_path}.", + ) return JobStatusResult(is_successful=True) return JobStatusResult( @@ -139,3 +156,19 @@ def parse_sglang_bench_output(jsonl_file: Path) -> SGLangBenchReport | None: continue return None + + +@cache +def parse_sglang_semantic_accuracy(log_file: Path) -> float | None: + """Parse SGLang semantic validation accuracy from run_eval or legacy GSM8K output.""" + if not log_file.is_file(): + return None + + pattern = re.compile(r"\b(?:Score|Accuracy):\s*([0-9]*\.?[0-9]+)") + with log_file.open(encoding="utf-8", errors="ignore") as f: + for line in f: + match = pattern.search(line) + if match: + return float(match.group(1)) + + return None diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py index fd2548b18..f1c7c741c 100644 --- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py @@ -23,6 +23,7 @@ SglangArgs, SglangBenchCmdArgs, SglangCmdArgs, + SglangSemanticEvalCmdArgs, SglangTestDefinition, ) @@ -123,5 +124,23 @@ def get_bench_command(self) -> list[str]: return command + def get_semantic_eval_command(self) -> list[str] | None: + eval_args: SglangSemanticEvalCmdArgs | None = self.tdef.semantic_eval_cmd_args + if eval_args is None: + return None + + host = self.bench_host + command = [ + "python3", + "-m", + eval_args.module, + f"--host {host}", + f"--port {self.serve_port}", + ] + args = self._expand_semantic_eval_args(eval_args.args, host=host) + if args: + command.append(args) + return command + def aggregated_serve_env(self) -> dict[str, str]: return {"CUDA_VISIBLE_DEVICES": ",".join(str(gpu_id) for gpu_id in self.gpu_ids)} diff --git a/src/cloudai/workloads/vllm/__init__.py b/src/cloudai/workloads/vllm/__init__.py index 5dbcf72ce..9d0da4e6f 100644 --- a/src/cloudai/workloads/vllm/__init__.py +++ b/src/cloudai/workloads/vllm/__init__.py @@ -16,15 +16,27 @@ from .report_generation_strategy import VLLMBenchReport, VLLMBenchReportGenerationStrategy from .slurm_command_gen_strategy import VllmSlurmCommandGenStrategy -from .vllm import VLLM_BENCH_LOG_FILE, VllmArgs, VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition +from .vllm import ( + VLLM_BENCH_LOG_FILE, + VLLM_GSM8K_JSON_FILE, + VLLM_SEMANTIC_EVAL_LOG_FILE, + VllmArgs, + VllmBenchCmdArgs, + VllmCmdArgs, + VllmSemanticEvalCmdArgs, + VllmTestDefinition, +) __all__ = [ "VLLM_BENCH_LOG_FILE", + "VLLM_GSM8K_JSON_FILE", + "VLLM_SEMANTIC_EVAL_LOG_FILE", "VLLMBenchReport", "VLLMBenchReportGenerationStrategy", "VllmArgs", "VllmBenchCmdArgs", "VllmCmdArgs", + "VllmSemanticEvalCmdArgs", "VllmSlurmCommandGenStrategy", "VllmTestDefinition", ] diff --git a/src/cloudai/workloads/vllm/report_generation_strategy.py b/src/cloudai/workloads/vllm/report_generation_strategy.py index 0cfbae5e8..fcecabfba 100644 --- a/src/cloudai/workloads/vllm/report_generation_strategy.py +++ b/src/cloudai/workloads/vllm/report_generation_strategy.py @@ -21,7 +21,7 @@ from cloudai.workloads.common.llm_serving import LLMServingBenchReport, LLMServingReportGenerationStrategy, all_gpu_ids -from .vllm import VLLM_BENCH_JSON_FILE, VllmTestDefinition +from .vllm import VLLM_BENCH_JSON_FILE, VllmTestDefinition, parse_vllm_semantic_accuracy class VLLMBenchReport(LLMServingBenchReport): @@ -62,5 +62,8 @@ def report_title(self) -> str: def parse_output(self, path: Path) -> VLLMBenchReport | None: return parse_vllm_bench_output(path) + def parse_semantic_accuracy(self) -> float | None: + return parse_vllm_semantic_accuracy(self.test_run.output_path) + def all_gpu_ids(self, tdef: VllmTestDefinition, gpus_per_node: int | None) -> list[int]: return all_gpu_ids(tdef, gpus_per_node) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 1fb40d83d..ea41cddf6 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -22,6 +22,7 @@ from .vllm import ( VLLM_BENCH_JSON_FILE, VllmCmdArgs, + VllmSemanticEvalCmdArgs, VllmTestDefinition, ) @@ -121,3 +122,21 @@ def get_bench_command(self) -> list[str]: "--save-result", *extras, ] + + def get_semantic_eval_command(self) -> list[str] | None: + eval_args: VllmSemanticEvalCmdArgs | None = self.tdef.semantic_eval_cmd_args + if eval_args is None: + return None + + host = self.bench_host + http_host = host if host.startswith("http://") or host.startswith("https://") else f"http://{host}" + command = [ + "python3", + eval_args.script, + f"--host {http_host}", + f"--port {self.serve_port}", + ] + args = self._expand_semantic_eval_args(eval_args.args, host=http_host) + if args: + command.append(args) + return command diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 9f8933b8c..7eb829971 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -16,7 +16,11 @@ from __future__ import annotations +import json import logging +import re +from functools import cache +from pathlib import Path from typing import Optional, cast from pydantic import ConfigDict, Field @@ -35,6 +39,8 @@ VLLM_SERVE_LOG_FILE = "vllm-serve.log" VLLM_BENCH_LOG_FILE = "vllm-bench.log" VLLM_BENCH_JSON_FILE = "vllm-bench.json" +VLLM_GSM8K_JSON_FILE = "vllm-gsm8k.json" +VLLM_SEMANTIC_EVAL_LOG_FILE = "vllm-semantic-eval.log" class VllmArgs(LLMServingArgs): @@ -81,10 +87,18 @@ class VllmBenchCmdArgs(CmdArgs): num_prompts: int = 30 +class VllmSemanticEvalCmdArgs(CmdArgs): + """vLLM semantic validation command arguments.""" + + script: str = "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py" + args: str = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + + class VllmTestDefinition(LLMServingTestDefinition[VllmCmdArgs]): """Test object for vLLM.""" bench_cmd_args: VllmBenchCmdArgs = VllmBenchCmdArgs() + semantic_eval_cmd_args: VllmSemanticEvalCmdArgs | None = None proxy_script_repo: GitRepo | None = None @property @@ -165,6 +179,13 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: try: num_successful_requests = int(line.split()[2]) if num_successful_requests > 0: + if self.semantic_eval_cmd_args is not None: + accuracy = parse_vllm_semantic_accuracy(tr.output_path) + if accuracy is None: + return JobStatusResult( + is_successful=False, + error_message=f"vLLM semantic accuracy not found in {tr.output_path}.", + ) return JobStatusResult(is_successful=True) except Exception as e: logging.debug(f"Error parsing number of successful requests: {e}") @@ -172,3 +193,30 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: return JobStatusResult( is_successful=False, error_message=f"vLLM bench log does not contain benchmark result in {tr.output_path}." ) + + +@cache +def parse_vllm_semantic_accuracy(output_path: Path) -> float | None: + """Parse vLLM semantic validation accuracy from JSON results or the eval log.""" + json_path = output_path / VLLM_GSM8K_JSON_FILE + if json_path.is_file(): + try: + data = json.loads(json_path.read_text(encoding="utf-8")) + accuracy = data.get("accuracy") if isinstance(data, dict) else None + if isinstance(accuracy, (int, float)): + return float(accuracy) + except Exception as e: + logging.debug(f"Error parsing vLLM semantic JSON output: {e}") + + log_path = output_path / VLLM_SEMANTIC_EVAL_LOG_FILE + if not log_path.is_file(): + return None + + pattern = re.compile(r"\bAccuracy:\s*([0-9]*\.?[0-9]+)") + with log_path.open(encoding="utf-8", errors="ignore") as f: + for line in f: + match = pattern.search(line) + if match: + return float(match.group(1)) + + return None diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py index 18c9d739b..84cdab162 100644 --- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py +++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py @@ -15,12 +15,19 @@ # limitations under the License. from pathlib import Path +from typing import cast import pytest from cloudai.core import TestRun from cloudai.systems.slurm import SlurmSystem -from cloudai.workloads.sglang import SglangArgs, SglangCmdArgs, SglangSlurmCommandGenStrategy, SglangTestDefinition +from cloudai.workloads.sglang import ( + SglangArgs, + SglangCmdArgs, + SglangSemanticEvalCmdArgs, + SglangSlurmCommandGenStrategy, + SglangTestDefinition, +) from cloudai.workloads.sglang.sglang import SGLANG_BENCH_JSONL_FILE, SGLANG_BENCH_LOG_FILE @@ -136,6 +143,69 @@ def test_get_sglang_bench_command_writes_jsonl( assert output_file_args[0].endswith(f"/{SGLANG_BENCH_JSONL_FILE}") +def test_get_sglang_semantic_eval_command_defaults(sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy): + sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test) + sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs() + + command = sglang_cmd_gen_strategy.get_semantic_eval_command() + + assert command == [ + "python3", + "-m", + "sglang.test.run_eval", + "--host ${NODE}", + "--port 8000", + "--eval-name gsm8k --num-examples 200 --num-threads 128 --model Qwen/Qwen3-8B", + ] + + +def test_get_sglang_semantic_eval_command_supports_custom_module_and_args( + sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy, +): + sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test) + sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs( + module="sglang.test.few_shot_gsm8k", + args="--num-questions 200 --data-path {output_path}/gsm8k.jsonl --seen {host}:{port}", + ) + + command = sglang_cmd_gen_strategy.get_semantic_eval_command() + + assert command is not None + assert command[2] == "sglang.test.few_shot_gsm8k" + assert command[-1] == ( + f"--num-questions 200 --data-path {sglang_cmd_gen_strategy.test_run.output_path.absolute()}/gsm8k.jsonl " + "--seen ${NODE}:8000" + ) + + +def test_gen_srun_command_contains_sglang_semantic_eval(sglang_cmd_gen_strategy: SglangSlurmCommandGenStrategy): + sglang_test = cast(SglangTestDefinition, sglang_cmd_gen_strategy.test_run.test) + sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs() + + srun_command = sglang_cmd_gen_strategy._gen_srun_command() + + assert "Running benchmark..." in srun_command + assert "Running semantic validation..." in srun_command + assert ( + "--output=" + str((sglang_cmd_gen_strategy.test_run.output_path / "sglang-semantic-eval.log").absolute()) + in srun_command + ) + assert "python3 -m sglang.test.run_eval --host ${NODE} --port 8000" in srun_command + + +def test_gen_srun_command_contains_sglang_semantic_eval_in_disagg( + sglang_disagg_tr: TestRun, slurm_system: SlurmSystem +) -> None: + sglang_test = cast(SglangTestDefinition, sglang_disagg_tr.test) + sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs() + strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) + + srun_command = strategy._gen_srun_command() + + assert "Running semantic validation..." in srun_command + assert "python3 -m sglang.test.run_eval --host ${PREFILL_NODE} --port 8000" in srun_command + + def test_gen_srun_command_contains_expected_flow(sglang_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: strategy = SglangSlurmCommandGenStrategy(slurm_system, sglang_disagg_tr) diff --git a/tests/workloads/sglang/test_job_status_retrieval_strategy.py b/tests/workloads/sglang/test_job_status_retrieval_strategy.py index 71335711b..099e9f329 100644 --- a/tests/workloads/sglang/test_job_status_retrieval_strategy.py +++ b/tests/workloads/sglang/test_job_status_retrieval_strategy.py @@ -17,7 +17,13 @@ import json from cloudai.core import TestRun -from cloudai.workloads.sglang import SGLANG_BENCH_JSONL_FILE, SglangCmdArgs, SglangTestDefinition +from cloudai.workloads.sglang import ( + SGLANG_BENCH_JSONL_FILE, + SGLANG_SEMANTIC_EVAL_LOG_FILE, + SglangCmdArgs, + SglangSemanticEvalCmdArgs, + SglangTestDefinition, +) class TestSglangSuccessCheck: @@ -89,3 +95,55 @@ def test_failed_job_no_successful_requests(self, base_tr: TestRun) -> None: assert ( result.error_message == f"SGLang bench jsonl does not contain successful requests in {base_tr.output_path}." ) + + def test_semantic_eval_successful_with_low_accuracy(self, base_tr: TestRun) -> None: + self.sglang_tdef.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs() + base_tr.output_path.mkdir(parents=True, exist_ok=True) + (base_tr.output_path / SGLANG_BENCH_JSONL_FILE).write_text( + json.dumps( + { + "completed": 3, + "num_prompts": 3, + "request_throughput": 1.0, + "max_concurrency": 16, + "mean_ttft_ms": 1.0, + "median_ttft_ms": 1.0, + "p99_ttft_ms": 1.0, + "mean_tpot_ms": 1.0, + "median_tpot_ms": 1.0, + "p99_tpot_ms": 1.0, + } + ) + + "\n" + ) + (base_tr.output_path / SGLANG_SEMANTIC_EVAL_LOG_FILE).write_text("Score: 0.000\n") + + result = self.sglang_tdef.was_run_successful(base_tr) + + assert result.is_successful + + def test_semantic_eval_requires_parseable_accuracy(self, base_tr: TestRun) -> None: + self.sglang_tdef.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs() + base_tr.output_path.mkdir(parents=True, exist_ok=True) + (base_tr.output_path / SGLANG_BENCH_JSONL_FILE).write_text( + json.dumps( + { + "completed": 3, + "num_prompts": 3, + "request_throughput": 1.0, + "max_concurrency": 16, + "mean_ttft_ms": 1.0, + "median_ttft_ms": 1.0, + "p99_ttft_ms": 1.0, + "mean_tpot_ms": 1.0, + "median_tpot_ms": 1.0, + "p99_tpot_ms": 1.0, + } + ) + + "\n" + ) + + result = self.sglang_tdef.was_run_successful(base_tr) + + assert not result.is_successful + assert result.error_message == f"SGLang semantic accuracy not found in {base_tr.output_path}." diff --git a/tests/workloads/sglang/test_report_gen_strategy.py b/tests/workloads/sglang/test_report_gen_strategy.py index d1597f8c6..86d5ed9c3 100644 --- a/tests/workloads/sglang/test_report_gen_strategy.py +++ b/tests/workloads/sglang/test_report_gen_strategy.py @@ -16,6 +16,7 @@ import json from pathlib import Path +from typing import cast import pytest @@ -26,9 +27,15 @@ SGLangBenchReport, SGLangBenchReportGenerationStrategy, SglangCmdArgs, + SglangSemanticEvalCmdArgs, SglangTestDefinition, ) -from cloudai.workloads.sglang.sglang import SGLANG_BENCH_JSONL_FILE, parse_sglang_bench_output +from cloudai.workloads.sglang.sglang import ( + SGLANG_BENCH_JSONL_FILE, + SGLANG_SEMANTIC_EVAL_LOG_FILE, + parse_sglang_bench_output, + parse_sglang_semantic_accuracy, +) BENCH_RECORD = { "num_prompts": 30, @@ -131,6 +138,38 @@ def test_sglang_tps_per_gpu(slurm_system: SlurmSystem, sglang_tr: TestRun) -> No assert metric == 600.0 +def test_sglang_accuracy_metric(slurm_system: SlurmSystem, sglang_tr: TestRun): + sglang_test = cast(SglangTestDefinition, sglang_tr.test) + sglang_test.semantic_eval_cmd_args = SglangSemanticEvalCmdArgs() + (sglang_tr.output_path / SGLANG_SEMANTIC_EVAL_LOG_FILE).write_text("Score: 0.945\n", encoding="utf-8") + + strategy = SGLangBenchReportGenerationStrategy(slurm_system, sglang_tr) + + assert strategy.get_metric("accuracy") == 0.945 + + +def test_parse_sglang_semantic_accuracy_from_score(tmp_path: Path): + log_path = tmp_path / "score.log" + log_path.write_text("Total latency: 1.000 s\nScore: 0.812\n", encoding="utf-8") + + assert parse_sglang_semantic_accuracy(log_path) == 0.812 + + +def test_parse_sglang_semantic_accuracy_from_legacy_accuracy(tmp_path: Path) -> None: + log_path = tmp_path / "accuracy.log" + log_path.write_text("Accuracy: 0.945\nInvalid: 0.000\n", encoding="utf-8") + + assert parse_sglang_semantic_accuracy(log_path) == 0.945 + + +def test_parse_sglang_semantic_accuracy_missing_or_invalid(tmp_path: Path) -> None: + log_path = tmp_path / "invalid.log" + log_path.write_text("no score here\n", encoding="utf-8") + + assert parse_sglang_semantic_accuracy(tmp_path / "missing.log") is None + assert parse_sglang_semantic_accuracy(log_path) is None + + def test_sglang_tps_per_user__concurrency_is_zero() -> None: bench_report = SGLangBenchReport.model_validate({**BENCH_RECORD, "max_concurrency": 0}) assert bench_report.tps_per_user is None diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index 0585e9b91..c7163b689 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -25,6 +25,7 @@ VllmArgs, VllmBenchCmdArgs, VllmCmdArgs, + VllmSemanticEvalCmdArgs, VllmSlurmCommandGenStrategy, VllmTestDefinition, ) @@ -184,6 +185,54 @@ def test_get_vllm_bench_command_with_extra_args( assert "--extra-3 3" in cmd +class TestVllmSemanticEvalCommand: + def test_get_vllm_semantic_eval_command_defaults(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy): + vllm_test = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test) + vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs() + + command = vllm_cmd_gen_strategy.get_semantic_eval_command() + + assert command == [ + "python3", + "/opt/vllm/tests/evals/gsm8k/gsm8k_eval.py", + "--host http://${NODE}", + "--port 8000", + "--num-questions 200 --save-results " + f"{vllm_cmd_gen_strategy.test_run.output_path.absolute()}/vllm-gsm8k.json", + ] + + def test_gen_srun_command_contains_vllm_semantic_eval( + self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy + ) -> None: + vllm_test = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test) + vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs() + + srun_command = vllm_cmd_gen_strategy._gen_srun_command() + + assert "Running benchmark..." in srun_command + assert "Running semantic validation..." in srun_command + assert ( + "--output=" + str((vllm_cmd_gen_strategy.test_run.output_path / "vllm-semantic-eval.log").absolute()) + in srun_command + ) + assert "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py --host http://${NODE} --port 8000" in srun_command + + def test_gen_srun_command_contains_vllm_semantic_eval_in_disagg( + self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem + ) -> None: + vllm_disagg_test = cast(VllmTestDefinition, vllm_disagg_tr.test) + vllm_disagg_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs() + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) + + srun_command = strategy._gen_srun_command() + + assert "Running semantic validation..." in srun_command + assert ( + "python3 /opt/vllm/tests/evals/gsm8k/gsm8k_eval.py --host http://${PREFILL_NODE} --port 8000" + in srun_command + ) + + class TestVllmAggregatedMode: """Tests for vLLM non-disaggregated mode with 1 GPU.""" diff --git a/tests/workloads/vllm/test_job_status_retrieval_strategy.py b/tests/workloads/vllm/test_job_status_retrieval_strategy.py index ec7678199..830e64ce9 100644 --- a/tests/workloads/vllm/test_job_status_retrieval_strategy.py +++ b/tests/workloads/vllm/test_job_status_retrieval_strategy.py @@ -15,7 +15,13 @@ # limitations under the License. from cloudai.core import TestRun -from cloudai.workloads.vllm import VLLM_BENCH_LOG_FILE, VllmCmdArgs, VllmTestDefinition +from cloudai.workloads.vllm import ( + VLLM_BENCH_LOG_FILE, + VLLM_GSM8K_JSON_FILE, + VllmCmdArgs, + VllmSemanticEvalCmdArgs, + VllmTestDefinition, +) class TestVllmSuccessCheck: @@ -72,3 +78,35 @@ def test_no_successful_requests(self, base_tr: TestRun) -> None: result = self.vllm_tdef.was_run_successful(base_tr) assert not result.is_successful assert result.error_message == f"vLLM bench log does not contain benchmark result in {base_tr.output_path}." + + def test_semantic_eval_successful_with_low_accuracy(self, base_tr: TestRun) -> None: + self.vllm_tdef.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs() + base_tr.output_path.mkdir(parents=True, exist_ok=True) + log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE + log_file.write_text( + """ +============ Serving Benchmark Result ============ +Successful requests: 1 +""" + ) + (base_tr.output_path / VLLM_GSM8K_JSON_FILE).write_text('{"accuracy": 0.0}') + + result = self.vllm_tdef.was_run_successful(base_tr) + + assert result.is_successful + + def test_semantic_eval_requires_parseable_accuracy(self, base_tr: TestRun) -> None: + self.vllm_tdef.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs() + base_tr.output_path.mkdir(parents=True, exist_ok=True) + log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE + log_file.write_text( + """ +============ Serving Benchmark Result ============ +Successful requests: 1 +""" + ) + + result = self.vllm_tdef.was_run_successful(base_tr) + + assert not result.is_successful + assert result.error_message == f"vLLM semantic accuracy not found in {base_tr.output_path}." diff --git a/tests/workloads/vllm/test_report_gen_strategy.py b/tests/workloads/vllm/test_report_gen_strategy.py index c1ae400cf..7e1821067 100644 --- a/tests/workloads/vllm/test_report_gen_strategy.py +++ b/tests/workloads/vllm/test_report_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. from pathlib import Path +from typing import cast import pytest @@ -25,10 +26,16 @@ VLLMBenchReport, VLLMBenchReportGenerationStrategy, VllmCmdArgs, + VllmSemanticEvalCmdArgs, VllmTestDefinition, ) from cloudai.workloads.vllm.report_generation_strategy import parse_vllm_bench_output -from cloudai.workloads.vllm.vllm import VLLM_BENCH_JSON_FILE +from cloudai.workloads.vllm.vllm import ( + VLLM_BENCH_JSON_FILE, + VLLM_GSM8K_JSON_FILE, + VLLM_SEMANTIC_EVAL_LOG_FILE, + parse_vllm_semantic_accuracy, +) BENCH_DATA = VLLMBenchReport( num_prompts=30, @@ -115,3 +122,32 @@ def test_vllm_tps_per_gpu(slurm_system: SlurmSystem, vllm_tr: TestRun, ngpus: in metric = strategy.get_metric("tps-per-gpu") assert metric == BENCH_DATA.throughput / ngpus + + +def test_vllm_accuracy_metric(slurm_system: SlurmSystem, vllm_tr: TestRun) -> None: + vllm_test = cast(VllmTestDefinition, vllm_tr.test) + vllm_test.semantic_eval_cmd_args = VllmSemanticEvalCmdArgs() + (vllm_tr.output_path / VLLM_GSM8K_JSON_FILE).write_text('{"accuracy": 0.875}', encoding="utf-8") + + strategy = VLLMBenchReportGenerationStrategy(slurm_system, vllm_tr) + + assert strategy.get_metric("accuracy") == 0.875 + + +def test_parse_vllm_semantic_accuracy_from_json(tmp_path: Path) -> None: + (tmp_path / VLLM_GSM8K_JSON_FILE).write_text('{"accuracy": 0.91}', encoding="utf-8") + + assert parse_vllm_semantic_accuracy(tmp_path) == 0.91 + + +def test_parse_vllm_semantic_accuracy_falls_back_to_log(tmp_path: Path) -> None: + (tmp_path / VLLM_GSM8K_JSON_FILE).write_text("{invalid", encoding="utf-8") + (tmp_path / VLLM_SEMANTIC_EVAL_LOG_FILE).write_text("Accuracy: 0.742\n", encoding="utf-8") + + assert parse_vllm_semantic_accuracy(tmp_path) == 0.742 + + +def test_parse_vllm_semantic_accuracy_missing_or_invalid(tmp_path: Path) -> None: + (tmp_path / VLLM_SEMANTIC_EVAL_LOG_FILE).write_text("no accuracy here\n", encoding="utf-8") + + assert parse_vllm_semantic_accuracy(tmp_path) is None From 7f1b7a7c22ccc6385a5e95d89a0d966a4c0fd81d Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 20 May 2026 22:05:38 +0200 Subject: [PATCH 2/6] update readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 65f42b4dc..a96814570 100644 --- a/README.md +++ b/README.md @@ -43,10 +43,12 @@ These schemas enable CloudAI to be flexible and compatible with different system |NIXL benchmark|✅|❌|❌|❌| |NIXL kvbench|✅|❌|❌|❌| |NIXL CTPerf|✅|❌|❌|❌| +|SGLang|✅|❌|❌|❌| |Sleep|✅|✅|❌|✅| |SlurmContainer|✅|❌|❌|❌| |Triton Inference|✅|❌|❌|❌| |UCC|✅|❌|❌|❌| +|vLLM|✅|❌|❌|❌| Note: Deprecated means that a workload support exists, but we are not maintaining it actively anymore and newer configurations might not work. From b0e2425f4e7c3e4eaf83050f7bf9d9d95853ef3d Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 20 May 2026 22:43:43 +0200 Subject: [PATCH 3/6] vllm/sglang configs in the repo --- conf/experimental/sglang/test/sglang.toml | 30 +++++++ .../sglang/test_scenario/sglang.toml | 81 +++++++++++++++++++ conf/experimental/vllm/test/vllm.toml | 35 ++++++++ .../experimental/vllm/test_scenario/vllm.toml | 51 ++++++++++++ src/cloudai/workloads/common/llm_serving.py | 37 ++++----- 5 files changed, 211 insertions(+), 23 deletions(-) create mode 100644 conf/experimental/sglang/test/sglang.toml create mode 100644 conf/experimental/sglang/test_scenario/sglang.toml create mode 100644 conf/experimental/vllm/test/vllm.toml create mode 100644 conf/experimental/vllm/test_scenario/vllm.toml diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml new file mode 100644 index 000000000..e6d2c09b4 --- /dev/null +++ b/conf/experimental/sglang/test/sglang.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "sglang" +description = "SGLang test" +test_template_name = "sglang" + +[cmd_args] +docker_image_url = "lmsysorg/sglang:dev-cu13" + +[semantic_eval_cmd_args] +module = "sglang.test.run_eval" +args = "--eval-name gsm8k --num-examples 200 --num-threads 128 --model {model}" + +[extra_env_vars] +UCX_NET_DEVICES = "all" +UCX_TLS = "^gdr_copy,cuda_ipc" diff --git a/conf/experimental/sglang/test_scenario/sglang.toml b/conf/experimental/sglang/test_scenario/sglang.toml new file mode 100644 index 000000000..be1bb7bb5 --- /dev/null +++ b/conf/experimental/sglang/test_scenario/sglang.toml @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "sglang" + +[[Tests]] +id = "sglang.agg.2nodes" +test_name = "sglang" +num_nodes = 2 +time_limit = "00:10:00" + + [Tests.cmd_args.decode] + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.agg.1node" +test_name = "sglang" +num_nodes = 1 +time_limit = "00:10:00" + + [Tests.cmd_args.decode] + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.disagg.sync" +test_name = "sglang" +num_nodes = 1 +time_limit = "00:10:00" + + [Tests.cmd_args.prefill] + gpu_ids = "0,1" + tensor_parallel_size = 4 + mem_fraction_static = 0.75 + + [Tests.cmd_args.decode] + gpu_ids = "2,3" + tensor_parallel_size = 4 + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.disagg.async" +test_name = "sglang" +num_nodes = 1 +time_limit = "00:10:00" + + [Tests.cmd_args.prefill] + gpu_ids = "0,1" + tensor_parallel_size = 4 + mem_fraction_static = 0.75 + + [Tests.cmd_args.decode] + gpu_ids = "2,3" + tensor_parallel_size = 2 + mem_fraction_static = 0.75 + +[[Tests]] +id = "sglang.disagg.2nodes" +test_name = "sglang" +num_nodes = 2 +time_limit = "00:10:00" + + [Tests.cmd_args.prefill] + tensor_parallel_size = 8 + mem_fraction_static = 0.75 + + [Tests.cmd_args.decode] + tensor_parallel_size = 8 + mem_fraction_static = 0.75 diff --git a/conf/experimental/vllm/test/vllm.toml b/conf/experimental/vllm/test/vllm.toml new file mode 100644 index 000000000..891023201 --- /dev/null +++ b/conf/experimental/vllm/test/vllm.toml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "vllm" +description = "vLLM test" +test_template_name = "vllm" + +[[git_repos]] +url = "https://github.com/vllm-project/vllm.git" +commit = "main" +mount_as = "/vllm_repo" + +[cmd_args] +docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.1.1" + +[semantic_eval_cmd_args] +script = "/vllm_repo/tests/evals/gsm8k/gsm8k_eval.py" +args = "--num-questions 200 --save-results {output_path}/vllm-gsm8k.json" + +[extra_env_vars] +UCX_NET_DEVICES = "all" +UCX_TLS = "^gdr_copy,cuda_ipc" diff --git a/conf/experimental/vllm/test_scenario/vllm.toml b/conf/experimental/vllm/test_scenario/vllm.toml new file mode 100644 index 000000000..ecacf4276 --- /dev/null +++ b/conf/experimental/vllm/test_scenario/vllm.toml @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "vllm" + +[[Tests]] +id = "vllm.disagg.sync" +test_name = "vllm" +num_nodes = 2 +time_limit = "00:30:00" + + [Tests.cmd_args.prefill] + enforce_eager = "" + tensor_parallel_size = 2 + max_num_batched_tokens = 1024 + + [Tests.cmd_args.decode] + enforce_eager = "" + tensor_parallel_size = 2 + max_num_batched_tokens = 1024 + +[[Tests]] +id = "vllm.disagg.async" +test_name = "vllm" +num_nodes = 1 +time_limit = "00:10:00" + + [Tests.cmd_args.prefill] + gpu_ids = "0,1" + enforce_eager = "" + tensor_parallel_size = 4 + max_num_batched_tokens = 1024 + + [Tests.cmd_args.decode] + gpu_ids = "2,3" + enforce_eager = "" + tensor_parallel_size = 2 + max_num_batched_tokens = 1024 diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index 4ab2718a6..75e0dca5e 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -313,32 +313,23 @@ def generate_report(self) -> None: table.add_column("TPOT Mean, ms", justify="right") table.add_column("TPOT Median, ms", justify="right") table.add_column("TPOT P99, ms", justify="right") + + row = [ + f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})", + f"{results.mean_ttft_ms:.4f}", + f"{results.median_ttft_ms:.4f}", + f"{results.p99_ttft_ms:.4f}", + f"{results.mean_tpot_ms:.4f}", + f"{results.median_tpot_ms:.4f}", + f"{results.p99_tpot_ms:.4f}", + ] + accuracy = self.get_metric("accuracy") if accuracy != METRIC_ERROR: table.add_column("Accuracy", justify="right") - row = [ - f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})", - f"{results.mean_ttft_ms:.4f}", - f"{results.median_ttft_ms:.4f}", - f"{results.p99_ttft_ms:.4f}", - f"{results.mean_tpot_ms:.4f}", - f"{results.median_tpot_ms:.4f}", - f"{results.p99_tpot_ms:.4f}", - f"{accuracy:.4f}", - ] - else: - row = [ - f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})", - f"{results.mean_ttft_ms:.4f}", - f"{results.median_ttft_ms:.4f}", - f"{results.p99_ttft_ms:.4f}", - f"{results.mean_tpot_ms:.4f}", - f"{results.median_tpot_ms:.4f}", - f"{results.p99_tpot_ms:.4f}", - ] - table.add_row( - *row, - ) + row.append(f"{accuracy:.4f}") + + table.add_row(*row) console.print(table) From b32782affac9362f201ab6711e818562757a573c Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Wed, 20 May 2026 23:39:13 +0200 Subject: [PATCH 4/6] adjust configs --- conf/experimental/sglang/test/sglang.toml | 2 +- conf/experimental/sglang/test_scenario/sglang.toml | 10 +++++----- conf/experimental/vllm/test_scenario/vllm.toml | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml index e6d2c09b4..ec420d0fd 100644 --- a/conf/experimental/sglang/test/sglang.toml +++ b/conf/experimental/sglang/test/sglang.toml @@ -19,7 +19,7 @@ description = "SGLang test" test_template_name = "sglang" [cmd_args] -docker_image_url = "lmsysorg/sglang:dev-cu13" +docker_image_url = "nvcr.io#nvidia/ai-dynamo/sglang-runtime:1.1.1" [semantic_eval_cmd_args] module = "sglang.test.run_eval" diff --git a/conf/experimental/sglang/test_scenario/sglang.toml b/conf/experimental/sglang/test_scenario/sglang.toml index be1bb7bb5..b6f96f4e2 100644 --- a/conf/experimental/sglang/test_scenario/sglang.toml +++ b/conf/experimental/sglang/test_scenario/sglang.toml @@ -42,12 +42,12 @@ time_limit = "00:10:00" [Tests.cmd_args.prefill] gpu_ids = "0,1" - tensor_parallel_size = 4 + tensor_parallel_size = 2 mem_fraction_static = 0.75 [Tests.cmd_args.decode] gpu_ids = "2,3" - tensor_parallel_size = 4 + tensor_parallel_size = 2 mem_fraction_static = 0.75 [[Tests]] @@ -58,7 +58,7 @@ time_limit = "00:10:00" [Tests.cmd_args.prefill] gpu_ids = "0,1" - tensor_parallel_size = 4 + tensor_parallel_size = 2 mem_fraction_static = 0.75 [Tests.cmd_args.decode] @@ -73,9 +73,9 @@ num_nodes = 2 time_limit = "00:10:00" [Tests.cmd_args.prefill] - tensor_parallel_size = 8 + tensor_parallel_size = 4 mem_fraction_static = 0.75 [Tests.cmd_args.decode] - tensor_parallel_size = 8 + tensor_parallel_size = 4 mem_fraction_static = 0.75 diff --git a/conf/experimental/vllm/test_scenario/vllm.toml b/conf/experimental/vllm/test_scenario/vllm.toml index ecacf4276..8e1207221 100644 --- a/conf/experimental/vllm/test_scenario/vllm.toml +++ b/conf/experimental/vllm/test_scenario/vllm.toml @@ -41,7 +41,7 @@ time_limit = "00:10:00" [Tests.cmd_args.prefill] gpu_ids = "0,1" enforce_eager = "" - tensor_parallel_size = 4 + tensor_parallel_size = 1 max_num_batched_tokens = 1024 [Tests.cmd_args.decode] From a73815863f0eefaf32c7cd1d5f12cf11e712820d Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 21 May 2026 11:06:48 +0200 Subject: [PATCH 5/6] support custom bash for semantic eval --- src/cloudai/workloads/common/llm_serving.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index 75e0dca5e..87ad7b3a3 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -634,13 +634,14 @@ def _gen_semantic_eval_block(self, srun_prefix: str) -> str: semantic_cmd = self.get_semantic_eval_command() if not semantic_cmd: return "" + semantic_cmd_full = self._with_custom_bash(" ".join(semantic_cmd)) return f"""\ echo "Running semantic validation..." {srun_prefix} \\ --output={(self.test_run.output_path / self.semantic_eval_log_file).absolute()} \\ - {" ".join(semantic_cmd)}""" + {semantic_cmd_full}""" def _gen_srun_command(self) -> str: serve_commands = self.get_serve_commands() From 7cb8c64b8a5c2baf4809d23023b8aa6cb7b4850e Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 21 May 2026 11:16:54 +0200 Subject: [PATCH 6/6] update sglang image --- conf/experimental/sglang/test/sglang.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/experimental/sglang/test/sglang.toml b/conf/experimental/sglang/test/sglang.toml index ec420d0fd..e6d2c09b4 100644 --- a/conf/experimental/sglang/test/sglang.toml +++ b/conf/experimental/sglang/test/sglang.toml @@ -19,7 +19,7 @@ description = "SGLang test" test_template_name = "sglang" [cmd_args] -docker_image_url = "nvcr.io#nvidia/ai-dynamo/sglang-runtime:1.1.1" +docker_image_url = "lmsysorg/sglang:dev-cu13" [semantic_eval_cmd_args] module = "sglang.test.run_eval"