From 8ffdcf8e995238225a49d859314bd931b026f7ee Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Wed, 20 May 2026 17:23:34 -0700 Subject: [PATCH 1/6] feat(ai_dynamo): add aiperf workload support Add AIPerf as a new workload type alongside GenAIPerf in the AIDynamo workload. Includes the aiperf.sh wrapper script, AIPerf Pydantic model, arg serialization in the command gen strategy, and experimental test/ scenario configs for single-node disaggregated runs. --- conf/experimental/ai_dynamo/test/aiperf.toml | 87 +++++++++++ .../aiperf_slurm_single_node.toml | 37 +++++ .../test_scenario/vllm_slurm_single_node.toml | 37 +++++ src/cloudai/workloads/ai_dynamo/__init__.py | 2 + src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 24 ++- src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 16 ++ src/cloudai/workloads/ai_dynamo/aiperf.sh | 143 ++++++++++++++++++ .../ai_dynamo/slurm_command_gen_strategy.py | 1 + tests/ref_data/ai-dynamo.sbatch | 6 +- 9 files changed, 351 insertions(+), 2 deletions(-) create mode 100644 conf/experimental/ai_dynamo/test/aiperf.toml create mode 100644 conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml create mode 100644 conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml create mode 100644 src/cloudai/workloads/ai_dynamo/aiperf.sh diff --git a/conf/experimental/ai_dynamo/test/aiperf.toml b/conf/experimental/ai_dynamo/test/aiperf.toml new file mode 100644 index 000000000..7fb015b86 --- /dev/null +++ b/conf/experimental/ai_dynamo/test/aiperf.toml @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "AIPerf-Qwen3-0.6B" +description = "aiperf benchmark" +test_template_name = "AIDynamo" +extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] + +[cmd_args] +docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1" +workloads = "aiperf.sh" + + [cmd_args.dynamo] + backend = "vllm" + model = "Qwen/Qwen3-0.6B" + + [cmd_args.dynamo.prefill_worker] + num-nodes = 1 + cmd = 'python3 -m dynamo.vllm --is-prefill-worker' + worker-initialized-regex = 'VllmWorker.*has.been.initialized' + extra-args = "--no-enable-expert-parallel" + + [cmd_args.dynamo.prefill_worker.args] + gpu-memory-utilization = 0.8 + tensor-parallel-size = 8 + pipeline-parallel-size = 1 + data-parallel-size = 1 + + [cmd_args.dynamo.decode_worker] + num-nodes = 1 + cmd = 'python3 -m dynamo.vllm' + worker-initialized-regex = 'VllmWorker.*has.been.initialized' + extra-args = "--no-enable-expert-parallel" + + [cmd_args.dynamo.decode_worker.args] + gpu-memory-utilization = 0.8 + tensor-parallel-size = 8 + pipeline-parallel-size = 1 + data-parallel-size = 1 + + [cmd_args.lmcache] + controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001" + + [cmd_args.lmcache.args] + chunk_size = 256 + local_cpu = false + nixl_buffer_size = 10737418240 + nixl_buffer_device = "cuda" + extra_config_enable_nixl_storage = true + extra_config_nixl_backend = "GDS_MT" + extra_config_nixl_file_pool_size = 64 + + enable_controller = true + lmcache_instance_id = "lmcache_default_instance" + controller_url = "localhost:9001" + lmcache_worker_port = 8788 + distributed_url = 
"localhost:8789" + + [cmd_args.aiperf] + cmd = "aiperf profile" + + [cmd_args.aiperf.args] + concurrency = 2 + request-count = 50 + synthetic-input-tokens-mean = 300 + output-tokens-mean = 500 + +[extra_env_vars] +UCX_LOG_LEVEL = "warn" +HF_HUB_OFFLINE = "1" +TRANSFORMERS_OFFLINE = "1" +HF_DATASETS_OFFLINE = "1" +DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')" +UCX_TLS = "all" diff --git a/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml new file mode 100644 index 000000000..fcc0a0afc --- /dev/null +++ b/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "dynamo-aiperf-slurm-single-node" +job_status_check = false + +[[Tests]] +id = "test.disagg.single-node" +test_name = "AIPerf-Qwen3-0.6B" +time_limit = "00:15:00" + + [Tests.cmd_args] + + [Tests.cmd_args.dynamo.prefill_worker] + num-nodes = 1 + [Tests.cmd_args.dynamo.prefill_worker.args] + tensor-parallel-size = 4 + pipeline-parallel-size = 1 + + [Tests.cmd_args.dynamo.decode_worker] + num-nodes = 1 + [Tests.cmd_args.dynamo.decode_worker.args] + tensor-parallel-size = 4 + pipeline-parallel-size = 1 diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml new file mode 100644 index 000000000..143784f2c --- /dev/null +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "dynamo-vllm-slurm-single-node" +job_status_check = false + +[[Tests]] +id = "test.disagg.single-node" +test_name = "vLLM" +time_limit = "00:10:00" + + [Tests.cmd_args] + + [Tests.cmd_args.dynamo.prefill_worker] + num-nodes = 1 + [Tests.cmd_args.dynamo.prefill_worker.args] + tensor-parallel-size = 4 + pipeline-parallel-size = 1 + + [Tests.cmd_args.dynamo.decode_worker] + num-nodes = 1 + [Tests.cmd_args.dynamo.decode_worker.args] + tensor-parallel-size = 4 + pipeline-parallel-size = 1 diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py index b5f030eeb..1360ce10d 100644 --- a/src/cloudai/workloads/ai_dynamo/__init__.py +++ b/src/cloudai/workloads/ai_dynamo/__init__.py @@ -18,6 +18,7 @@ AIDynamoArgs, AIDynamoCmdArgs, AIDynamoTestDefinition, + AIPerf, GenAIPerf, LMCache, LMCacheArgs, @@ -35,6 +36,7 @@ "AIDynamoReportGenerationStrategy", "AIDynamoSlurmCommandGenStrategy", "AIDynamoTestDefinition", + "AIPerf", "GenAIPerf", "LMCache", "LMCacheArgs", diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 55dc1f1b3..01912f0c1 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -282,6 +282,25 @@ def installables(self) -> list[Installable]: return [self.script] +class AIPerf(Workload): + """Workload configuration for aiperf benchmarking.""" + + model_config = ConfigDict(extra="allow") + + name: str = "aiperf" + cmd: str = "aiperf profile" + script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh") + report_name: str = Field( + default="aiperf_report.csv", + serialization_alias="report-name", + validation_alias=AliasChoices("report-name", "report_name"), + ) + + @property + def installables(self) -> list[Installable]: + return [self.script] + + class Constraints(BaseModel): """Constraints for validation of AI Dynamo configurations when using DSE.""" @@ -301,12 +320,13 @@ class 
AIDynamoCmdArgs(CmdArgs): dynamo: AIDynamoArgs lmcache: LMCache = Field(default_factory=LMCache) genai_perf: GenAIPerf = Field(default_factory=GenAIPerf) + aiperf: AIPerf = Field(default_factory=AIPerf) workloads: str = "genai_perf.sh" @field_validator("workloads", mode="before") @classmethod def validate_workloads(cls, v: str) -> str: - allowed_workloads = ["genai_perf.sh"] + allowed_workloads = ["genai_perf.sh", "aiperf.sh"] values = [w.strip() for w in v.split(",")] for workload in values: if workload not in allowed_workloads: @@ -322,6 +342,7 @@ def installables(self) -> list[Installable]: return [ *self.lmcache.installables, *self.genai_perf.installables, + *self.aiperf.installables, ] @@ -356,6 +377,7 @@ def get_workload_map(self) -> dict[str, Workload]: """Get a map of workload scripts to workload objects.""" return { self.cmd_args.genai_perf.script.src.name: self.cmd_args.genai_perf, + self.cmd_args.aiperf.script.src.name: self.cmd_args.aiperf, } @property diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index fd391fa5f..46f5daa42 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -35,6 +35,8 @@ declare -A lmcache_args declare -A lmcache_config declare -A genai_perf_args declare -A genai_perf_config +declare -A aiperf_args +declare -A aiperf_config declare -A dynamo_args dynamo_args["backend"]="vllm" @@ -163,6 +165,10 @@ _parse_cli_pairs() { genai_perf_args["--${key#--genai_perf-args-}"]="$2" ;; --genai_perf-*) genai_perf_config["--${key#--genai_perf-}"]="$2" ;; + --aiperf-args-*) + aiperf_args["--${key#--aiperf-args-}"]="$2" ;; + --aiperf-*) + aiperf_config["--${key#--aiperf-}"]="$2" ;; --hf-home) HUGGINGFACE_HOME="$2" ;; --storage-cache-dir) @@ -353,6 +359,8 @@ _dump_args() { log "LMCache args:\n$(arg_array_to_string lmcache_args)" log "GenAI config params:\n$(arg_array_to_string genai_perf_config)" log "GenAI-Perf 
args:\n$(arg_array_to_string genai_perf_args)" + log "AIPerf config params:\n$(arg_array_to_string aiperf_config)" + log "AIPerf args:\n$(arg_array_to_string aiperf_args)" log "--------------------------------" } @@ -505,6 +513,10 @@ _is_genai_perf_workload() { [[ "${dynamo_args["workloads"]}" == *"genai_perf.sh"* ]] } +_is_aiperf_workload() { + [[ "${dynamo_args["workloads"]}" == *"aiperf.sh"* ]] +} + _init_runtime_env() { if _is_vllm || _is_sglang; then export HF_HOME="${HUGGINGFACE_HOME}" @@ -1026,6 +1038,10 @@ function launch_workloads() launch_workload genai_perf_config genai_perf_args fi + if _is_aiperf_workload; then + launch_workload aiperf_config aiperf_args + fi + mark_done } diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh new file mode 100644 index 000000000..80baa0fd8 --- /dev/null +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# aiperf.sh — aiperf profile wrapper for ai_dynamo workloads. +# +# Called from ai_dynamo.sh's launch_workload() with: +# bash aiperf.sh --result-dir --model --url --port +# [--cmd ] [--report-name ] [--extra-args ] +# -- ... 
+# +# Context flags (before --) that are recognised and used: +# --result-dir Directory where artifacts and the final report are written. +# --model HuggingFace model identifier (e.g. Qwen/Qwen3-0.6B). +# --url Base URL of the dynamo.frontend (e.g. http://node01). +# --port HTTP port the dynamo.frontend is listening on. +# --report-name Output CSV name (default: aiperf_report.csv). +# --cmd Full launch command including subcommand (default: "aiperf profile"). +# --extra-args Raw string appended verbatim after all other flags. +# +# All unrecognised flags (--install-dir, --gpus-per-node, etc.) are silently +# consumed so this script is forward-compatible with launch_workload additions. +# +# Everything after -- is passed directly to the aiperf profile invocation. + +set -Eeuo pipefail + +result_dir="" +model="" +url="http://localhost" +port=8000 +report_name="aiperf_report.csv" +cmd="aiperf profile" +extra_args="" +declare -a aiperf_profile_args=() + +log() { + echo "[$(date '+%F %T') $(hostname)]: $*" +} + +_parse_aiperf_args() { + while [[ $# -ge 2 ]]; do + case "$1" in + --*) aiperf_profile_args+=("$1" "$2"); shift 2 ;; + *) shift ;; + esac + done + # Capture a trailing lone boolean flag if present. + # Use if/fi — not [[ ]] && — so set -e does not trigger on a false condition. 
+ if [[ $# -eq 1 && "$1" == --* ]]; then + aiperf_profile_args+=("$1") + fi +} + +process_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --result-dir) result_dir="$2"; shift 2 ;; + --model) model="$2"; shift 2 ;; + --url) url="$2"; shift 2 ;; + --port) port="$2"; shift 2 ;; + --report-name) report_name="$2"; shift 2 ;; + --cmd) cmd="$2"; shift 2 ;; + --extra-args) extra_args="$2"; shift 2 ;; + --) shift; _parse_aiperf_args "$@"; break ;; + --*) shift 2 ;; # consume unknown flag + its value + *) shift ;; + esac + done + + log "Parsed args: + result_dir: $result_dir + model: $model + url: $url + port: $port + report_name: $report_name + cmd: $cmd + extra_args: $extra_args + profile_args: ${aiperf_profile_args[*]:-}" +} + +process_results() { + local artifact_dir="$result_dir/aiperf_artifacts" + local csv_path + csv_path=$(find "$artifact_dir" -name "*.csv" -print -quit 2>/dev/null || true) + if [[ -n "$csv_path" ]]; then + cp "$csv_path" "$result_dir/$report_name" + log "aiperf report saved to $result_dir/$report_name" + else + log "ERROR: no CSV found in $artifact_dir — aiperf may not have completed" + exit 1 + fi +} + +main() { + process_args "$@" + + if [[ -z "$result_dir" ]]; then + log "ERROR: --result-dir is required"; exit 1 + fi + if [[ -z "$model" ]]; then + log "ERROR: --model is required"; exit 1 + fi + + local full_url="${url}:${port}" + local artifact_dir="$result_dir/aiperf_artifacts" + rm -rf "$artifact_dir" + + # Split cmd into an array (e.g. 
"aiperf profile" → ["aiperf", "profile"]) + local -a run_cmd=() + read -ra run_cmd <<< "$cmd" + + log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url" + + "${run_cmd[@]}" \ + --model "$model" \ + --url "$full_url" \ + --endpoint-type chat \ + --streaming \ + --artifact-dir "$artifact_dir" \ + --no-server-metrics \ + "${aiperf_profile_args[@]}" \ + ${extra_args} + + log "aiperf run complete" + process_results +} + +main "$@" +exit 0 diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 4fbee5c7d..17079875c 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -117,6 +117,7 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: args.extend(self._get_nested_toml_args(td.cmd_args.lmcache, "--lmcache-")) args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-")) + args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-")) return args diff --git a/tests/ref_data/ai-dynamo.sbatch b/tests/ref_data/ai-dynamo.sbatch index f9b5150c5..492e3c427 100644 --- a/tests/ref_data/ai-dynamo.sbatch +++ b/tests/ref_data/ai-dynamo.sbatch @@ -85,4 +85,8 @@ srun \ --genai_perf-random-seed "42" \ --genai_perf-request-count "100" \ --genai_perf-synthetic-input-tokens-mean "550" \ - --genai_perf-warmup-request-count "10" \ No newline at end of file + --genai_perf-warmup-request-count "10" \ + --aiperf-name "aiperf" \ + --aiperf-cmd "aiperf profile" \ + --aiperf-script "/cloudai_install/aiperf.sh" \ + --aiperf-report-name "aiperf_report.csv" \ No newline at end of file From 8a801b1c8ea4e8cd77b5e43321506fd46c7ec931 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Wed, 20 May 2026 18:34:55 -0700 Subject: [PATCH 2/6] refactor(ai_dynamo): fold aiperf into vllm test, fix aiperf.sh arg handling - Move aiperf config into vllm.toml (workloads = 
"aiperf.sh" + [cmd_args.aiperf]) so the backend and benchmark live in one file; remove standalone aiperf.toml and aiperf_slurm_single_node.toml - Fix aiperf.sh: convert extra_args from string to array to prevent word-splitting/glob expansion; fix unknown --* flag handling to shift 1 for boolean flags instead of always shift 2 --- conf/experimental/ai_dynamo/test/aiperf.toml | 87 ------------------- conf/experimental/ai_dynamo/test/vllm.toml | 10 ++- .../aiperf_slurm_single_node.toml | 37 -------- .../test_scenario/vllm_slurm_single_node.toml | 2 +- src/cloudai/workloads/ai_dynamo/aiperf.sh | 10 +-- 5 files changed, 15 insertions(+), 131 deletions(-) delete mode 100644 conf/experimental/ai_dynamo/test/aiperf.toml delete mode 100644 conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml diff --git a/conf/experimental/ai_dynamo/test/aiperf.toml b/conf/experimental/ai_dynamo/test/aiperf.toml deleted file mode 100644 index 7fb015b86..000000000 --- a/conf/experimental/ai_dynamo/test/aiperf.toml +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "AIPerf-Qwen3-0.6B" -description = "aiperf benchmark" -test_template_name = "AIDynamo" -extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] - -[cmd_args] -docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1" -workloads = "aiperf.sh" - - [cmd_args.dynamo] - backend = "vllm" - model = "Qwen/Qwen3-0.6B" - - [cmd_args.dynamo.prefill_worker] - num-nodes = 1 - cmd = 'python3 -m dynamo.vllm --is-prefill-worker' - worker-initialized-regex = 'VllmWorker.*has.been.initialized' - extra-args = "--no-enable-expert-parallel" - - [cmd_args.dynamo.prefill_worker.args] - gpu-memory-utilization = 0.8 - tensor-parallel-size = 8 - pipeline-parallel-size = 1 - data-parallel-size = 1 - - [cmd_args.dynamo.decode_worker] - num-nodes = 1 - cmd = 'python3 -m dynamo.vllm' - worker-initialized-regex = 'VllmWorker.*has.been.initialized' - extra-args = "--no-enable-expert-parallel" - - [cmd_args.dynamo.decode_worker.args] - gpu-memory-utilization = 0.8 - tensor-parallel-size = 8 - pipeline-parallel-size = 1 - data-parallel-size = 1 - - [cmd_args.lmcache] - controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001" - - [cmd_args.lmcache.args] - chunk_size = 256 - local_cpu = false - nixl_buffer_size = 10737418240 - nixl_buffer_device = "cuda" - extra_config_enable_nixl_storage = true - extra_config_nixl_backend = "GDS_MT" - extra_config_nixl_file_pool_size = 64 - - enable_controller = true - lmcache_instance_id = "lmcache_default_instance" - controller_url = "localhost:9001" - lmcache_worker_port = 8788 - distributed_url = "localhost:8789" - - [cmd_args.aiperf] - cmd = "aiperf profile" - - [cmd_args.aiperf.args] - concurrency = 2 - request-count = 50 - synthetic-input-tokens-mean = 300 - output-tokens-mean = 500 - -[extra_env_vars] -UCX_LOG_LEVEL = "warn" -HF_HUB_OFFLINE = "1" -TRANSFORMERS_OFFLINE = "1" -HF_DATASETS_OFFLINE = "1" -DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')" -UCX_TLS = "all" diff 
--git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml index f1249564f..c88ff3e10 100644 --- a/conf/experimental/ai_dynamo/test/vllm.toml +++ b/conf/experimental/ai_dynamo/test/vllm.toml @@ -21,7 +21,7 @@ extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] [cmd_args] docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1" -workloads = "genai_perf.sh" +workloads = "aiperf.sh" [cmd_args.dynamo] backend = "vllm" @@ -85,6 +85,14 @@ workloads = "genai_perf.sh" warmup-request-count = 5 concurrency = 2 + [cmd_args.aiperf] + + [cmd_args.aiperf.args] + concurrency = 2 + request-count = 50 + synthetic-input-tokens-mean = 300 + output-tokens-mean = 500 + [extra_env_vars] UCX_LOG_LEVEL = "warn" HF_HUB_OFFLINE = "1" diff --git a/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml deleted file mode 100644 index fcc0a0afc..000000000 --- a/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -name = "dynamo-aiperf-slurm-single-node" -job_status_check = false - -[[Tests]] -id = "test.disagg.single-node" -test_name = "AIPerf-Qwen3-0.6B" -time_limit = "00:15:00" - - [Tests.cmd_args] - - [Tests.cmd_args.dynamo.prefill_worker] - num-nodes = 1 - [Tests.cmd_args.dynamo.prefill_worker.args] - tensor-parallel-size = 4 - pipeline-parallel-size = 1 - - [Tests.cmd_args.dynamo.decode_worker] - num-nodes = 1 - [Tests.cmd_args.dynamo.decode_worker.args] - tensor-parallel-size = 4 - pipeline-parallel-size = 1 diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml index 143784f2c..76bef2482 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml @@ -20,7 +20,7 @@ job_status_check = false [[Tests]] id = "test.disagg.single-node" test_name = "vLLM" -time_limit = "00:10:00" +time_limit = "00:15:00" [Tests.cmd_args] diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh index 80baa0fd8..9f5a78b33 100644 --- a/src/cloudai/workloads/ai_dynamo/aiperf.sh +++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh @@ -44,7 +44,7 @@ url="http://localhost" port=8000 report_name="aiperf_report.csv" cmd="aiperf profile" -extra_args="" +declare -a extra_args=() declare -a aiperf_profile_args=() log() { @@ -74,9 +74,9 @@ process_args() { --port) port="$2"; shift 2 ;; --report-name) report_name="$2"; shift 2 ;; --cmd) cmd="$2"; shift 2 ;; - --extra-args) extra_args="$2"; shift 2 ;; + --extra-args) read -ra extra_args <<< "$2"; shift 2 ;; --) shift; _parse_aiperf_args "$@"; break ;; - --*) shift 2 ;; # consume unknown flag + its value + --*) if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;; # consume unknown flag; shift 2 only if next arg is a value *) shift ;; esac done @@ -88,7 +88,7 @@ process_args() { port: $port report_name: 
$report_name cmd: $cmd - extra_args: $extra_args + extra_args: ${extra_args[*]:-} profile_args: ${aiperf_profile_args[*]:-}" } @@ -133,7 +133,7 @@ main() { --artifact-dir "$artifact_dir" \ --no-server-metrics \ "${aiperf_profile_args[@]}" \ - ${extra_args} + "${extra_args[@]}" log "aiperf run complete" process_results From f124881b914a751d95b66c69d018974162241434 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Wed, 20 May 2026 18:49:07 -0700 Subject: [PATCH 3/6] fix(ai_dynamo): update vllm_slurm scenario test_name, remove redundant scenario file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix stale test_name reference in vllm_slurm.toml (vLLM-Qwen3-0.6B → vLLM) and remove vllm_slurm_single_node.toml since vllm_slurm.toml already covers the single-node case. --- .../ai_dynamo/test_scenario/vllm_slurm.toml | 4 +- .../test_scenario/vllm_slurm_single_node.toml | 37 ------------------- 2 files changed, 2 insertions(+), 39 deletions(-) delete mode 100644 conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml index c63c648fe..45031da3a 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml @@ -19,7 +19,7 @@ job_status_check = false [[Tests]] id = "test.disagg.single-node" -test_name = "vLLM-Qwen3-0.6B" +test_name = "vLLM" time_limit = "00:10:00" [Tests.cmd_args] @@ -38,7 +38,7 @@ time_limit = "00:10:00" [[Tests]] id = "test.disagg.multinode" -test_name = "vLLM-Qwen3-0.6B" +test_name = "vLLM" time_limit = "00:10:00" [Tests.cmd_args] diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml deleted file mode 100644 index 76bef2482..000000000 --- 
a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name = "dynamo-vllm-slurm-single-node" -job_status_check = false - -[[Tests]] -id = "test.disagg.single-node" -test_name = "vLLM" -time_limit = "00:15:00" - - [Tests.cmd_args] - - [Tests.cmd_args.dynamo.prefill_worker] - num-nodes = 1 - [Tests.cmd_args.dynamo.prefill_worker.args] - tensor-parallel-size = 4 - pipeline-parallel-size = 1 - - [Tests.cmd_args.dynamo.decode_worker] - num-nodes = 1 - [Tests.cmd_args.dynamo.decode_worker.args] - tensor-parallel-size = 4 - pipeline-parallel-size = 1 From a523413ee58b60fc9958a2a2ea92f0103e49d6b5 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Thu, 21 May 2026 13:47:02 -0700 Subject: [PATCH 4/6] fix(ai_dynamo): add aiperf to sglang test, fix report strategy default - Add aiperf config to sglang.toml (workloads = "aiperf.sh") so both vLLM and sglang backends use aiperf as the default benchmark - Fix sglang_slurm.toml: correct stale test_name and add job_status_check=false - Fix AIDynamoReportGenerationStrategy default benchmark_name from "genai_perf" to "aiperf" to match the new default workload; genai_perf metrics still accessible via "genai_perf:metric_name:metric_type" format - Update unit tests: fix 
existing genai_perf tests to use explicit prefix, add aiperf-specific tests covering per-request metrics, summary metrics, and default benchmark resolution --- conf/experimental/ai_dynamo/test/sglang.toml | 10 +++- .../ai_dynamo/test_scenario/sglang_slurm.toml | 3 +- .../ai_dynamo/report_generation_strategy.py | 2 +- .../ai_dynamo/test_report_gen_strategy.py | 56 ++++++++++++++----- 4 files changed, 54 insertions(+), 17 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index d1429fe11..0ebc3bd60 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -21,7 +21,7 @@ extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] [cmd_args] docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0" -workloads = "genai_perf.sh" +workloads = "aiperf.sh" [cmd_args.dynamo] backend = "sglang" @@ -93,6 +93,14 @@ workloads = "genai_perf.sh" warmup-request-count = 5 concurrency = 2 + [cmd_args.aiperf] + + [cmd_args.aiperf.args] + concurrency = 2 + request-count = 50 + synthetic-input-tokens-mean = 300 + output-tokens-mean = 500 + [extra_env_vars] UCX_LOG_LEVEL = "warn" HF_HUB_OFFLINE = "1" diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml index d6b8eac1c..383557377 100644 --- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml @@ -15,10 +15,11 @@ # limitations under the License. 
name = "dynamo_sglang" +job_status_check = false [[Tests]] id = "sglang-Qwen3-0.6B" -test_name = "sglang-Qwen3-0.6B" +test_name = "sglang" time_limit = "00:20:00" [Tests.cmd_args] diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py index a2f243712..2ddd3bfa9 100644 --- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py @@ -44,7 +44,7 @@ def extract_metric_from_csv(self, csv_file: Path, metric_name: str, metric_type: def get_metric(self, metric: str) -> MetricValue: logging.info(f"Getting metric: {metric}") - benchmark_name = "genai_perf" + benchmark_name = "aiperf" metric_name = metric metric_type = "avg" diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py index 2674f4a77..eb93b993a 100644 --- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py +++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py @@ -47,6 +47,20 @@ def get_csv_content() -> str: ) +def get_aiperf_csv_content() -> str: + return ( + "Metric,avg,min,max,p1,p5,p10,p25,p50,p75,p90,p95,p99,std\n" + "Inter Token Latency (ms),2.83,2.78,2.91,2.78,2.79,2.81,2.82,2.83,2.84,2.85,2.88,2.90,0.03\n" + "Time to First Token (ms),49.87,17.15,99.91,18.60,21.26,22.46,49.35,49.87,50.52,51.63,53.91,92.31,9.20\n" + "Output Sequence Length (tokens),498.06,410.00,501.00,450.67,499.00,500.00,500.00,500.00,500.00,500.00,500.00,501.00,12.62\n" + "\n" + "Metric,Value\n" + "Output Token Throughput (tokens/sec),595.68\n" + "Total Token Throughput (tokens/sec),954.47\n" + "Request Count,50.00\n" + ) + + @pytest.fixture def ai_dynamo_tr(tmp_path: Path) -> TestRun: test = AIDynamoTestDefinition( @@ -70,6 +84,7 @@ def ai_dynamo_tr(tmp_path: Path) -> TestRun: csv_content = get_csv_content() (tr.output_path / "genai_perf_report.csv").write_text(csv_content) + (tr.output_path / 
"aiperf_report.csv").write_text(get_aiperf_csv_content()) (tr.output_path / "profile_genai_perf.csv").write_text(csv_content) (tr.output_path / "profile_genai_perf.json").write_text("mock json content") (tr.output_path / test.success_marker).touch() @@ -95,32 +110,45 @@ def test_ai_dynamo_generate_report(slurm_system: SlurmSystem, ai_dynamo_tr: Test assert True -def test_ai_dynamo_get_metric_single_values(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: +def test_ai_dynamo_get_metric_genai_perf(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) - # Test that metrics from the first CSV section work - assert strategy.get_metric("Output Sequence Length (tokens)") == 101.01 - assert strategy.get_metric("Input Sequence Length (tokens)") == 123.45 + # genai_perf metrics require explicit benchmark prefix since default is now aiperf + assert strategy.get_metric("genai_perf:Output Sequence Length (tokens):avg") == 101.01 + assert strategy.get_metric("genai_perf:Inter Token Latency (ms):avg") == 12.34 + assert strategy.get_metric("genai_perf:Time To First Token (ms):avg") == 111.12 + assert strategy.get_metric("genai_perf:Inter Token Latency (ms):p50") == 89.01 -def test_ai_dynamo_get_metric_statistical_values(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: +def test_ai_dynamo_get_metric_aiperf_defaults(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) - # Use exact metric names from CSV (with avg column, which is default) - assert strategy.get_metric("Time To First Token (ms)") == 111.12 - assert strategy.get_metric("Time To Second Token (ms)") == 11.13 - assert strategy.get_metric("Request Latency (ms)") == 1111.14 - assert strategy.get_metric("Inter Token Latency (ms)") == 12.34 + # bare metric names default to aiperf_report.csv (avg column) + assert strategy.get_metric("Inter Token Latency 
(ms)") == 2.83 + assert strategy.get_metric("Output Token Throughput (tokens/sec)") == 595.68 + + +def test_ai_dynamo_get_metric_aiperf_explicit(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: + strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) + + # per-request metrics (first CSV section with avg/p50 columns) + assert strategy.get_metric("aiperf:Inter Token Latency (ms):avg") == 2.83 + assert strategy.get_metric("aiperf:Inter Token Latency (ms):p50") == 2.83 + assert strategy.get_metric("aiperf:Time to First Token (ms):avg") == 49.87 + + # summary metrics (second CSV section — value lands in "avg" column by position) + assert strategy.get_metric("aiperf:Output Token Throughput (tokens/sec):avg") == 595.68 + assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47 def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) - assert strategy.get_metric("invalid-metric") == METRIC_ERROR + assert strategy.get_metric("nonexistent-metric") == METRIC_ERROR - # Empty the CSV file to test error handling - (ai_dynamo_tr.output_path / "genai_perf_report.csv").write_text("") - assert strategy.get_metric("invalid-metric") == METRIC_ERROR + # Empty the aiperf CSV to test error handling for the default path + (ai_dynamo_tr.output_path / "aiperf_report.csv").write_text("") + assert strategy.get_metric("Inter Token Latency (ms)") == METRIC_ERROR def test_was_run_successful(ai_dynamo_tr: TestRun) -> None: From d5d556ebf933aa96f1c293729758375fe9dd2c5e Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Thu, 21 May 2026 15:30:54 -0700 Subject: [PATCH 5/6] fix(ai_dynamo): derive benchmark name from workload config in report strategy - Replace hardcoded benchmark_name default in get_metric() with dynamic derivation from cmd_args.workloads_list (e.g. 
"aiperf.sh" -> "aiperf"); explicit benchmark:metric:type format still takes precedence - Add aiperf config to sglang.toml and fix stale test_name in sglang_slurm.toml - Update unit tests: add ai_dynamo_aiperf_tr fixture, separate genai_perf and aiperf metric tests, fix E501 in aiperf CSV fixture data --- .../ai_dynamo/test_scenario/sglang_slurm.toml | 1 - .../ai_dynamo/report_generation_strategy.py | 5 +- .../ai_dynamo/test_report_gen_strategy.py | 63 ++++++++++++------- 3 files changed, 45 insertions(+), 24 deletions(-) diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml index 383557377..26ed91285 100644 --- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml @@ -15,7 +15,6 @@ # limitations under the License. name = "dynamo_sglang" -job_status_check = false [[Tests]] id = "sglang-Qwen3-0.6B" diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py index 2ddd3bfa9..a8e4e91b8 100644 --- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py @@ -44,7 +44,6 @@ def extract_metric_from_csv(self, csv_file: Path, metric_name: str, metric_type: def get_metric(self, metric: str) -> MetricValue: logging.info(f"Getting metric: {metric}") - benchmark_name = "aiperf" metric_name = metric metric_type = "avg" @@ -54,6 +53,10 @@ def get_metric(self, metric: str) -> MetricValue: logging.warning(f"Invalid metric format: {metric}. Expected 'benchmark:metric_name:metric_type'") return METRIC_ERROR benchmark_name, metric_name, metric_type = parts + else: + # Derive from the configured workload script (e.g. "aiperf.sh" → "aiperf"). 
+ workloads_list = getattr(getattr(self.test_run.test, "cmd_args", None), "workloads_list", None) + benchmark_name = Path(workloads_list[0]).stem if workloads_list else "aiperf" source_csv = self.test_run.output_path / f"{benchmark_name}_report.csv" logging.info(f"CSV file: {source_csv}") diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py index eb93b993a..0e51c414f 100644 --- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py +++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py @@ -25,6 +25,7 @@ AIDynamoArgs, AIDynamoCmdArgs, AIDynamoTestDefinition, + AIPerf, GenAIPerf, LMCache, LMCacheArgs, @@ -49,10 +50,10 @@ def get_csv_content() -> str: def get_aiperf_csv_content() -> str: return ( - "Metric,avg,min,max,p1,p5,p10,p25,p50,p75,p90,p95,p99,std\n" - "Inter Token Latency (ms),2.83,2.78,2.91,2.78,2.79,2.81,2.82,2.83,2.84,2.85,2.88,2.90,0.03\n" - "Time to First Token (ms),49.87,17.15,99.91,18.60,21.26,22.46,49.35,49.87,50.52,51.63,53.91,92.31,9.20\n" - "Output Sequence Length (tokens),498.06,410.00,501.00,450.67,499.00,500.00,500.00,500.00,500.00,500.00,500.00,501.00,12.62\n" + "Metric,avg,min,max\n" + "Inter Token Latency (ms),2.83,2.78,2.91\n" + "Time to First Token (ms),49.87,17.15,99.91\n" + "Output Sequence Length (tokens),498.06,410.00,501.00\n" "\n" "Metric,Value\n" "Output Token Throughput (tokens/sec),595.68\n" @@ -92,6 +93,32 @@ def ai_dynamo_tr(tmp_path: Path) -> TestRun: return tr +@pytest.fixture +def ai_dynamo_aiperf_tr(tmp_path: Path) -> TestRun: + test = AIDynamoTestDefinition( + name="ai_dynamo_aiperf", + description="desc", + test_template_name="t", + cmd_args=AIDynamoCmdArgs( + docker_image_url="http://url", + workloads="aiperf.sh", + dynamo=AIDynamoArgs( + prefill_worker=WorkerConfig( + cmd="python3 -m dynamo.vllm --is-prefill-worker", + worker_initialized_regex="VllmWorker.*has.been.initialized", + args=WorkerBaseArgs(), + ), + ), + aiperf=AIPerf(), + 
lmcache=LMCache(args=LMCacheArgs()), + ), + ) + tr = TestRun(name="ai_dynamo_aiperf", test=test, num_nodes=1, nodes=[], output_path=tmp_path) + (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content()) + (tr.output_path / test.success_marker).touch() + return tr + + @pytest.fixture def csv_content() -> str: return get_csv_content() @@ -104,39 +131,32 @@ def test_ai_dynamo_can_handle_directory(slurm_system: SlurmSystem, ai_dynamo_tr: def test_ai_dynamo_generate_report(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun, csv_content: str) -> None: strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) - # The new implementation does not generate a report file strategy.generate_report() - # Just verify the method runs without error assert True def test_ai_dynamo_get_metric_genai_perf(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) - # genai_perf metrics require explicit benchmark prefix since default is now aiperf - assert strategy.get_metric("genai_perf:Output Sequence Length (tokens):avg") == 101.01 - assert strategy.get_metric("genai_perf:Inter Token Latency (ms):avg") == 12.34 + # Default fixture uses workloads="genai_perf.sh" — bare names resolve to genai_perf_report.csv. + assert strategy.get_metric("Inter Token Latency (ms)") == 12.34 + assert strategy.get_metric("Output Sequence Length (tokens)") == 101.01 + + # Explicit prefix also works. 
assert strategy.get_metric("genai_perf:Time To First Token (ms):avg") == 111.12 assert strategy.get_metric("genai_perf:Inter Token Latency (ms):p50") == 89.01 -def test_ai_dynamo_get_metric_aiperf_defaults(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: - strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) +def test_ai_dynamo_get_metric_aiperf(slurm_system: SlurmSystem, ai_dynamo_aiperf_tr: TestRun) -> None: + strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_tr) - # bare metric names default to aiperf_report.csv (avg column) + # aiperf fixture uses workloads="aiperf.sh" — bare names resolve to aiperf_report.csv. assert strategy.get_metric("Inter Token Latency (ms)") == 2.83 assert strategy.get_metric("Output Token Throughput (tokens/sec)") == 595.68 - -def test_ai_dynamo_get_metric_aiperf_explicit(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None: - strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr) - - # per-request metrics (first CSV section with avg/p50 columns) + # Explicit prefix. 
assert strategy.get_metric("aiperf:Inter Token Latency (ms):avg") == 2.83 - assert strategy.get_metric("aiperf:Inter Token Latency (ms):p50") == 2.83 assert strategy.get_metric("aiperf:Time to First Token (ms):avg") == 49.87 - - # summary metrics (second CSV section — value lands in "avg" column by position) assert strategy.get_metric("aiperf:Output Token Throughput (tokens/sec):avg") == 595.68 assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47 @@ -146,8 +166,7 @@ def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: T assert strategy.get_metric("nonexistent-metric") == METRIC_ERROR - # Empty the aiperf CSV to test error handling for the default path - (ai_dynamo_tr.output_path / "aiperf_report.csv").write_text("") + (ai_dynamo_tr.output_path / "genai_perf_report.csv").write_text("") assert strategy.get_metric("Inter Token Latency (ms)") == METRIC_ERROR From 4e49abb9e321e66607d277a2129134e21b53242d Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Thu, 21 May 2026 16:55:07 -0700 Subject: [PATCH 6/6] docs(ai_dynamo): update doc for aiperf default, genai-perf opt-in, and sglang - Replace genai-perf references with aiperf as the default benchmark tool - Add "Choosing a Benchmark Tool" section explaining the workloads field and how to switch to genai_perf.sh with a TOML snippet - Update result CSV example to include TTFT, TTST, Request Latency, and throughput metrics from an actual GB200 run - Add "Supported Backends" section listing vLLM and sglang --- doc/workloads/ai_dynamo.rst | 75 ++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst index 24d1cd310..023d92bf2 100644 --- a/doc/workloads/ai_dynamo.rst +++ b/doc/workloads/ai_dynamo.rst @@ -47,7 +47,7 @@ Node Configuration for AI Dynamo AI Dynamo jobs use three distinct types of nodes: -- **Frontend node**: Hosts the coordination services (`etcd`, 
`nats`), the **frontend server**, the **request generator** (`genai-perf`), and the first decode worker +- **Frontend node**: Hosts the coordination services (`etcd`, `nats`), the **frontend server**, the **request generator** (`aiperf` by default, configurable via ``workloads`` in the test TOML), and the first decode worker - **Prefill node(s)**: Handle the prefill stage of inference - **Decode node(s)**: Handle the decode stage of inference (optional, depending on model and setup) @@ -82,32 +82,71 @@ The job progress monitoring can be done using either of the following options: watch tail -n 4 ./results//*.txt -The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch ``genai-perf``, which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``. +The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch the configured benchmark tool (``aiperf`` by default), which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``. -Review genai-perf Benchmark Results ------------------------------------ +Choosing a Benchmark Tool +~~~~~~~~~~~~~~~~~~~~~~~~~ -After job completion, CloudAI will place the output logs and result files in the designated results directory. To analyze performance metrics and validate inference outcomes: +The benchmark tool is controlled by the ``workloads`` field in the test TOML. The default is ``aiperf.sh``: -- Navigate to the results directory (e.g., ``./results/...``) -- Most importantly, open the ``profile_genai_perf.csv`` file to examine the final benchmarking results +.. code-block:: toml -This CSV file includes detailed metrics collected by genai-perf, such as request latency, throughput, and system utilization statistics. 
Use this data to evaluate the model's performance and identify potential bottlenecks or optimization opportunities. + [cmd_args] + workloads = "aiperf.sh" # default — uses aiperf, writes aiperf_report.csv + +To use genai-perf instead, set: + +.. code-block:: toml + + [cmd_args] + workloads = "genai_perf.sh" # uses genai-perf, writes genai_perf_report.csv + + [cmd_args.genai_perf] + cmd = "genai-perf profile" + extra-args = "--streaming --verbose -- -v --async" + + [cmd_args.genai_perf.args] + endpoint-type = "chat" + output-tokens-mean = 500 + request-count = 50 + +Review Benchmark Results +------------------------ + +After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field: + +- ``aiperf.sh`` (default) → ``aiperf_report.csv`` +- ``genai_perf.sh`` → ``genai_perf_report.csv`` + +Navigate to ``./results/<scenario>/<test_id>/0/`` and open the CSV to examine performance metrics. + +Example ``aiperf_report.csv`` (default): :: - Metric,avg,min,max,p99,p95,p90,p75,p50,p25 - Time To First Token (ms),"1,146.31",249.48,"3,485.23","3,457.97","3,349.56","3,215.06","1,330.93",640.07,286.52 - Time To Second Token (ms),26.05,0.00,133.51,96.12,36.56,34.88,34.35,33.55,1.78 - Request Latency (ms),"6,406.20","5,371.47","9,608.72","9,436.13","9,046.58","9,028.16","6,549.60","5,690.23","5,493.63" - Inter Token Latency (ms),30.35,27.59,35.60,35.23,33.88,32.53,31.05,30.13,29.04 - Output Sequence Length (tokens),174.45,164.00,187.00,186.22,183.10,180.10,177.00,174.00,171.75 - Input Sequence Length (tokens),"3,000.05","2,999.00","3,001.00","3,001.00","3,001.00","3,000.00","3,000.00","3,000.00","3,000.00" + Metric,avg,min,max,p25,p50,p75,p99,std + Inter Token Latency (ms),2.81,2.66,2.88,2.79,2.83,2.84,2.87,0.04 + Time to First Token (ms),49.87,17.15,99.91,49.35,49.87,50.52,92.31,9.20 + Time to Second Token (ms),0.50,0.03,4.05,0.03,0.04,0.04,3.47,1.08 + Request Latency
(ms),1652.30,1203.61,6433.87,1453.19,1462.99,1466.72,6431.16,976.18 + Output Sequence Length (tokens),498.06,410.00,501.00,500.00,500.00,500.00,501.00,12.62 + Input Sequence Length (tokens),300.00,300.00,300.00,300.00,300.00,300.00,300.00,0.00 Metric,Value - Output Token Throughput (per sec),261.25 - Request Throughput (per sec),1.50 - Request Count (count),40.00 + Output Token Throughput (tokens/sec),598.78 + Total Token Throughput (tokens/sec),962.32 + Request Throughput (requests/sec),1.20 + Request Count,50.00 + +Supported Backends +------------------ + +The following backends are available via the ``conf/experimental/ai_dynamo/test/`` directory: + +- **vLLM** (``vllm.toml``) — use with ``test_scenario/vllm_slurm.toml`` +- **sglang** (``sglang.toml``) — use with ``test_scenario/sglang_slurm.toml`` + +Both backends use ``aiperf`` as the default benchmark tool and support disaggregated prefill/decode. API Documentation