From 8ffdcf8e995238225a49d859314bd931b026f7ee Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Wed, 20 May 2026 17:23:34 -0700
Subject: [PATCH 1/6] feat(ai_dynamo): add aiperf workload support

Add AIPerf as a new workload type alongside GenAIPerf in the AIDynamo
workload. Includes the aiperf.sh wrapper script, the AIPerf Pydantic model,
argument serialization in the Slurm command gen strategy, experimental test
and test-scenario configs for single-node disaggregated runs, and the
updated ai-dynamo.sbatch reference data.
---
 conf/experimental/ai_dynamo/test/aiperf.toml  |  87 +++++++++++
 .../aiperf_slurm_single_node.toml             |  37 +++++
 .../test_scenario/vllm_slurm_single_node.toml |  37 +++++
 src/cloudai/workloads/ai_dynamo/__init__.py   |   2 +
 src/cloudai/workloads/ai_dynamo/ai_dynamo.py  |  24 ++-
 src/cloudai/workloads/ai_dynamo/ai_dynamo.sh  |  16 ++
 src/cloudai/workloads/ai_dynamo/aiperf.sh     | 143 ++++++++++++++++++
 .../ai_dynamo/slurm_command_gen_strategy.py   |   1 +
 tests/ref_data/ai-dynamo.sbatch               |   6 +-
 9 files changed, 351 insertions(+), 2 deletions(-)
 create mode 100644 conf/experimental/ai_dynamo/test/aiperf.toml
 create mode 100644 conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml
 create mode 100644 conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml
 create mode 100644 src/cloudai/workloads/ai_dynamo/aiperf.sh

diff --git a/conf/experimental/ai_dynamo/test/aiperf.toml b/conf/experimental/ai_dynamo/test/aiperf.toml
new file mode 100644
index 000000000..7fb015b86
--- /dev/null
+++ b/conf/experimental/ai_dynamo/test/aiperf.toml
@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "AIPerf-Qwen3-0.6B"
+description = "aiperf benchmark"
+test_template_name = "AIDynamo"
+extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
+
+[cmd_args]
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
+workloads = "aiperf.sh"
+
+  [cmd_args.dynamo]
+  backend = "vllm"
+  model = "Qwen/Qwen3-0.6B"
+
+    [cmd_args.dynamo.prefill_worker]
+    num-nodes = 1
+    cmd = 'python3 -m dynamo.vllm --is-prefill-worker'
+    worker-initialized-regex = 'VllmWorker.*has.been.initialized'
+    extra-args = "--no-enable-expert-parallel"
+
+      [cmd_args.dynamo.prefill_worker.args]
+      gpu-memory-utilization = 0.8
+      tensor-parallel-size = 8
+      pipeline-parallel-size = 1
+      data-parallel-size = 1
+
+    [cmd_args.dynamo.decode_worker]
+    num-nodes = 1
+    cmd = 'python3 -m dynamo.vllm'
+    worker-initialized-regex = 'VllmWorker.*has.been.initialized'
+    extra-args = "--no-enable-expert-parallel"
+
+      [cmd_args.dynamo.decode_worker.args]
+      gpu-memory-utilization = 0.8
+      tensor-parallel-size = 8
+      pipeline-parallel-size = 1
+      data-parallel-size = 1
+
+  [cmd_args.lmcache]
+  controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"
+
+    [cmd_args.lmcache.args]
+    chunk_size = 256
+    local_cpu = false
+    nixl_buffer_size = 10737418240
+    nixl_buffer_device = "cuda"
+    extra_config_enable_nixl_storage = true
+    extra_config_nixl_backend = "GDS_MT"
+    extra_config_nixl_file_pool_size = 64
+
+    enable_controller = true
+    lmcache_instance_id = "lmcache_default_instance"
+    controller_url = "localhost:9001"
+    lmcache_worker_port = 8788
+    distributed_url = "localhost:8789"
+
+  [cmd_args.aiperf]
+  cmd = "aiperf profile"
+
+    [cmd_args.aiperf.args]
+    concurrency = 2
+    request-count = 50
+    synthetic-input-tokens-mean = 300
+    output-tokens-mean = 500
+
+[extra_env_vars]
+UCX_LOG_LEVEL = "warn"
+HF_HUB_OFFLINE = "1"
+TRANSFORMERS_OFFLINE = "1"
+HF_DATASETS_OFFLINE = "1"
+DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
+UCX_TLS = "all"
diff --git a/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml
new file mode 100644
index 000000000..fcc0a0afc
--- /dev/null
+++ b/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "dynamo-aiperf-slurm-single-node"
+job_status_check = false
+
+[[Tests]]
+id = "test.disagg.single-node"
+test_name = "AIPerf-Qwen3-0.6B"
+time_limit = "00:15:00"
+
+  [Tests.cmd_args]
+
+    [Tests.cmd_args.dynamo.prefill_worker]
+    num-nodes = 1
+      [Tests.cmd_args.dynamo.prefill_worker.args]
+      tensor-parallel-size = 4
+      pipeline-parallel-size = 1
+
+    [Tests.cmd_args.dynamo.decode_worker]
+    num-nodes = 1
+      [Tests.cmd_args.dynamo.decode_worker.args]
+      tensor-parallel-size = 4
+      pipeline-parallel-size = 1
diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml
new file mode 100644
index 000000000..143784f2c
--- /dev/null
+++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "dynamo-vllm-slurm-single-node"
+job_status_check = false
+
+[[Tests]]
+id = "test.disagg.single-node"
+test_name = "vLLM"
+time_limit = "00:10:00"
+
+  [Tests.cmd_args]
+
+    [Tests.cmd_args.dynamo.prefill_worker]
+    num-nodes = 1
+      [Tests.cmd_args.dynamo.prefill_worker.args]
+      tensor-parallel-size = 4
+      pipeline-parallel-size = 1
+
+    [Tests.cmd_args.dynamo.decode_worker]
+    num-nodes = 1
+      [Tests.cmd_args.dynamo.decode_worker.args]
+      tensor-parallel-size = 4
+      pipeline-parallel-size = 1
diff --git a/src/cloudai/workloads/ai_dynamo/__init__.py b/src/cloudai/workloads/ai_dynamo/__init__.py
index b5f030eeb..1360ce10d 100644
--- a/src/cloudai/workloads/ai_dynamo/__init__.py
+++ b/src/cloudai/workloads/ai_dynamo/__init__.py
@@ -18,6 +18,7 @@
     AIDynamoArgs,
     AIDynamoCmdArgs,
     AIDynamoTestDefinition,
+    AIPerf,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -35,6 +36,7 @@
     "AIDynamoReportGenerationStrategy",
     "AIDynamoSlurmCommandGenStrategy",
     "AIDynamoTestDefinition",
+    "AIPerf",
     "GenAIPerf",
     "LMCache",
     "LMCacheArgs",
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
index 55dc1f1b3..01912f0c1 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py
@@ -282,6 +282,25 @@ def installables(self) -> list[Installable]:
         return [self.script]
 
 
+class AIPerf(Workload):
+    """Workload configuration for aiperf benchmarking."""
+
+    model_config = ConfigDict(extra="allow")
+
+    name: str = "aiperf"
+    cmd: str = "aiperf profile"
+    script: File = File(Path(__file__).parent.parent / "ai_dynamo/aiperf.sh")
+    report_name: str = Field(
+        default="aiperf_report.csv",
+        serialization_alias="report-name",
+        validation_alias=AliasChoices("report-name", "report_name"),
+    )
+
+    @property
+    def installables(self) -> list[Installable]:
+        return [self.script]
+
+
 class Constraints(BaseModel):
     """Constraints for validation of AI Dynamo configurations when using DSE."""
 
@@ -301,12 +320,13 @@ class AIDynamoCmdArgs(CmdArgs):
     dynamo: AIDynamoArgs
     lmcache: LMCache = Field(default_factory=LMCache)
     genai_perf: GenAIPerf = Field(default_factory=GenAIPerf)
+    aiperf: AIPerf = Field(default_factory=AIPerf)
     workloads: str = "genai_perf.sh"
 
     @field_validator("workloads", mode="before")
     @classmethod
     def validate_workloads(cls, v: str) -> str:
-        allowed_workloads = ["genai_perf.sh"]
+        allowed_workloads = ["genai_perf.sh", "aiperf.sh"]
         values = [w.strip() for w in v.split(",")]
         for workload in values:
             if workload not in allowed_workloads:
@@ -322,6 +342,7 @@ def installables(self) -> list[Installable]:
         return [
             *self.lmcache.installables,
             *self.genai_perf.installables,
+            *self.aiperf.installables,
         ]
 
 
@@ -356,6 +377,7 @@ def get_workload_map(self) -> dict[str, Workload]:
         """Get a map of workload scripts to workload objects."""
         return {
             self.cmd_args.genai_perf.script.src.name: self.cmd_args.genai_perf,
+            self.cmd_args.aiperf.script.src.name: self.cmd_args.aiperf,
         }
 
     @property
diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
index fd391fa5f..46f5daa42 100644
--- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
+++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh
@@ -35,6 +35,8 @@ declare -A lmcache_args
 declare -A lmcache_config
 declare -A genai_perf_args
 declare -A genai_perf_config
+declare -A aiperf_args
+declare -A aiperf_config
 
 declare -A dynamo_args
 dynamo_args["backend"]="vllm"
@@ -163,6 +165,10 @@ _parse_cli_pairs() {
         genai_perf_args["--${key#--genai_perf-args-}"]="$2" ;;
       --genai_perf-*)
         genai_perf_config["--${key#--genai_perf-}"]="$2" ;;
+      --aiperf-args-*)
+        aiperf_args["--${key#--aiperf-args-}"]="$2" ;;
+      --aiperf-*)
+        aiperf_config["--${key#--aiperf-}"]="$2" ;;
       --hf-home)
         HUGGINGFACE_HOME="$2" ;;
       --storage-cache-dir)
@@ -353,6 +359,8 @@ _dump_args() {
   log "LMCache args:\n$(arg_array_to_string lmcache_args)"
   log "GenAI config params:\n$(arg_array_to_string genai_perf_config)"
   log "GenAI-Perf args:\n$(arg_array_to_string genai_perf_args)"
+  log "AIPerf config params:\n$(arg_array_to_string aiperf_config)"
+  log "AIPerf args:\n$(arg_array_to_string aiperf_args)"
   log "--------------------------------"
 }
 
@@ -505,6 +513,10 @@ _is_genai_perf_workload() {
   [[ "${dynamo_args["workloads"]}" == *"genai_perf.sh"* ]]
 }
 
+_is_aiperf_workload() {
+  [[ "${dynamo_args["workloads"]}" == *"aiperf.sh"* ]]
+}
+
 _init_runtime_env() {
   if _is_vllm || _is_sglang; then
     export HF_HOME="${HUGGINGFACE_HOME}"
@@ -1026,6 +1038,10 @@ function launch_workloads()
     launch_workload genai_perf_config genai_perf_args
   fi
 
+  if _is_aiperf_workload; then
+    launch_workload aiperf_config aiperf_args
+  fi
+
   mark_done
 }
 
diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh
new file mode 100644
index 000000000..80baa0fd8
--- /dev/null
+++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# aiperf.sh — aiperf profile wrapper for ai_dynamo workloads.
+#
+# Called from ai_dynamo.sh's launch_workload() with:
+#   bash aiperf.sh --result-dir <dir> --model <model> --url <url> --port <port>
+#                  [--cmd <cmd>] [--report-name <name>] [--extra-args <args>]
+#                  -- <aiperf-args>...
+#
+# Context flags (before --) that are recognised and used:
+#   --result-dir    Directory where artifacts and the final report are written.
+#   --model         HuggingFace model identifier (e.g. Qwen/Qwen3-0.6B).
+#   --url           Base URL of the dynamo.frontend (e.g. http://node01).
+#   --port          HTTP port the dynamo.frontend is listening on.
+#   --report-name   Output CSV name (default: aiperf_report.csv).
+#   --cmd           Full launch command including subcommand (default: "aiperf profile").
+#   --extra-args    Raw string appended verbatim after all other flags.
+#
+# All unrecognised flags (--install-dir, --gpus-per-node, etc.) are silently
+# consumed so this script is forward-compatible with launch_workload additions.
+#
+# Everything after -- is passed directly to the aiperf profile invocation.
+
+set -Eeuo pipefail
+
+result_dir=""
+model=""
+url="http://localhost"
+port=8000
+report_name="aiperf_report.csv"
+cmd="aiperf profile"
+extra_args=""
+declare -a aiperf_profile_args=()
+
+log() {
+  echo "[$(date '+%F %T') $(hostname)]: $*"
+}
+
+_parse_aiperf_args() {
+  while [[ $# -ge 2 ]]; do
+    case "$1" in
+      --*) aiperf_profile_args+=("$1" "$2"); shift 2 ;;
+      *)   shift ;;
+    esac
+  done
+  # Capture a trailing lone boolean flag if present.
+  # Use if/fi — not [[ ]] && — so set -e does not trigger on a false condition.
+  if [[ $# -eq 1 && "$1" == --* ]]; then
+    aiperf_profile_args+=("$1")
+  fi
+}
+
+process_args() {
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --result-dir)   result_dir="$2";  shift 2 ;;
+      --model)        model="$2";       shift 2 ;;
+      --url)          url="$2";         shift 2 ;;
+      --port)         port="$2";        shift 2 ;;
+      --report-name)  report_name="$2"; shift 2 ;;
+      --cmd)          cmd="$2";         shift 2 ;;
+      --extra-args)   extra_args="$2";  shift 2 ;;
+      --)             shift; _parse_aiperf_args "$@"; break ;;
+      --*)            shift 2 ;;  # consume unknown flag + its value
+      *)              shift ;;
+    esac
+  done
+
+  log "Parsed args:
+    result_dir:   $result_dir
+    model:        $model
+    url:          $url
+    port:         $port
+    report_name:  $report_name
+    cmd:          $cmd
+    extra_args:   $extra_args
+    profile_args: ${aiperf_profile_args[*]:-}"
+}
+
+process_results() {
+  local artifact_dir="$result_dir/aiperf_artifacts"
+  local csv_path
+  csv_path=$(find "$artifact_dir" -name "*.csv" -print -quit 2>/dev/null || true)
+  if [[ -n "$csv_path" ]]; then
+    cp "$csv_path" "$result_dir/$report_name"
+    log "aiperf report saved to $result_dir/$report_name"
+  else
+    log "ERROR: no CSV found in $artifact_dir — aiperf may not have completed"
+    exit 1
+  fi
+}
+
+main() {
+  process_args "$@"
+
+  if [[ -z "$result_dir" ]]; then
+    log "ERROR: --result-dir is required"; exit 1
+  fi
+  if [[ -z "$model" ]]; then
+    log "ERROR: --model is required"; exit 1
+  fi
+
+  local full_url="${url}:${port}"
+  local artifact_dir="$result_dir/aiperf_artifacts"
+  rm -rf "$artifact_dir"
+
+  # Split cmd into an array (e.g. "aiperf profile" → ["aiperf", "profile"])
+  local -a run_cmd=()
+  read -ra run_cmd <<< "$cmd"
+
+  log "Launching aiperf: ${run_cmd[*]} --model $model --url $full_url"
+
+  "${run_cmd[@]}" \
+    --model         "$model" \
+    --url           "$full_url" \
+    --endpoint-type chat \
+    --streaming \
+    --artifact-dir  "$artifact_dir" \
+    --no-server-metrics \
+    "${aiperf_profile_args[@]}" \
+    ${extra_args}
+
+  log "aiperf run complete"
+  process_results
+}
+
+main "$@"
+exit 0
diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
index 4fbee5c7d..17079875c 100644
--- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py
@@ -117,6 +117,7 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]:
 
         args.extend(self._get_nested_toml_args(td.cmd_args.lmcache, "--lmcache-"))
         args.extend(self._get_nested_toml_args(td.cmd_args.genai_perf, "--genai_perf-"))
+        args.extend(self._get_nested_toml_args(td.cmd_args.aiperf, "--aiperf-"))
 
         return args
 
diff --git a/tests/ref_data/ai-dynamo.sbatch b/tests/ref_data/ai-dynamo.sbatch
index f9b5150c5..492e3c427 100644
--- a/tests/ref_data/ai-dynamo.sbatch
+++ b/tests/ref_data/ai-dynamo.sbatch
@@ -85,4 +85,8 @@ srun \
   --genai_perf-random-seed "42" \
   --genai_perf-request-count "100" \
   --genai_perf-synthetic-input-tokens-mean "550" \
-  --genai_perf-warmup-request-count "10"
\ No newline at end of file
+  --genai_perf-warmup-request-count "10" \
+  --aiperf-name "aiperf" \
+  --aiperf-cmd "aiperf profile" \
+  --aiperf-script "/cloudai_install/aiperf.sh" \
+  --aiperf-report-name "aiperf_report.csv"
\ No newline at end of file

From 8a801b1c8ea4e8cd77b5e43321506fd46c7ec931 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Wed, 20 May 2026 18:34:55 -0700
Subject: [PATCH 2/6] refactor(ai_dynamo): fold aiperf into vllm test, fix
 aiperf.sh arg handling

- Move aiperf config into vllm.toml (workloads = "aiperf.sh" + [cmd_args.aiperf])
  so the backend and benchmark live in one file; remove the standalone
  aiperf.toml and aiperf_slurm_single_node.toml
- Raise time_limit in vllm_slurm_single_node.toml from 10 to 15 minutes
- Fix aiperf.sh: split --extra-args into an array once (read -ra) and expand
  it quoted, so the value is no longer subject to glob expansion at the call
  site; for unknown --* flags, consume the following token only when it is
  non-empty and does not start with "-", instead of always shifting 2
---
 conf/experimental/ai_dynamo/test/aiperf.toml  | 87 -------------------
 conf/experimental/ai_dynamo/test/vllm.toml    | 10 ++-
 .../aiperf_slurm_single_node.toml             | 37 --------
 .../test_scenario/vllm_slurm_single_node.toml |  2 +-
 src/cloudai/workloads/ai_dynamo/aiperf.sh     | 10 +--
 5 files changed, 15 insertions(+), 131 deletions(-)
 delete mode 100644 conf/experimental/ai_dynamo/test/aiperf.toml
 delete mode 100644 conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml

diff --git a/conf/experimental/ai_dynamo/test/aiperf.toml b/conf/experimental/ai_dynamo/test/aiperf.toml
deleted file mode 100644
index 7fb015b86..000000000
--- a/conf/experimental/ai_dynamo/test/aiperf.toml
+++ /dev/null
@@ -1,87 +0,0 @@
-# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name = "AIPerf-Qwen3-0.6B"
-description = "aiperf benchmark"
-test_template_name = "AIDynamo"
-extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
-
-[cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
-workloads = "aiperf.sh"
-
-  [cmd_args.dynamo]
-  backend = "vllm"
-  model = "Qwen/Qwen3-0.6B"
-
-    [cmd_args.dynamo.prefill_worker]
-    num-nodes = 1
-    cmd = 'python3 -m dynamo.vllm --is-prefill-worker'
-    worker-initialized-regex = 'VllmWorker.*has.been.initialized'
-    extra-args = "--no-enable-expert-parallel"
-
-      [cmd_args.dynamo.prefill_worker.args]
-      gpu-memory-utilization = 0.8
-      tensor-parallel-size = 8
-      pipeline-parallel-size = 1
-      data-parallel-size = 1
-
-    [cmd_args.dynamo.decode_worker]
-    num-nodes = 1
-    cmd = 'python3 -m dynamo.vllm'
-    worker-initialized-regex = 'VllmWorker.*has.been.initialized'
-    extra-args = "--no-enable-expert-parallel"
-
-      [cmd_args.dynamo.decode_worker.args]
-      gpu-memory-utilization = 0.8
-      tensor-parallel-size = 8
-      pipeline-parallel-size = 1
-      data-parallel-size = 1
-
-  [cmd_args.lmcache]
-  controller_cmd = "lmcache_controller --host localhost --port 9000 --monitor-port 9001"
-
-    [cmd_args.lmcache.args]
-    chunk_size = 256
-    local_cpu = false
-    nixl_buffer_size = 10737418240
-    nixl_buffer_device = "cuda"
-    extra_config_enable_nixl_storage = true
-    extra_config_nixl_backend = "GDS_MT"
-    extra_config_nixl_file_pool_size = 64
-
-    enable_controller = true
-    lmcache_instance_id = "lmcache_default_instance"
-    controller_url = "localhost:9001"
-    lmcache_worker_port = 8788
-    distributed_url = "localhost:8789"
-
-  [cmd_args.aiperf]
-  cmd = "aiperf profile"
-
-    [cmd_args.aiperf.args]
-    concurrency = 2
-    request-count = 50
-    synthetic-input-tokens-mean = 300
-    output-tokens-mean = 500
-
-[extra_env_vars]
-UCX_LOG_LEVEL = "warn"
-HF_HUB_OFFLINE = "1"
-TRANSFORMERS_OFFLINE = "1"
-HF_DATASETS_OFFLINE = "1"
-DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
-UCX_TLS = "all"
diff --git a/conf/experimental/ai_dynamo/test/vllm.toml b/conf/experimental/ai_dynamo/test/vllm.toml
index f1249564f..c88ff3e10 100644
--- a/conf/experimental/ai_dynamo/test/vllm.toml
+++ b/conf/experimental/ai_dynamo/test/vllm.toml
@@ -21,7 +21,7 @@ extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
 
 [cmd_args]
 docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.8.1"
-workloads = "genai_perf.sh"
+workloads = "aiperf.sh"
 
   [cmd_args.dynamo]
   backend = "vllm"
@@ -85,6 +85,14 @@ workloads = "genai_perf.sh"
     warmup-request-count = 5
     concurrency = 2
 
+  [cmd_args.aiperf]
+
+    [cmd_args.aiperf.args]
+    concurrency = 2
+    request-count = 50
+    synthetic-input-tokens-mean = 300
+    output-tokens-mean = 500
+
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
 HF_HUB_OFFLINE = "1"
diff --git a/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml
deleted file mode 100644
index fcc0a0afc..000000000
--- a/conf/experimental/ai_dynamo/test_scenario/aiperf_slurm_single_node.toml
+++ /dev/null
@@ -1,37 +0,0 @@
-# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name = "dynamo-aiperf-slurm-single-node"
-job_status_check = false
-
-[[Tests]]
-id = "test.disagg.single-node"
-test_name = "AIPerf-Qwen3-0.6B"
-time_limit = "00:15:00"
-
-  [Tests.cmd_args]
-
-    [Tests.cmd_args.dynamo.prefill_worker]
-    num-nodes = 1
-      [Tests.cmd_args.dynamo.prefill_worker.args]
-      tensor-parallel-size = 4
-      pipeline-parallel-size = 1
-
-    [Tests.cmd_args.dynamo.decode_worker]
-    num-nodes = 1
-      [Tests.cmd_args.dynamo.decode_worker.args]
-      tensor-parallel-size = 4
-      pipeline-parallel-size = 1
diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml
index 143784f2c..76bef2482 100644
--- a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml
+++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml
@@ -20,7 +20,7 @@ job_status_check = false
 [[Tests]]
 id = "test.disagg.single-node"
 test_name = "vLLM"
-time_limit = "00:10:00"
+time_limit = "00:15:00"
 
   [Tests.cmd_args]
 
diff --git a/src/cloudai/workloads/ai_dynamo/aiperf.sh b/src/cloudai/workloads/ai_dynamo/aiperf.sh
index 80baa0fd8..9f5a78b33 100644
--- a/src/cloudai/workloads/ai_dynamo/aiperf.sh
+++ b/src/cloudai/workloads/ai_dynamo/aiperf.sh
@@ -44,7 +44,7 @@ url="http://localhost"
 port=8000
 report_name="aiperf_report.csv"
 cmd="aiperf profile"
-extra_args=""
+declare -a extra_args=()
 declare -a aiperf_profile_args=()
 
 log() {
@@ -74,9 +74,9 @@ process_args() {
       --port)         port="$2";        shift 2 ;;
       --report-name)  report_name="$2"; shift 2 ;;
       --cmd)          cmd="$2";         shift 2 ;;
-      --extra-args)   extra_args="$2";  shift 2 ;;
+      --extra-args)   read -ra extra_args <<< "$2"; shift 2 ;;
       --)             shift; _parse_aiperf_args "$@"; break ;;
-      --*)            shift 2 ;;  # consume unknown flag + its value
+      --*)            if [[ -n "${2:-}" && "${2}" != -* ]]; then shift 2; else shift 1; fi ;;  # consume unknown flag; shift 2 only if next arg is a value
       *)              shift ;;
     esac
   done
@@ -88,7 +88,7 @@ process_args() {
     port:         $port
     report_name:  $report_name
     cmd:          $cmd
-    extra_args:   $extra_args
+    extra_args:   ${extra_args[*]:-}
     profile_args: ${aiperf_profile_args[*]:-}"
 }
 
@@ -133,7 +133,7 @@ main() {
     --artifact-dir  "$artifact_dir" \
     --no-server-metrics \
     "${aiperf_profile_args[@]}" \
-    ${extra_args}
+    "${extra_args[@]}"
 
   log "aiperf run complete"
   process_results

From f124881b914a751d95b66c69d018974162241434 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Wed, 20 May 2026 18:49:07 -0700
Subject: [PATCH 3/6] fix(ai_dynamo): update vllm_slurm scenario test_name,
 remove redundant scenario file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix stale test_name reference in vllm_slurm.toml (vLLM-Qwen3-0.6B → vLLM)
and remove vllm_slurm_single_node.toml since vllm_slurm.toml already covers
the single-node case.
---
 .../ai_dynamo/test_scenario/vllm_slurm.toml   |  4 +-
 .../test_scenario/vllm_slurm_single_node.toml | 37 -------------------
 2 files changed, 2 insertions(+), 39 deletions(-)
 delete mode 100644 conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml

diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml
index c63c648fe..45031da3a 100644
--- a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml
+++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml
@@ -19,7 +19,7 @@ job_status_check = false
 
 [[Tests]]
 id = "test.disagg.single-node"
-test_name = "vLLM-Qwen3-0.6B"
+test_name = "vLLM"
 time_limit = "00:10:00"
 
   [Tests.cmd_args]
@@ -38,7 +38,7 @@ time_limit = "00:10:00"
 
 [[Tests]]
 id = "test.disagg.multinode"
-test_name = "vLLM-Qwen3-0.6B"
+test_name = "vLLM"
 time_limit = "00:10:00"
 
   [Tests.cmd_args]
diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml
deleted file mode 100644
index 76bef2482..000000000
--- a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm_single_node.toml
+++ /dev/null
@@ -1,37 +0,0 @@
-# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name = "dynamo-vllm-slurm-single-node"
-job_status_check = false
-
-[[Tests]]
-id = "test.disagg.single-node"
-test_name = "vLLM"
-time_limit = "00:15:00"
-
-  [Tests.cmd_args]
-
-    [Tests.cmd_args.dynamo.prefill_worker]
-    num-nodes = 1
-      [Tests.cmd_args.dynamo.prefill_worker.args]
-      tensor-parallel-size = 4
-      pipeline-parallel-size = 1
-
-    [Tests.cmd_args.dynamo.decode_worker]
-    num-nodes = 1
-      [Tests.cmd_args.dynamo.decode_worker.args]
-      tensor-parallel-size = 4
-      pipeline-parallel-size = 1

From a523413ee58b60fc9958a2a2ea92f0103e49d6b5 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Thu, 21 May 2026 13:47:02 -0700
Subject: [PATCH 4/6] fix(ai_dynamo): add aiperf to sglang test, fix report
 strategy default

- Add aiperf config to sglang.toml (workloads = "aiperf.sh") so both
  vLLM and sglang backends use aiperf as the default benchmark
- Fix sglang_slurm.toml: correct stale test_name and add job_status_check=false
- Change the AIDynamoReportGenerationStrategy default benchmark_name from
  "genai_perf" to "aiperf" to match the workload now used by the test
  configs; genai_perf metrics are still accessible via the explicit
  "genai_perf:metric_name:metric_type" format
- Update unit tests: fix existing genai_perf tests to use explicit prefix,
  add aiperf-specific tests covering per-request metrics, summary metrics,
  and default benchmark resolution
---
 conf/experimental/ai_dynamo/test/sglang.toml  | 10 +++-
 .../ai_dynamo/test_scenario/sglang_slurm.toml |  3 +-
 .../ai_dynamo/report_generation_strategy.py   |  2 +-
 .../ai_dynamo/test_report_gen_strategy.py     | 56 ++++++++++++++-----
 4 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml
index d1429fe11..0ebc3bd60 100644
--- a/conf/experimental/ai_dynamo/test/sglang.toml
+++ b/conf/experimental/ai_dynamo/test/sglang.toml
@@ -21,7 +21,7 @@ extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"]
 
 [cmd_args]
 docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
-workloads = "genai_perf.sh"
+workloads = "aiperf.sh"
 
   [cmd_args.dynamo]
   backend = "sglang"
@@ -93,6 +93,14 @@ workloads = "genai_perf.sh"
     warmup-request-count = 5
     concurrency = 2
 
+  [cmd_args.aiperf]
+
+    [cmd_args.aiperf.args]
+    concurrency = 2
+    request-count = 50
+    synthetic-input-tokens-mean = 300
+    output-tokens-mean = 500
+
 [extra_env_vars]
 UCX_LOG_LEVEL = "warn"
 HF_HUB_OFFLINE = "1"
diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
index d6b8eac1c..383557377 100644
--- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
+++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
@@ -15,10 +15,11 @@
 # limitations under the License.
 
 name = "dynamo_sglang"
+job_status_check = false
 
 [[Tests]]
 id = "sglang-Qwen3-0.6B"
-test_name = "sglang-Qwen3-0.6B"
+test_name = "sglang"
 time_limit = "00:20:00"
 
   [Tests.cmd_args]
diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
index a2f243712..2ddd3bfa9 100644
--- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
@@ -44,7 +44,7 @@ def extract_metric_from_csv(self, csv_file: Path, metric_name: str, metric_type:
 
     def get_metric(self, metric: str) -> MetricValue:
         logging.info(f"Getting metric: {metric}")
-        benchmark_name = "genai_perf"
+        benchmark_name = "aiperf"
         metric_name = metric
         metric_type = "avg"
 
diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
index 2674f4a77..eb93b993a 100644
--- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py
+++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
@@ -47,6 +47,20 @@ def get_csv_content() -> str:
     )
 
 
+def get_aiperf_csv_content() -> str:
+    return (
+        "Metric,avg,min,max,p1,p5,p10,p25,p50,p75,p90,p95,p99,std\n"
+        "Inter Token Latency (ms),2.83,2.78,2.91,2.78,2.79,2.81,2.82,2.83,2.84,2.85,2.88,2.90,0.03\n"
+        "Time to First Token (ms),49.87,17.15,99.91,18.60,21.26,22.46,49.35,49.87,50.52,51.63,53.91,92.31,9.20\n"
+        "Output Sequence Length (tokens),498.06,410.00,501.00,450.67,499.00,500.00,500.00,500.00,500.00,500.00,500.00,501.00,12.62\n"
+        "\n"
+        "Metric,Value\n"
+        "Output Token Throughput (tokens/sec),595.68\n"
+        "Total Token Throughput (tokens/sec),954.47\n"
+        "Request Count,50.00\n"
+    )
+
+
 @pytest.fixture
 def ai_dynamo_tr(tmp_path: Path) -> TestRun:
     test = AIDynamoTestDefinition(
@@ -70,6 +84,7 @@ def ai_dynamo_tr(tmp_path: Path) -> TestRun:
 
     csv_content = get_csv_content()
     (tr.output_path / "genai_perf_report.csv").write_text(csv_content)
+    (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content())
     (tr.output_path / "profile_genai_perf.csv").write_text(csv_content)
     (tr.output_path / "profile_genai_perf.json").write_text("mock json content")
     (tr.output_path / test.success_marker).touch()
@@ -95,32 +110,45 @@ def test_ai_dynamo_generate_report(slurm_system: SlurmSystem, ai_dynamo_tr: Test
     assert True
 
 
-def test_ai_dynamo_get_metric_single_values(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
+def test_ai_dynamo_get_metric_genai_perf(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
     strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
 
-    # Test that metrics from the first CSV section work
-    assert strategy.get_metric("Output Sequence Length (tokens)") == 101.01
-    assert strategy.get_metric("Input Sequence Length (tokens)") == 123.45
+    # genai_perf metrics require explicit benchmark prefix since default is now aiperf
+    assert strategy.get_metric("genai_perf:Output Sequence Length (tokens):avg") == 101.01
+    assert strategy.get_metric("genai_perf:Inter Token Latency (ms):avg") == 12.34
+    assert strategy.get_metric("genai_perf:Time To First Token (ms):avg") == 111.12
+    assert strategy.get_metric("genai_perf:Inter Token Latency (ms):p50") == 89.01
 
 
-def test_ai_dynamo_get_metric_statistical_values(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
+def test_ai_dynamo_get_metric_aiperf_defaults(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
     strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
 
-    # Use exact metric names from CSV (with avg column, which is default)
-    assert strategy.get_metric("Time To First Token (ms)") == 111.12
-    assert strategy.get_metric("Time To Second Token (ms)") == 11.13
-    assert strategy.get_metric("Request Latency (ms)") == 1111.14
-    assert strategy.get_metric("Inter Token Latency (ms)") == 12.34
+    # bare metric names default to aiperf_report.csv (avg column)
+    assert strategy.get_metric("Inter Token Latency (ms)") == 2.83
+    assert strategy.get_metric("Output Token Throughput (tokens/sec)") == 595.68
+
+
+def test_ai_dynamo_get_metric_aiperf_explicit(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
+    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
+
+    # per-request metrics (first CSV section with avg/p50 columns)
+    assert strategy.get_metric("aiperf:Inter Token Latency (ms):avg") == 2.83
+    assert strategy.get_metric("aiperf:Inter Token Latency (ms):p50") == 2.83
+    assert strategy.get_metric("aiperf:Time to First Token (ms):avg") == 49.87
+
+    # summary metrics (second CSV section — value lands in "avg" column by position)
+    assert strategy.get_metric("aiperf:Output Token Throughput (tokens/sec):avg") == 595.68
+    assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47
 
 
 def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
     strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
 
-    assert strategy.get_metric("invalid-metric") == METRIC_ERROR
+    assert strategy.get_metric("nonexistent-metric") == METRIC_ERROR
 
-    # Empty the CSV file to test error handling
-    (ai_dynamo_tr.output_path / "genai_perf_report.csv").write_text("")
-    assert strategy.get_metric("invalid-metric") == METRIC_ERROR
+    # Empty the aiperf CSV to test error handling for the default path
+    (ai_dynamo_tr.output_path / "aiperf_report.csv").write_text("")
+    assert strategy.get_metric("Inter Token Latency (ms)") == METRIC_ERROR
 
 
 def test_was_run_successful(ai_dynamo_tr: TestRun) -> None:

From d5d556ebf933aa96f1c293729758375fe9dd2c5e Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Thu, 21 May 2026 15:30:54 -0700
Subject: [PATCH 5/6] fix(ai_dynamo): derive benchmark name from workload
 config in report strategy

- Replace hardcoded benchmark_name default in get_metric() with dynamic
  derivation from cmd_args.workloads_list (e.g. "aiperf.sh" -> "aiperf");
  explicit benchmark:metric:type format still takes precedence
- Drop job_status_check = false from sglang_slurm.toml (added in the
  previous patch)
- Update unit tests: add ai_dynamo_aiperf_tr fixture, separate genai_perf
  and aiperf metric tests, fix E501 in aiperf CSV fixture data
---
 .../ai_dynamo/test_scenario/sglang_slurm.toml |  1 -
 .../ai_dynamo/report_generation_strategy.py   |  5 +-
 .../ai_dynamo/test_report_gen_strategy.py     | 63 ++++++++++++-------
 3 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
index 383557377..26ed91285 100644
--- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
+++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml
@@ -15,7 +15,6 @@
 # limitations under the License.
 
 name = "dynamo_sglang"
-job_status_check = false
 
 [[Tests]]
 id = "sglang-Qwen3-0.6B"
diff --git a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
index 2ddd3bfa9..a8e4e91b8 100644
--- a/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
+++ b/src/cloudai/workloads/ai_dynamo/report_generation_strategy.py
@@ -44,7 +44,6 @@ def extract_metric_from_csv(self, csv_file: Path, metric_name: str, metric_type:
 
     def get_metric(self, metric: str) -> MetricValue:
         logging.info(f"Getting metric: {metric}")
-        benchmark_name = "aiperf"
         metric_name = metric
         metric_type = "avg"
 
@@ -54,6 +53,10 @@ def get_metric(self, metric: str) -> MetricValue:
                 logging.warning(f"Invalid metric format: {metric}. Expected 'benchmark:metric_name:metric_type'")
                 return METRIC_ERROR
             benchmark_name, metric_name, metric_type = parts
+        else:
+            # Derive from the configured workload script (e.g. "aiperf.sh" → "aiperf").
+            workloads_list = getattr(getattr(self.test_run.test, "cmd_args", None), "workloads_list", None)
+            benchmark_name = Path(workloads_list[0]).stem if workloads_list else "aiperf"
 
         source_csv = self.test_run.output_path / f"{benchmark_name}_report.csv"
         logging.info(f"CSV file: {source_csv}")
diff --git a/tests/workloads/ai_dynamo/test_report_gen_strategy.py b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
index eb93b993a..0e51c414f 100644
--- a/tests/workloads/ai_dynamo/test_report_gen_strategy.py
+++ b/tests/workloads/ai_dynamo/test_report_gen_strategy.py
@@ -25,6 +25,7 @@
     AIDynamoArgs,
     AIDynamoCmdArgs,
     AIDynamoTestDefinition,
+    AIPerf,
     GenAIPerf,
     LMCache,
     LMCacheArgs,
@@ -49,10 +50,10 @@ def get_csv_content() -> str:
 
 def get_aiperf_csv_content() -> str:
     return (
-        "Metric,avg,min,max,p1,p5,p10,p25,p50,p75,p90,p95,p99,std\n"
-        "Inter Token Latency (ms),2.83,2.78,2.91,2.78,2.79,2.81,2.82,2.83,2.84,2.85,2.88,2.90,0.03\n"
-        "Time to First Token (ms),49.87,17.15,99.91,18.60,21.26,22.46,49.35,49.87,50.52,51.63,53.91,92.31,9.20\n"
-        "Output Sequence Length (tokens),498.06,410.00,501.00,450.67,499.00,500.00,500.00,500.00,500.00,500.00,500.00,501.00,12.62\n"
+        "Metric,avg,min,max\n"
+        "Inter Token Latency (ms),2.83,2.78,2.91\n"
+        "Time to First Token (ms),49.87,17.15,99.91\n"
+        "Output Sequence Length (tokens),498.06,410.00,501.00\n"
         "\n"
         "Metric,Value\n"
         "Output Token Throughput (tokens/sec),595.68\n"
@@ -92,6 +93,32 @@ def ai_dynamo_tr(tmp_path: Path) -> TestRun:
     return tr
 
 
+@pytest.fixture
+def ai_dynamo_aiperf_tr(tmp_path: Path) -> TestRun:
+    test = AIDynamoTestDefinition(
+        name="ai_dynamo_aiperf",
+        description="desc",
+        test_template_name="t",
+        cmd_args=AIDynamoCmdArgs(
+            docker_image_url="http://url",
+            workloads="aiperf.sh",
+            dynamo=AIDynamoArgs(
+                prefill_worker=WorkerConfig(
+                    cmd="python3 -m dynamo.vllm --is-prefill-worker",
+                    worker_initialized_regex="VllmWorker.*has.been.initialized",
+                    args=WorkerBaseArgs(),
+                ),
+            ),
+            aiperf=AIPerf(),
+            lmcache=LMCache(args=LMCacheArgs()),
+        ),
+    )
+    tr = TestRun(name="ai_dynamo_aiperf", test=test, num_nodes=1, nodes=[], output_path=tmp_path)
+    (tr.output_path / "aiperf_report.csv").write_text(get_aiperf_csv_content())
+    (tr.output_path / test.success_marker).touch()
+    return tr
+
+
 @pytest.fixture
 def csv_content() -> str:
     return get_csv_content()
@@ -104,39 +131,32 @@ def test_ai_dynamo_can_handle_directory(slurm_system: SlurmSystem, ai_dynamo_tr:
 
 def test_ai_dynamo_generate_report(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun, csv_content: str) -> None:
     strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
-    # The new implementation does not generate a report file
     strategy.generate_report()
-    # Just verify the method runs without error
     assert True
 
 
 def test_ai_dynamo_get_metric_genai_perf(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
     strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
 
-    # genai_perf metrics require explicit benchmark prefix since default is now aiperf
-    assert strategy.get_metric("genai_perf:Output Sequence Length (tokens):avg") == 101.01
-    assert strategy.get_metric("genai_perf:Inter Token Latency (ms):avg") == 12.34
+    # Default fixture uses workloads="genai_perf.sh" — bare names resolve to genai_perf_report.csv.
+    assert strategy.get_metric("Inter Token Latency (ms)") == 12.34
+    assert strategy.get_metric("Output Sequence Length (tokens)") == 101.01
+
+    # Explicit prefix also works.
     assert strategy.get_metric("genai_perf:Time To First Token (ms):avg") == 111.12
     assert strategy.get_metric("genai_perf:Inter Token Latency (ms):p50") == 89.01
 
 
-def test_ai_dynamo_get_metric_aiperf_defaults(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
-    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
+def test_ai_dynamo_get_metric_aiperf(slurm_system: SlurmSystem, ai_dynamo_aiperf_tr: TestRun) -> None:
+    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_aiperf_tr)
 
-    # bare metric names default to aiperf_report.csv (avg column)
+    # aiperf fixture uses workloads="aiperf.sh" — bare names resolve to aiperf_report.csv.
     assert strategy.get_metric("Inter Token Latency (ms)") == 2.83
     assert strategy.get_metric("Output Token Throughput (tokens/sec)") == 595.68
 
-
-def test_ai_dynamo_get_metric_aiperf_explicit(slurm_system: SlurmSystem, ai_dynamo_tr: TestRun) -> None:
-    strategy = AIDynamoReportGenerationStrategy(slurm_system, ai_dynamo_tr)
-
-    # per-request metrics (first CSV section with avg/p50 columns)
+    # Explicit prefix.
     assert strategy.get_metric("aiperf:Inter Token Latency (ms):avg") == 2.83
-    assert strategy.get_metric("aiperf:Inter Token Latency (ms):p50") == 2.83
     assert strategy.get_metric("aiperf:Time to First Token (ms):avg") == 49.87
-
-    # summary metrics (second CSV section — value lands in "avg" column by position)
     assert strategy.get_metric("aiperf:Output Token Throughput (tokens/sec):avg") == 595.68
     assert strategy.get_metric("aiperf:Total Token Throughput (tokens/sec):avg") == 954.47
 
@@ -146,8 +166,7 @@ def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: T
 
     assert strategy.get_metric("nonexistent-metric") == METRIC_ERROR
 
-    # Empty the aiperf CSV to test error handling for the default path
-    (ai_dynamo_tr.output_path / "aiperf_report.csv").write_text("")
+    (ai_dynamo_tr.output_path / "genai_perf_report.csv").write_text("")
     assert strategy.get_metric("Inter Token Latency (ms)") == METRIC_ERROR
 
 

From 4e49abb9e321e66607d277a2129134e21b53242d Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Thu, 21 May 2026 16:55:07 -0700
Subject: [PATCH 6/6] docs(ai_dynamo): update doc for aiperf default,
 genai-perf opt-in, and sglang

- Replace genai-perf references with aiperf as the default benchmark tool
- Add "Choosing a Benchmark Tool" section explaining the workloads field
  and how to switch to genai_perf.sh with a TOML snippet
- Update result CSV example to include TTFT, TTST, Request Latency, and
  throughput metrics from an actual GB200 run
- Add "Supported Backends" section listing vLLM and sglang
---
 doc/workloads/ai_dynamo.rst | 75 ++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 18 deletions(-)

diff --git a/doc/workloads/ai_dynamo.rst b/doc/workloads/ai_dynamo.rst
index 24d1cd310..023d92bf2 100644
--- a/doc/workloads/ai_dynamo.rst
+++ b/doc/workloads/ai_dynamo.rst
@@ -47,7 +47,7 @@ Node Configuration for AI Dynamo
 
 AI Dynamo jobs use three distinct types of nodes:
 
-- **Frontend node**: Hosts the coordination services (`etcd`, `nats`), the **frontend server**, the **request generator** (`genai-perf`), and the first decode worker
+- **Frontend node**: Hosts the coordination services (``etcd``, ``nats``), the **frontend server**, the **request generator** (``aiperf`` by default, configurable via ``workloads`` in the test TOML), and the first decode worker
 - **Prefill node(s)**: Handle the prefill stage of inference
 - **Decode node(s)**: Handle the decode stage of inference (optional, depending on model and setup)
 
@@ -82,32 +82,71 @@ The job progress monitoring can be done using either of the following options:
 
    watch tail -n 4 ./results/<scenario name>/*.txt
 
-The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch ``genai-perf``, which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``.
+The frontend node will initially wait to allow weight loading on all nodes. Once ready, it will launch the configured benchmark tool (``aiperf`` by default), which begins generating requests to the frontend server. All servers cooperate to complete inference, and the output will appear in ``stdout.txt``.
 
-Review genai-perf Benchmark Results
------------------------------------
+Choosing a Benchmark Tool
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
-After job completion, CloudAI will place the output logs and result files in the designated results directory. To analyze performance metrics and validate inference outcomes:
+The benchmark tool is controlled by the ``workloads`` field in the test TOML. The default is ``aiperf.sh``:
 
-- Navigate to the results directory (e.g., ``./results/...``)
-- Most importantly, open the ``profile_genai_perf.csv`` file to examine the final benchmarking results
+.. code-block:: toml
 
-This CSV file includes detailed metrics collected by genai-perf, such as request latency, throughput, and system utilization statistics. Use this data to evaluate the model's performance and identify potential bottlenecks or optimization opportunities.
+   [cmd_args]
+   workloads = "aiperf.sh"   # default — uses aiperf, writes aiperf_report.csv
+
+To use genai-perf instead, set:
+
+.. code-block:: toml
+
+   [cmd_args]
+   workloads = "genai_perf.sh"   # uses genai-perf, writes genai_perf_report.csv
+
+   [cmd_args.genai_perf]
+   cmd = "genai-perf profile"
+   extra-args = "--streaming --verbose -- -v --async"
+
+     [cmd_args.genai_perf.args]
+     endpoint-type = "chat"
+     output-tokens-mean = 500
+     request-count = 50
+
+Review Benchmark Results
+------------------------
+
+After job completion, CloudAI places output logs and result files in the designated results directory. The result file name depends on the configured ``workloads`` field:
+
+- ``aiperf.sh`` (default) → ``aiperf_report.csv``
+- ``genai_perf.sh`` → ``genai_perf_report.csv``
+
+Navigate to ``./results/<scenario>/<test-id>/0/`` and open the CSV to examine performance metrics.
+
+Example ``aiperf_report.csv`` (default):
 
 ::
 
-   Metric,avg,min,max,p99,p95,p90,p75,p50,p25
-   Time To First Token (ms),"1,146.31",249.48,"3,485.23","3,457.97","3,349.56","3,215.06","1,330.93",640.07,286.52
-   Time To Second Token (ms),26.05,0.00,133.51,96.12,36.56,34.88,34.35,33.55,1.78
-   Request Latency (ms),"6,406.20","5,371.47","9,608.72","9,436.13","9,046.58","9,028.16","6,549.60","5,690.23","5,493.63"
-   Inter Token Latency (ms),30.35,27.59,35.60,35.23,33.88,32.53,31.05,30.13,29.04
-   Output Sequence Length (tokens),174.45,164.00,187.00,186.22,183.10,180.10,177.00,174.00,171.75
-   Input Sequence Length (tokens),"3,000.05","2,999.00","3,001.00","3,001.00","3,001.00","3,000.00","3,000.00","3,000.00","3,000.00"
+   Metric,avg,min,max,p25,p50,p75,p99,std
+   Inter Token Latency (ms),2.81,2.66,2.88,2.79,2.83,2.84,2.87,0.04
+   Time to First Token (ms),49.87,17.15,99.91,49.35,49.87,50.52,92.31,9.20
+   Time to Second Token (ms),0.50,0.03,4.05,0.03,0.04,0.04,3.47,1.08
+   Request Latency (ms),1652.30,1203.61,6433.87,1453.19,1462.99,1466.72,6431.16,976.18
+   Output Sequence Length (tokens),498.06,410.00,501.00,500.00,500.00,500.00,501.00,12.62
+   Input Sequence Length (tokens),300.00,300.00,300.00,300.00,300.00,300.00,300.00,0.00
 
    Metric,Value
-   Output Token Throughput (per sec),261.25
-   Request Throughput (per sec),1.50
-   Request Count (count),40.00
+   Output Token Throughput (tokens/sec),598.78
+   Total Token Throughput (tokens/sec),962.32
+   Request Throughput (requests/sec),1.20
+   Request Count,50.00
+
+Supported Backends
+------------------
+
+The following backends are available via the ``conf/experimental/ai_dynamo/test/`` directory:
+
+- **vLLM** (``vllm.toml``) — use with ``test_scenario/vllm_slurm.toml``
+- **sglang** (``sglang.toml``) — use with ``test_scenario/sglang_slurm.toml``
+
+Both backends use ``aiperf`` as the default benchmark tool and support disaggregated prefill/decode.
 
 
 API Documentation