From 605903472fcc92c6c21ae586314397af8201deb0 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 10 Apr 2026 14:58:09 +0200 Subject: [PATCH 1/3] make ai_dynamo up-to-date with sglang --- conf/experimental/ai_dynamo/test/sglang.toml | 7 +- .../ai_dynamo/test_scenario/sglang_slurm.toml | 2 +- .../test_scenario/vllm_kvbm_slurm.toml | 2 +- .../ai_dynamo/test_scenario/vllm_slurm.toml | 4 +- src/cloudai/workloads/ai_dynamo/ai_dynamo.py | 10 +- src/cloudai/workloads/ai_dynamo/ai_dynamo.sh | 9 +- .../ai_dynamo/slurm_command_gen_strategy.py | 138 +++++++++++++++++- .../test_command_gen_strategy_slurm.py | 34 +++++ 8 files changed, 195 insertions(+), 11 deletions(-) diff --git a/conf/experimental/ai_dynamo/test/sglang.toml b/conf/experimental/ai_dynamo/test/sglang.toml index d1429fe11..ea59d07ae 100644 --- a/conf/experimental/ai_dynamo/test/sglang.toml +++ b/conf/experimental/ai_dynamo/test/sglang.toml @@ -20,7 +20,7 @@ test_template_name = "AIDynamo" extra_container_mounts = ["/run/udev:/run/udev", "/tmp:/tmp"] [cmd_args] -docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0" +docker_image_url = "nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.0.0" workloads = "genai_perf.sh" [cmd_args.dynamo] @@ -32,7 +32,7 @@ workloads = "genai_perf.sh" num-nodes = 1 cmd = 'python3 -m dynamo.sglang' extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics" - worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' + worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' [cmd_args.dynamo.prefill_worker.args] page-size = 16 @@ -48,7 +48,7 @@ workloads = "genai_perf.sh" num-nodes = 1 cmd = 'python3 -m dynamo.sglang' extra-args = "--trust-remote-code --skip-tokenizer-init --enable-metrics" - worker-initialized-regex = 'register._register_llm_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' + worker-initialized-regex = 'register._register_model_with_runtime_config:.Successfully.registered.LLM.with.runtime.config' [cmd_args.dynamo.decode_worker.args] page-size = 16 @@ -79,6 +79,7 @@ workloads = "genai_perf.sh" [cmd_args.genai_perf] cmd = "genai-perf profile" + client-docker-image-url = "nvcr.io/nvidia/tritonserver:25.03-py3-sdk" extra-args = "--streaming --verbose -- -v --async" [cmd_args.genai_perf.args] diff --git a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml index d6b8eac1c..26ed91285 100644 --- a/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/sglang_slurm.toml @@ -18,7 +18,7 @@ name = "dynamo_sglang" [[Tests]] id = "sglang-Qwen3-0.6B" -test_name = "sglang-Qwen3-0.6B" +test_name = "sglang" time_limit = "00:20:00" [Tests.cmd_args] diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_kvbm_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_kvbm_slurm.toml index 751c7d0e6..f705f4b66 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_kvbm_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_kvbm_slurm.toml @@ -17,7 +17,7 @@ name = "dynamo_vllm_kvbm" [[Tests]] -id = "vLLM-Qwen3-0.6B" +id = "vLLM" test_name = "vLLM" time_limit = "20:00:00" diff --git a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml index c63c648fe..45031da3a 100644 --- a/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml +++ b/conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml @@ -19,7 +19,7 @@ job_status_check = false [[Tests]] id = "test.disagg.single-node" -test_name = "vLLM-Qwen3-0.6B" +test_name = "vLLM" time_limit = "00:10:00" [Tests.cmd_args] @@ -38,7 +38,7 @@ time_limit = "00:10:00" [[Tests]] id = "test.disagg.multinode" -test_name = "vLLM-Qwen3-0.6B" +test_name = "vLLM" time_limit = "00:10:00" [Tests.cmd_args] diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py index 55dc1f1b3..518e488c9 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.py +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.py @@ -276,10 +276,18 @@ class GenAIPerf(Workload): name: str = "genai_perf" cmd: str = "genai-perf profile" script: File = File(Path(__file__).parent.parent / "ai_dynamo/genai_perf.sh") + client_docker_image_url: str | None = Field( + default=None, + serialization_alias="client-docker-image-url", + validation_alias=AliasChoices("client-docker-image-url", "client_docker_image_url"), + ) @property def installables(self) -> list[Installable]: - return [self.script] + result: list[Installable] = [self.script] + if self.client_docker_image_url: + result.append(DockerImage(url=self.client_docker_image_url)) + return result class Constraints(BaseModel): diff --git a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh index fd391fa5f..7e07c63a8 100644 --- a/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh +++ b/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh @@ -9,6 +9,7 @@ DONE_MARKER="./success-marker.txt" FATAL_ERROR_MARKER="./failure-marker.txt" NODE_ROLES_FILE="node_roles.log" TEST_USER="$USER" +WAIT_FOR_EXTERNAL_WORKLOAD="false" export DYN_SDK_DISABLE_ANSI_LOGGING=1 export VLLM_DISABLE_COLORED_OUTPUT=1 @@ -177,6 +178,8 @@ _parse_cli_pairs() { FATAL_ERROR_MARKER="$2" ;; --success-marker) DONE_MARKER="$2" ;; + --wait-for-external-workload) + WAIT_FOR_EXTERNAL_WORKLOAD="$2" ;; esac shift; shift; done @@ -1078,7 +1081,11 @@ function main() sleep 10 - launch_workloads & + if [[ "$WAIT_FOR_EXTERNAL_WORKLOAD" == "true" ]]; then + log "Waiting for external workload to complete" + else + launch_workloads & + fi fi wait_for_frontend_marker diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 4fbee5c7d..ba6e1f429 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -20,7 +20,7 @@ from pydantic import BaseModel, TypeAdapter, ValidationError -from cloudai.core import File, GitRepo +from cloudai.core import DockerImage, File, GitRepo from cloudai.systems.slurm import SlurmCommandGenStrategy from .ai_dynamo import AIDynamoTestDefinition @@ -84,7 +84,7 @@ def _get_nested_toml_args(self, base_model: BaseModel, prefix: str) -> List[str] return result - def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: + def _gen_script_args(self, td: AIDynamoTestDefinition, wait_for_external_workload: bool = False) -> List[str]: assert td.repo.installed_path args = [ "--user $USER", @@ -97,6 +97,9 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: f"--success-marker {self.CONTAINER_MOUNT_OUTPUT}/{td.success_marker}", ] + if wait_for_external_workload: + args.append("--wait-for-external-workload true") + if td.cmd_args.storage_cache_dir: args.append(f"--storage-cache-dir {td.cmd_args.storage_cache_dir}") @@ -120,6 +123,15 @@ def _gen_script_args(self, td: AIDynamoTestDefinition) -> List[str]: return args + def _client_image_path(self, td: AIDynamoTestDefinition) -> str | None: + client_image_url = td.cmd_args.genai_perf.client_docker_image_url + if not client_image_url: + return None + client_image = DockerImage(url=client_image_url) + if client_image.installed_path: + return str(client_image.installed_path) + return None + def _gen_srun_command(self) -> str: td = cast(AIDynamoTestDefinition, self.test_run.test) num_nodes, node_list = self.get_cached_nodes_spec() @@ -142,6 +154,128 @@ def _gen_srun_command(self) -> str: srun_cmd.extend(self._gen_script_args(td)) return " \\\n ".join(srun_cmd) + "\n" + def _gen_service_srun_command(self) -> str: + td = cast(AIDynamoTestDefinition, self.test_run.test) + num_nodes, node_list = self.get_cached_nodes_spec() + out_dir = str(self.test_run.output_path.absolute()) + + srun_cmd = self.gen_srun_prefix() + srun_cmd.extend( + [ + f"--nodes={num_nodes}", + *([] if not node_list else [f"--nodelist={','.join(node_list)}"]), + f"--ntasks={num_nodes}", + "--ntasks-per-node=1", + f"--output={out_dir}/node-%n-stdout.txt", + f"--error={out_dir}/node-%n-stderr.txt", + "bash", + f"{self.CONTAINER_MOUNT_INSTALL}/{td.script.src.name}", + ] + ) + srun_cmd.extend(self._gen_script_args(td, wait_for_external_workload=True)) + return " ".join(srun_cmd) + + def _client_container_mounts(self, td: AIDynamoTestDefinition) -> list[str]: + mounts = [ + f"{self.test_run.output_path.absolute()}:{self.CONTAINER_MOUNT_OUTPUT}", + f"{self.system.install_path.absolute()}:{self.CONTAINER_MOUNT_INSTALL}", + f"{self.test_run.output_path.absolute()}", + *td.extra_container_mounts, + f"{self.system.hf_home_path.absolute()}:{self.CONTAINER_MOUNT_HF_HOME}", + ] + + if td.cmd_args.storage_cache_dir: + mounts.append(f"{td.cmd_args.storage_cache_dir}:{td.cmd_args.storage_cache_dir}") + + return mounts + + def _gen_client_srun_prefix(self, image_path: str) -> list[str]: + srun_cmd = ["srun", "--export=ALL", f"--mpi={self.system.mpi}"] + mounts = self._client_container_mounts(cast(AIDynamoTestDefinition, self.test_run.test)) + srun_cmd.append(f"--container-image={image_path}") + srun_cmd.append(f"--container-mounts={','.join(mounts)}") + if not self.system.container_mount_home: + srun_cmd.append("--no-container-mount-home") + if self.system.extra_srun_args: + srun_cmd.append(self.system.extra_srun_args) + if self.test_run.extra_srun_args: + srun_cmd.append(self.test_run.extra_srun_args) + return srun_cmd + + def _gen_external_benchmark_command(self, image_path: str) -> str: + td = cast(AIDynamoTestDefinition, self.test_run.test) + _, node_list = self.get_cached_nodes_spec() + frontend_node = node_list[0] + out_dir = str(self.test_run.output_path.absolute()) + + srun_cmd = self._gen_client_srun_prefix(image_path) + srun_cmd.extend( + [ + "--nodes=1", + f"--nodelist={frontend_node}", + "--ntasks=1", + "--ntasks-per-node=1", + f"--output={out_dir}/genai-perf-stdout.txt", + f"--error={out_dir}/genai-perf-stderr.txt", + "bash", + f"{self.CONTAINER_MOUNT_INSTALL}/{td.cmd_args.genai_perf.script.src.name}", + f"--result-dir {self.CONTAINER_MOUNT_OUTPUT}", + f'--model "{td.cmd_args.dynamo.model}"', + f'--url "http://{frontend_node}"', + f'--port "{td.cmd_args.dynamo.port}"', + f'--endpoint "{td.cmd_args.dynamo.endpoint}"', + f'--gpus-per-node "{self.system.gpus_per_node or 1}"', + f'--report-name "{td.cmd_args.genai_perf.report_name}"', + f'--cmd "{td.cmd_args.genai_perf.cmd}"', + ] + ) + if td.cmd_args.genai_perf.extra_args: + extra_args = td.cmd_args.genai_perf.extra_args + if isinstance(extra_args, list): + extra_args = " ".join(str(arg) for arg in extra_args) + srun_cmd.append(f'--extra-args "{extra_args}"') + + srun_cmd.append("--") + srun_cmd.extend(self._get_toml_args(td.cmd_args.genai_perf.args, "--")) + + return " ".join(srun_cmd) + + def gen_exec_command(self) -> str: + td = cast(AIDynamoTestDefinition, self.test_run.test) + client_image_path = self._client_image_path(td) + if not client_image_path: + return super().gen_exec_command() + + _, node_list = self.get_cached_nodes_spec() + frontend_node = node_list[0] + service_cmd = self._gen_service_srun_command() + benchmark_cmd = self._gen_external_benchmark_command(client_image_path) + success_marker = f"{self.test_run.output_path.absolute()}/{td.success_marker}" + + full_command = "\n".join( + [ + f"{service_cmd} &", + "SERVICE_PID=$!", + "BENCH_RC=0", + "for _ in $(seq 1 120); do", + f' if curl -sf "http://{frontend_node}:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', + " break", + " fi", + " sleep 5", + "done", + f'if ! curl -sf "http://{frontend_node}:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', + " BENCH_RC=1", + "else", + f" {benchmark_cmd} || BENCH_RC=$?", + "fi", + f'touch "{success_marker}"', + "wait $SERVICE_PID || true", + "exit $BENCH_RC", + ] + ) + + return self._write_sbatch_script(full_command) + def _validate_worker_nodes( self, node_list: list[str], worker_nodes: str | None, num_nodes: int, worker_type: str ) -> None: diff --git a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py index a0a028caa..2138b9bef 100644 --- a/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py +++ b/tests/workloads/ai_dynamo/test_command_gen_strategy_slurm.py @@ -148,3 +148,37 @@ def test_dynamo_cmd( ) -> None: result = strategy.gen_dynamo_cmd(module, Path(config)) assert result.strip() == expected + + +def test_gen_exec_command_with_external_genai_perf_client( + slurm_system: SlurmSystem, tmp_path: Path, cmd_args: AIDynamoCmdArgs +) -> None: + dynamo_repo_path = tmp_path / "dynamo_repo" + dynamo_repo_path.mkdir() + + cmd_args.genai_perf.client_docker_image_url = "nvcr.io/nvidia/tritonserver:25.03-py3-sdk" + + tdef = AIDynamoTestDefinition( + name="test", + description="desc", + test_template_name="template", + cmd_args=cmd_args, + repo=GitRepo( + url="https://github.com/ai-dynamo/dynamo.git", + commit="f7e468c7e8ff0d1426db987564e60572167e8464", + installed_path=dynamo_repo_path, + ), + ) + + tr = TestRun(name="run", test=tdef, nodes=["n0", "n1"], num_nodes=2, output_path=tmp_path) + strategy = AIDynamoSlurmCommandGenStrategy(slurm_system, tr) + + command = strategy.gen_exec_command() + script_path = Path(command.removeprefix("sbatch ").strip()) + content = script_path.read_text() + + assert "--wait-for-external-workload true" in content + assert "genai-perf-stdout.txt" in content + assert "genai-perf-stderr.txt" in content + assert "tritonserver:25.03-py3-sdk" in content + assert "/v1/models" in content From 37db3070777e62d4ced0d72ba490d6f3323bb5a9 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 10 Apr 2026 15:05:00 +0200 Subject: [PATCH 2/3] fix nodes --- .../ai_dynamo/slurm_command_gen_strategy.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index ba6e1f429..5bba31683 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -202,10 +202,8 @@ def _gen_client_srun_prefix(self, image_path: str) -> list[str]: srun_cmd.append(self.test_run.extra_srun_args) return srun_cmd - def _gen_external_benchmark_command(self, image_path: str) -> str: + def _gen_external_benchmark_command(self, image_path: str, frontend_node: str) -> str: td = cast(AIDynamoTestDefinition, self.test_run.test) - _, node_list = self.get_cached_nodes_spec() - frontend_node = node_list[0] out_dir = str(self.test_run.output_path.absolute()) srun_cmd = self._gen_client_srun_prefix(image_path) @@ -246,24 +244,27 @@ def gen_exec_command(self) -> str: if not client_image_path: return super().gen_exec_command() - _, node_list = self.get_cached_nodes_spec() - frontend_node = node_list[0] service_cmd = self._gen_service_srun_command() - benchmark_cmd = self._gen_external_benchmark_command(client_image_path) + benchmark_cmd = self._gen_external_benchmark_command(client_image_path, "$FRONTEND_NODE") success_marker = f"{self.test_run.output_path.absolute()}/{td.success_marker}" full_command = "\n".join( [ + 'FRONTEND_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)', + 'if [ -z "$FRONTEND_NODE" ]; then', + ' echo "Failed to resolve frontend node from SLURM_JOB_NODELIST" >&2', + " exit 1", + "fi", f"{service_cmd} &", "SERVICE_PID=$!", "BENCH_RC=0", "for _ in $(seq 1 120); do", - f' if curl -sf "http://{frontend_node}:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', + f' if curl -sf "http://$FRONTEND_NODE:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', " break", " fi", " sleep 5", "done", - f'if ! curl -sf "http://{frontend_node}:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', + f'if ! curl -sf "http://$FRONTEND_NODE:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', " BENCH_RC=1", "else", f" {benchmark_cmd} || BENCH_RC=$?", From 8403ef9b68c3425e5a6a5bf1e68045d4fe6e103d Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 10 Apr 2026 15:45:01 +0200 Subject: [PATCH 3/3] fix fe node --- .../ai_dynamo/slurm_command_gen_strategy.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py index 5bba31683..a6c1c7497 100644 --- a/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py @@ -202,15 +202,16 @@ def _gen_client_srun_prefix(self, image_path: str) -> list[str]: srun_cmd.append(self.test_run.extra_srun_args) return srun_cmd - def _gen_external_benchmark_command(self, image_path: str, frontend_node: str) -> str: + def _gen_external_benchmark_command(self, image_path: str) -> str: td = cast(AIDynamoTestDefinition, self.test_run.test) out_dir = str(self.test_run.output_path.absolute()) srun_cmd = self._gen_client_srun_prefix(image_path) srun_cmd.extend( [ + "--overlap", "--nodes=1", - f"--nodelist={frontend_node}", + "--nodelist=$SLURM_JOB_MASTER_NODE", "--ntasks=1", "--ntasks-per-node=1", f"--output={out_dir}/genai-perf-stdout.txt", @@ -219,7 +220,7 @@ def _gen_external_benchmark_command(self, image_path: str, frontend_node: str) - f"{self.CONTAINER_MOUNT_INSTALL}/{td.cmd_args.genai_perf.script.src.name}", f"--result-dir {self.CONTAINER_MOUNT_OUTPUT}", f'--model "{td.cmd_args.dynamo.model}"', - f'--url "http://{frontend_node}"', + '--url "http://$SLURM_JOB_MASTER_NODE"', f'--port "{td.cmd_args.dynamo.port}"', f'--endpoint "{td.cmd_args.dynamo.endpoint}"', f'--gpus-per-node "{self.system.gpus_per_node or 1}"', @@ -245,26 +246,25 @@ def gen_exec_command(self) -> str: return super().gen_exec_command() service_cmd = self._gen_service_srun_command() - benchmark_cmd = self._gen_external_benchmark_command(client_image_path, "$FRONTEND_NODE") + benchmark_cmd = self._gen_external_benchmark_command(client_image_path) success_marker = f"{self.test_run.output_path.absolute()}/{td.success_marker}" full_command = "\n".join( [ - 'FRONTEND_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)', - 'if [ -z "$FRONTEND_NODE" ]; then', - ' echo "Failed to resolve frontend node from SLURM_JOB_NODELIST" >&2', + 'if [ -z "$SLURM_JOB_MASTER_NODE" ]; then', + ' echo "SLURM_JOB_MASTER_NODE is not set" >&2', " exit 1", "fi", f"{service_cmd} &", "SERVICE_PID=$!", "BENCH_RC=0", "for _ in $(seq 1 120); do", - f' if curl -sf "http://$FRONTEND_NODE:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', + f' if curl -sf "http://$SLURM_JOB_MASTER_NODE:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', # noqa: E501 " break", " fi", " sleep 5", "done", - f'if ! curl -sf "http://$FRONTEND_NODE:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', + f'if ! curl -sf "http://$SLURM_JOB_MASTER_NODE:{td.cmd_args.dynamo.port}/v1/models" >/dev/null 2>&1; then', # noqa: E501 " BENCH_RC=1", "else", f" {benchmark_cmd} || BENCH_RC=$?",