Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions conf/experimental/test/deepep_standard.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@ test_template_name = "DeepEP"

[cmd_args]
# Local .sqsh file:
# docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh"
# Container registry (uses your Docker credentials):
docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"

mode = "standard"

tokens = 1024
tokens = 4096
num_experts = 256
num_topk = 8
hidden_size = 7168
Expand Down
34 changes: 34 additions & 0 deletions conf/experimental/test/nccl_test_alltoallv.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test definition for an NCCL AlltoAllv run, built on the NcclTest template.
name = "nccl_test_alltoallv"
description = "NCCL AlltoAllv"
test_template_name = "NcclTest"

[cmd_args]
docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
# Container provides /opt/nccl-tests/build/alltoallv_perf.
# NOTE(review): the subtest selected below is `alltoallv_perf_mpi`, not
# `alltoallv_perf` — confirm which binary name the image actually ships.
subtest_name = "alltoallv_perf_mpi"
nthreads = 1
ngpus = 1
# minbytes equals maxbytes, so presumably a single 512M message size is run
# and stepfactor has no effect — TODO confirm.
minbytes = "512M"
maxbytes = "512M"
stepfactor = 2
iters = 10
warmup_iters = 1
check = 1
blocking = 0
# Presumably makes the run consume the traffic matrix written by a preceding
# DeepEP test — verify against the NcclTest workload implementation.
use_deepep_matrix = true
36 changes: 36 additions & 0 deletions conf/experimental/test_scenario/deepep_with_nccl_alltoallv.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "deepep-with-nccl-alltoallv"

# First run the DeepEP benchmark which generates the traffic matrix
# NOTE(review): `nodes` pins both tests to specific hostnames; drop it (keeping
# `num_nodes`) if this scenario is meant to run on other clusters — confirm.
[[Tests]]
id = "Tests.deepep"
test_name = "deepep_standard"
num_nodes = 2
nodes = ["dgx-gaia-55", "dgx-gaia-56"]
time_limit = "00:30:00"

# Then run NCCL AlltoAllv test using the generated matrix.
# `test_name` must match the `name` field of the test definition;
# nccl_test_alltoallv.toml declares `nccl_test_alltoallv`.
[[Tests]]
id = "Tests.nccl_alltoallv"
test_name = "nccl_test_alltoallv"
num_nodes = 2
nodes = ["dgx-gaia-55", "dgx-gaia-56"]
time_limit = "00:30:00"
# Start only after the DeepEP test has completed.
[[Tests.dependencies]]
type = "start_post_comp"
id = "Tests.deepep"
34 changes: 34 additions & 0 deletions conf/experimental/test_scenario/deepep_with_ucc_alltoallv.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "deepep-with-ucc-alltoallv"

# First run the DeepEP benchmark which generates the traffic matrix
[[Tests]]
id = "Tests.deepep"
test_name = "deepep_standard"
num_nodes = 2
time_limit = "00:30:00"

# Then run UCC AlltoAllv test using the generated matrix (auto-converted)
# NOTE(review): no test definition named `ucc_alltoallv_deepep` is visible in
# this change set — confirm it is defined elsewhere in the repository.
[[Tests]]
id = "Tests.ucc_alltoallv"
test_name = "ucc_alltoallv_deepep"
num_nodes = 2
time_limit = "00:30:00"
# Start only after the DeepEP test has completed.
[[Tests.dependencies]]
type = "start_post_comp"
id = "Tests.deepep"
2 changes: 2 additions & 0 deletions src/cloudai/registration.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def register_all():
DeepEPReportGenerationStrategy,
DeepEPSlurmCommandGenStrategy,
DeepEPTestDefinition,
DeepEPMoEThroughputReporter,
)
from cloudai.workloads.jax_toolbox import (
GPTTestDefinition,
Expand Down Expand Up @@ -301,6 +302,7 @@ def register_all():
Registry().add_report(VllmTestDefinition, VLLMBenchReportGenerationStrategy)

Registry().add_scenario_report("per_test", PerTestReporter, ReportConfig(enable=True))
Registry().add_scenario_report("deepep_moe_throughput", DeepEPMoEThroughputReporter, ReportConfig(enable=True))
Registry().add_scenario_report("status", StatusReporter, ReportConfig(enable=True))
Registry().add_scenario_report("dse", DSEReporter, ReportConfig(enable=True))
Registry().add_scenario_report("tarball", TarballReporter, ReportConfig(enable=True))
Expand Down
2 changes: 2 additions & 0 deletions src/cloudai/workloads/deepep/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
# limitations under the License.

from .deepep import DeepEPCmdArgs, DeepEPTestDefinition
from .deepep_moe_throughput_reporter import DeepEPMoEThroughputReporter
from .report_generation_strategy import DeepEPReportGenerationStrategy
from .slurm_command_gen_strategy import DeepEPSlurmCommandGenStrategy

__all__ = [
"DeepEPCmdArgs",
"DeepEPMoEThroughputReporter",
"DeepEPReportGenerationStrategy",
"DeepEPSlurmCommandGenStrategy",
"DeepEPTestDefinition",
Expand Down
1 change: 1 addition & 0 deletions src/cloudai/workloads/deepep/deepep.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class DeepEPCmdArgs(CmdArgs):
num_iterations: int = 50
shuffle_columns: bool = False
use_kineto_profiler: bool = False
enable_tuning: bool = False
num_sms: int = 24
num_qps_per_rank: int = 12
config_file_path: str = "/tmp/config.yaml"
Expand Down
63 changes: 63 additions & 0 deletions src/cloudai/workloads/deepep/deepep_combined_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""DeepEP dependency helpers for Slurm UCC/NCCL."""

from __future__ import annotations

from pathlib import Path

from cloudai.core import TestRun

DEEPEP_PREV_MOUNT = "/cloudai_deepep_prev"


def start_post_comp_chain(test_run: TestRun) -> list[TestRun]:
    """Collect the runs reached by repeatedly following ``start_post_comp``.

    The walk begins at the run ``test_run`` depends on and then follows each
    run's own ``start_post_comp`` dependency in turn (e.g. UCC → NCCL → DeepEP).

    Returns:
        The runs in traversal order, nearest dependency first. The list is
        empty when ``test_run`` has no ``start_post_comp`` dependency.
    """
    first_link = test_run.dependencies.get("start_post_comp")
    if first_link is None:
        return []

    visited: set[int] = set()
    ordered: list[TestRun] = []
    node: TestRun | None = first_link.test_run
    while True:
        # Stop at the end of the chain, or when a run repeats (dependency cycle).
        if node is None or id(node) in visited:
            break
        visited.add(id(node))
        ordered.append(node)
        link = node.dependencies.get("start_post_comp")
        if link:
            node = link.test_run
        else:
            node = None
    return ordered


def _has_ucc_matrix_under(root: Path) -> bool:
if (root / "ucc_matrix.txt").is_file():
return True
return any(root.glob("**/ucc_matrix.txt"))


def deepep_benchmark_root(test_run: TestRun) -> Path | None:
    """Locate the DeepEP job directory among the ``start_post_comp`` dependencies.

    Each run returned by ``start_post_comp_chain(test_run)`` is examined in
    order. A run's ``output_path`` is accepted when it contains a
    ``ucc_matrix.txt`` file (at any depth), or when the first 250000 characters
    of its ``stdout.txt`` contain ``"BENCHMARK: DeepEP Results"``.

    Args:
        test_run: The run whose dependency chain is searched.

    Returns:
        The ``output_path`` of the first matching run, or ``None`` if no run in
        the chain matches.
    """
    for candidate in start_post_comp_chain(test_run):
        root = candidate.output_path
        if _has_ucc_matrix_under(root):
            return root

        stdout_file = root / "stdout.txt"
        if not stdout_file.is_file():
            continue
        try:
            # Read only the prefix that is searched, so a very large log is
            # not loaded into memory in full.
            with stdout_file.open(errors="replace") as stream:
                head = stream.read(250000)
        except OSError:
            # An unreadable log is treated as "no match"; try the next run.
            continue
        if "BENCHMARK: DeepEP Results" in head:
            return root
    return None


def deepep_results_json_files(test_output_path: Path) -> list[Path]:
    """List ``results.json`` files found in DeepEP benchmark directories.

    Directories matching ``results/benchmark_*_ranks_*`` are examined first,
    then those matching ``benchmark_*_ranks_*`` directly under
    ``test_output_path``. Within each layout the directories are visited in
    sorted order.

    Returns:
        Paths to every existing ``results.json`` file, in the order described
        above. The list is empty when nothing matches.
    """
    layouts = ("results/benchmark_*_ranks_*", "benchmark_*_ranks_*")
    collected: list[Path] = []
    for layout in layouts:
        bench_dirs = sorted(p for p in test_output_path.glob(layout) if p.is_dir())
        candidates = (bench_dir / "results.json" for bench_dir in bench_dirs)
        collected.extend(path for path in candidates if path.is_file())
    return collected
Loading
Loading