NVIDIA · ybenvidia · Mar 5, 2026 · Apr 15, 2026 · Apr 26, 2026 · May 13, 2026
@@ -20,13 +20,12 @@ test_template_name = "DeepEP"
 
 [cmd_args]
 # Local .sqsh file:
-# docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh"
 # Container registry (uses your Docker credentials):
 docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
 
 mode = "standard"
 
-tokens = 1024
+tokens = 4096
 num_experts = 256
 num_topk = 8
 hidden_size = 7168

@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nccl_test_alltoallv"
+description = "NCCL AlltoAllv"
+test_template_name = "NcclTest"
+
+[cmd_args]
+docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
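+# Container provides /opt/nccl-tests/build/alltoallv_perf.
+# NOTE(review): subtest_name below is "alltoallv_perf_mpi"; confirm which binary name the image ships.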
+subtest_name = "alltoallv_perf_mpi"
+nthreads = 1
+ngpus = 1
+minbytes = "512M"
+maxbytes = "512M"
+stepfactor = 2
+iters = 10
+warmup_iters = 1
+check = 1
+blocking = 0
+use_deepep_matrix = true
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "deepep-with-nccl-alltoallv"
+
+# First run the DeepEP benchmark which generates the traffic matrix
+[[Tests]]
+id = "Tests.deepep"
+test_name = "deepep_standard"
+num_nodes = 2
+nodes = ["dgx-gaia-55", "dgx-gaia-56"]
+time_limit = "00:30:00"
+
+# Then run NCCL AlltoAllv test using the generated matrix
+[[Tests]]
+id = "Tests.nccl_alltoallv"
+test_name = "nccl_alltoallv"
+num_nodes = 2
+nodes = ["dgx-gaia-55", "dgx-gaia-56"]
+time_limit = "00:30:00"
+  [[Tests.dependencies]]
+  type = "start_post_comp"
+  id = "Tests.deepep"
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "deepep-with-ucc-alltoallv"
+
+# First run the DeepEP benchmark which generates the traffic matrix
+[[Tests]]
+id = "Tests.deepep"
+test_name = "deepep_standard"
+num_nodes = 2
+time_limit = "00:30:00"
+
+# Then run UCC AlltoAllv test using the generated matrix (auto-converted)
+[[Tests]]
+id = "Tests.ucc_alltoallv"
+test_name = "ucc_alltoallv_deepep"
+num_nodes = 2
+time_limit = "00:30:00"
+  [[Tests.dependencies]]
+  type = "start_post_comp"
+  id = "Tests.deepep"
@@ -82,6 +82,7 @@ def register_all():
         DeepEPReportGenerationStrategy,
         DeepEPSlurmCommandGenStrategy,
         DeepEPTestDefinition,
+        DeepEPMoEThroughputReporter,
     )
     from cloudai.workloads.jax_toolbox import (
         GPTTestDefinition,
@@ -301,6 +302,7 @@ def register_all():
     Registry().add_report(VllmTestDefinition, VLLMBenchReportGenerationStrategy)
 
     Registry().add_scenario_report("per_test", PerTestReporter, ReportConfig(enable=True))
+    Registry().add_scenario_report("deepep_moe_throughput", DeepEPMoEThroughputReporter, ReportConfig(enable=True))
     Registry().add_scenario_report("status", StatusReporter, ReportConfig(enable=True))
     Registry().add_scenario_report("dse", DSEReporter, ReportConfig(enable=True))
     Registry().add_scenario_report("tarball", TarballReporter, ReportConfig(enable=True))

@@ -15,11 +15,13 @@
 # limitations under the License.
 
 from .deepep import DeepEPCmdArgs, DeepEPTestDefinition
+from .deepep_moe_throughput_reporter import DeepEPMoEThroughputReporter
 from .report_generation_strategy import DeepEPReportGenerationStrategy
 from .slurm_command_gen_strategy import DeepEPSlurmCommandGenStrategy
 
 __all__ = [
     "DeepEPCmdArgs",
+    "DeepEPMoEThroughputReporter",
     "DeepEPReportGenerationStrategy",
     "DeepEPSlurmCommandGenStrategy",
     "DeepEPTestDefinition",

@@ -38,6 +38,7 @@ class DeepEPCmdArgs(CmdArgs):
     num_iterations: int = 50
     shuffle_columns: bool = False
     use_kineto_profiler: bool = False
+    enable_tuning: bool = False
     num_sms: int = 24
     num_qps_per_rank: int = 12
     config_file_path: str = "/tmp/config.yaml"

@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""DeepEP dependency helpers for Slurm UCC/NCCL."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from cloudai.core import TestRun
+
+DEEPEP_PREV_MOUNT = "/cloudai_deepep_prev"
+
+
+def start_post_comp_chain(test_run: TestRun) -> list[TestRun]:
+    """Return the runs reached by repeatedly following ``start_post_comp`` (e.g. UCC → NCCL → DeepEP).
+
+    Args:
+        test_run: Run whose ``dependencies`` mapping is the starting point of the walk.
+
+    Returns:
+        The dependency runs ordered nearest-first; empty when ``test_run`` has no
+        ``start_post_comp`` entry.
+    """
+    chain: list[TestRun] = []
+    seen: set[int] = set()
+    dep = test_run.dependencies.get("start_post_comp")
+    # Compare against None on every hop (not truthiness) so a present-but-falsy entry is
+    # handled the same way on the first hop as on later ones.
+    while dep is not None:
+        cur = dep.test_run
+        # Stop on a missing target or on a run that was already visited (dependency cycle).
+        if cur is None or id(cur) in seen:
+            break
+        seen.add(id(cur))
+        chain.append(cur)
+        dep = cur.dependencies.get("start_post_comp")
+    return chain
+
+
+def _has_ucc_matrix_under(root: Path) -> bool:
+    """Tell whether ``ucc_matrix.txt`` exists directly in ``root`` or anywhere beneath it."""
+    direct = root / "ucc_matrix.txt"
+    # Cheap top-level probe first; fall back to a recursive search only when it misses.
+    return direct.is_file() or next(root.rglob("ucc_matrix.txt"), None) is not None
+
+
+def deepep_benchmark_root(test_run: TestRun) -> Path | None:
+    """Locate the DeepEP job directory by walking the ``start_post_comp`` chain of ``test_run``.
+
+    A run's ``output_path`` qualifies when it holds a ``ucc_matrix.txt`` (at any depth) or when
+    the beginning of its ``stdout.txt`` contains the ``BENCHMARK: DeepEP Results`` banner.
+
+    Args:
+        test_run: Run whose dependency chain is searched.
+
+    Returns:
+        The first qualifying ``output_path`` in chain order, or ``None`` when no run qualifies.
+    """
+    banner = "BENCHMARK: DeepEP Results"
+    max_chars = 250000  # only the head of stdout is searched for the banner
+    for tr in start_post_comp_chain(test_run):
+        root = tr.output_path
+        if _has_ucc_matrix_under(root):
+            return root
+        st = root / "stdout.txt"
+        if not st.is_file():
+            continue
+        try:
+            # Read just the searched prefix instead of loading the whole log and slicing it.
+            with st.open(errors="replace") as fh:
+                head = fh.read(max_chars)
+        except OSError:
+            # Unreadable stdout: best effort, move on to the next run in the chain.
+            continue
+        if banner in head:
+            return root
+    return None
+
+
+def deepep_results_json_files(test_output_path: Path) -> list[Path]:
+    """Collect ``results.json`` from ``benchmark_*_ranks_*`` directories under ``results/`` and at top level.
+
+    Matches below ``results/`` come first, then top-level matches; each group is sorted by path.
+    Entries that are not directories, or that lack a ``results.json`` file, are ignored.
+    """
+    patterns = ("results/benchmark_*_ranks_*", "benchmark_*_ranks_*")
+    candidates = (
+        run_dir / "results.json"
+        for pattern in patterns
+        for run_dir in sorted(test_output_path.glob(pattern))
+        if run_dir.is_dir()
+    )
+    return [candidate for candidate in candidates if candidate.is_file()]