Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
394 changes: 145 additions & 249 deletions .github/configs/amd-master.yaml

Large diffs are not rendered by default.

819 changes: 351 additions & 468 deletions .github/configs/nvidia-master.yaml

Large diffs are not rendered by default.

25 changes: 21 additions & 4 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -924,8 +924,19 @@ resolve_trace_source() {
# public-dataset loader names allowed by the inferencex-agentx-mvp
# scenario. Used by recipes whose servers have non-default context
# caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the
# unfiltered 052726 corpus and switches to the 256k-capped variant).
local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}"
# unfiltered corpus and switches to the 256k-capped variant), or
# by recipes that want to pin a specific corpus generation rather
# than ride the model-prefix-aware default below.
#
# Default (no override) is model-prefix-aware:
# DSv4 recipes -> 052726 (v5 corpus, the original baseline)
# everything else -> 060226 (v6 corpus, newer CC versions)
# DSv4 stays on 052726 for continuity with prior published baselines.
local default_loader="semianalysis_cc_traces_weka_with_subagents_060226"
if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
default_loader="semianalysis_cc_traces_weka_with_subagents"
fi
local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}"
local dataset
case "$loader" in
semianalysis_cc_traces_weka_with_subagents)
Expand All @@ -934,13 +945,19 @@ resolve_trace_source() {
semianalysis_cc_traces_weka_with_subagents_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
;;
semianalysis_cc_traces_weka_with_subagents_060226)
dataset="semianalysisai/cc-traces-weka-with-subagents-060226"
;;
semianalysis_cc_traces_weka_with_subagents_060226_256k)
dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k"
;;
*)
echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k" >&2
exit 1
;;
esac
TRACE_SOURCE_FLAG="--public-dataset $loader"
echo "Loading traces via aiperf public-dataset: $loader ($dataset)"
echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]"
# Pre-download the dataset into the shared HF_HUB_CACHE (same mount used
# for model weights) so subsequent runs read from cache instead of
# re-downloading every job.
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/dsr1_fp4_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
Expand All @@ -33,7 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
export PYTHONNOUSERSITE=1

python3 -m sglang.launch_server \
--model-path $MODEL \
--model-path "$MODEL_PATH" --served-model-name "$MODEL" \
--host 0.0.0.0 \
--port $PORT \
--trust-remote-code \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
rocm-smi
amd-smi || true

Expand All @@ -34,7 +44,7 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
export PYTHONNOUSERSITE=1

python3 -m sglang.launch_server \
--model-path=$MODEL \
--model-path=$MODEL_PATH --served-model-name=$MODEL \
--host=0.0.0.0 \
--port=$PORT \
--trust-remote-code \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
Expand Down Expand Up @@ -221,7 +231,7 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high

{ set +x; } 2>/dev/null
VLLM_CMD=(
vllm serve "$MODEL"
vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
--host 0.0.0.0
--port "$PORT"
--trust-remote-code
Expand Down
23 changes: 21 additions & 2 deletions benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,35 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
# Opt this recipe out of the DSv4 052726 default; use the v6 corpus.
export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226
resolve_trace_source
install_agentic_deps

# DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s.
export VLLM_ENGINE_READY_TIMEOUT_S=3600

# vllm-project/vllm#43447: keep SWA prefix-cache tails sparsely so transient
# sliding-window allocations don't evict useful prefix entries. 32k matches
# the trace-replay tuning the PR author validated (0% -> 74% hit rate).
# Requires the custom image (cquil/vllm-openai:*-7ead0a0f...) that carries
# the patch; on stock images the env var is ignored.
export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768

# ---- Server config ----------------------------------------------------------
SERVER_LOG="$RESULT_DIR/server.log"
mkdir -p "$RESULT_DIR"
Expand Down Expand Up @@ -113,7 +132,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
export PYTHONNOUSERSITE=1
export VLLM_FLOAT32_MATMUL_PRECISION=high

vllm serve "$MODEL" \
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
--host 0.0.0.0 \
--port "$PORT" \
--trust-remote-code \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
rocm-smi || true
amd-smi || true

Expand Down Expand Up @@ -135,7 +145,7 @@ fi

echo "Starting sglang server..."
python3 -m sglang.launch_server \
--model-path "$MODEL" \
--model-path "$MODEL_PATH" --served-model-name "$MODEL" \
--host=0.0.0.0 \
--port "$PORT" \
"${PARALLEL_ARGS[@]}" \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/dsv4_fp8_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
Expand All @@ -40,7 +50,7 @@ export PYTHONNOUSERSITE=1

# Per recipe: EP + DP=8 (no --tensor-parallel-size). TP from search space is
# used for GPU allocation by the runner and as the DP size.
vllm serve $MODEL \
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
--host 0.0.0.0 \
--port $PORT \
--trust-remote-code \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
rocm-smi || true
amd-smi || true

Expand All @@ -42,7 +52,7 @@ echo "Starting SGLang server..."
export PYTHONNOUSERSITE=1

python3 -m sglang.launch_server \
--model-path $MODEL \
--model-path "$MODEL_PATH" --served-model-name "$MODEL" \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/glm5_fp8_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
Expand All @@ -39,7 +49,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
export PYTHONNOUSERSITE=1

python3 -m sglang.launch_server \
--model-path=$MODEL \
--model-path=$MODEL_PATH --served-model-name=$MODEL \
--host=0.0.0.0 \
--port=$PORT \
--trust-remote-code \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/gptoss_fp4_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
Expand Down Expand Up @@ -53,7 +63,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
export PYTHONNOUSERSITE=1
export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1

vllm serve $MODEL \
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
--host 0.0.0.0 \
--port $PORT \
--config "$RESULT_DIR/config.yaml" \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/gptoss_fp4_h100.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
Expand Down Expand Up @@ -57,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
export PYTHONNOUSERSITE=1
export VLLM_MXFP4_USE_MARLIN=1

vllm serve $MODEL \
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
--host 0.0.0.0 \
--port $PORT \
--config "$RESULT_DIR/config.yaml" \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/gptoss_fp4_h200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
nvidia-smi

# ---- Resolve traces and install deps ----------------------------------------
Expand Down Expand Up @@ -57,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
export PYTHONNOUSERSITE=1
export VLLM_MXFP4_USE_MARLIN=1

vllm serve $MODEL \
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
--host 0.0.0.0 \
--port $PORT \
--config "$RESULT_DIR/config.yaml" \
Expand Down
14 changes: 12 additions & 2 deletions benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
# Stage the model weights and settle on MODEL_PATH, which is what the server
# is launched with.
# - MODEL_PATH set: download into it only when the directory is missing or
#   empty. `hf download` creates the target dir if missing and is itself
#   idempotent. A non-empty directory is taken as already populated and is
#   left untouched.
# - MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE by
#   downloading without --local-dir and launching from "$MODEL" itself. An
#   absolute MODEL is already a local path, not a hub repo ID, so there is
#   nothing to download for it.
if [[ -n "${MODEL_PATH:-}" ]]; then
    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
else
    if [[ "$MODEL" != /* ]]; then
        hf download "$MODEL"
    fi
    export MODEL_PATH="$MODEL"
fi
rocm-smi
amd-smi || true

Expand Down Expand Up @@ -65,7 +75,7 @@ esac

echo "Starting vllm server..."

vllm serve $MODEL \
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
--host 0.0.0.0 \
--port $PORT \
--attention-backend ROCM_AITER_UNIFIED_ATTN \
Expand Down
Loading