From a0d2e9fbfcdeaae8fef11a74e5d74987d4222ddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 28 Jan 2025 19:07:45 +0100 Subject: [PATCH 01/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6eb6aad40..0a6ed020c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -126,7 +126,7 @@ RUN pip uninstall -y megatron-core && \ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner RUN cd /opt/NeMo-Aligner && \ - pip install --no-deps -e . + bash reinstall.sh RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch From 2836b288ebe2e15eaf6cfe668c33c694b9f32351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 28 Jan 2025 19:11:26 +0100 Subject: [PATCH 02/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- reinstall.sh | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 reinstall.sh diff --git a/reinstall.sh b/reinstall.sh new file mode 100644 index 000000000..edbb9a02d --- /dev/null +++ b/reinstall.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +set -ex + +export MAX_JOBS=8 +export TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea +export PYTRITON_VERSION=0.5.10 +export NEMO_TAG=ko3n1g/build/improve-installer # On: main +export MLM_TAG= # On: main +export ALIGNER_COMMIT=main +export APEX_TAG=main +export TRTLLM_VERSION=v0.13.0 +export PROTOBUF_VERSION=4.24.4 + +cd /opt + +(rm -rf NeMo || true) && + git clone https://github.com/NVIDIA/NeMo.git && + pushd NeMo && + git fetch && + git checkout ${NEMO_TAG} && + bash reinstall.sh && + popd + +(rm -rf TensorRT-LLM || true) && + git clone https://github.com/NVIDIA/TensorRT-LLM.git && + pushd TensorRT-LLM && + git checkout ${TRTLLM_VERSION} && + source docker/common/install_tensorrt.sh && + python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && + pip install -e . + +(rm -rf NeMo-Aligner || true) && + git clone https://github.com/NVIDIA/NeMo-Aligner.git && + pushd NeMo-Aligner && + git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge' && + git checkout -f $ALIGNER_COMMIT && + # case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it + # case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail + git pull --rebase || true && + pip install --no-cache-dir -e . && + popd From 3d9f6b74457e10c088ac529a26bd1abf2459353b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 28 Jan 2025 21:56:39 +0100 Subject: [PATCH 03/41] tmp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- reinstall.sh | 172 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 146 insertions(+), 26 deletions(-) diff --git a/reinstall.sh b/reinstall.sh index edbb9a02d..04db32d26 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -13,29 +13,149 @@ export PROTOBUF_VERSION=4.24.4 cd /opt -(rm -rf NeMo || true) && - git clone https://github.com/NVIDIA/NeMo.git && - pushd NeMo && - git fetch && - git checkout ${NEMO_TAG} && - bash reinstall.sh && - popd - -(rm -rf TensorRT-LLM || true) && - git clone https://github.com/NVIDIA/TensorRT-LLM.git && - pushd TensorRT-LLM && - git checkout ${TRTLLM_VERSION} && - source docker/common/install_tensorrt.sh && - python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && - pip install -e . - -(rm -rf NeMo-Aligner || true) && - git clone https://github.com/NVIDIA/NeMo-Aligner.git && - pushd NeMo-Aligner && - git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge' && - git checkout -f $ALIGNER_COMMIT && - # case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it - # case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail - git pull --rebase || true && - pip install --no-cache-dir -e . && - popd +#!/bin/bash + +# List of all supported libraries (update this list when adding new libraries) +ALL_LIBRARIES=( + "nemo" + "trtllm" + "aligner" +) + +# -------------------------- +# Library Functions (Implement your logic here) +# -------------------------- + +nemo() { + local mode="$1" + + (rm -rf NeMo || true) && + git clone https://github.com/NVIDIA/NeMo.git && + pushd NeMo && + git fetch && + git checkout ${NEMO_TAG} && + bash reinstall.sh && + popd +} + +trtllm() { + local mode="$1" + + (rm -rf TensorRT-LLM || true) && + git clone https://github.com/NVIDIA/TensorRT-LLM.git && + pushd TensorRT-LLM && + git checkout ${TRTLLM_VERSION} + + if [[ "$mode" == "build" ]]; then + curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && + apt-get install git-lfs && + git lfs install && + apt-get clean + + source docker/common/install_tensorrt.sh && + python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks --build_dir /tmp/trtllm + else + pip install -e /tmp/wheels/trtllm*.whl + fi +} + +aligner() { + local mode="$1" + + (rm -rf NeMo-Aligner || true) && + git clone https://github.com/NVIDIA/NeMo-Aligner.git && + pushd NeMo-Aligner && + git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge' && + git checkout -f $ALIGNER_COMMIT && + # case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it + # case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail + git pull --rebase || true && + pip install --no-cache-dir -e . && + popd +} + +# -------------------------- +# Argument Parsing & Validation +# -------------------------- + +# Parse command-line arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --library) + LIBRARY_ARG="$2" + shift 2 + ;; + --mode) + MODE="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +# Validate required arguments +if [[ -z "$LIBRARY_ARG" ]]; then + echo "Error: --library argument is required" + exit 1 +fi + +if [[ -z "$MODE" ]]; then + echo "Error: --mode argument is required" + exit 1 +fi + +# Validate mode +if [[ "$MODE" != "build" && "$MODE" != "install" ]]; then + echo "Error: Invalid mode. Must be 'build' or 'install'" + exit 1 +fi + +# Process library argument +declare -a LIBRARIES +if [[ "$LIBRARY_ARG" == "all" ]]; then + LIBRARIES=("${ALL_LIBRARIES[@]}") +else + IFS=',' read -ra TEMP_ARRAY <<<"$LIBRARY_ARG" + for lib in "${TEMP_ARRAY[@]}"; do + trimmed_lib=$(echo "$lib" | xargs) + if [[ -n "$trimmed_lib" ]]; then + LIBRARIES+=("$trimmed_lib") + fi + done +fi + +# Validate libraries array +if [[ ${#LIBRARIES[@]} -eq 0 ]]; then + echo "Error: No valid libraries specified" + exit 1 +fi + +# Validate each library is supported +for lib in "${LIBRARIES[@]}"; do + if [[ ! " ${ALL_LIBRARIES[@]} " =~ " ${lib} " ]]; then + echo "Error: Unsupported library '$lib'" + exit 1 + fi +done + +# -------------------------- +# Execution Logic +# -------------------------- + +# Run operations for each library +for library in "${LIBRARIES[@]}"; do + echo "Processing $library ($MODE)..." + "$library" "$MODE" + + # Check if function succeeded + if [[ $? -ne 0 ]]; then + echo "Error: Operation failed for $library" + exit 1 + fi +done + +echo "All operations completed successfully" +exit 0 From 06858fc2c2677d6959acb899774b7e0ff06a39d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 00:09:56 +0100 Subject: [PATCH 04/41] will this work? :P MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 10 ++++------ reinstall.sh | 7 ++++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0a6ed020c..a85892cc8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,12 +62,10 @@ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.d # TRTLLM ARG TRTLLM_VERSION -RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \ - cd TensorRT-LLM && \ - git checkout ${TRTLLM_VERSION} && \ - . docker/common/install_tensorrt.sh && \ - python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \ - pip install -e . +COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh +RUN export $TRTLLM_VERSION && \ + reinstall.sh --library trtllm --mode build && \ + reinstall.sh --library trtllm --mode install ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a diff --git a/reinstall.sh b/reinstall.sh index 04db32d26..df915a899 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -50,12 +50,13 @@ trtllm() { curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && apt-get install git-lfs && git lfs install && + git lfs pull && apt-get clean - source docker/common/install_tensorrt.sh && - python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks --build_dir /tmp/trtllm + . docker/common/install_tensorrt.sh && + python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks else - pip install -e /tmp/wheels/trtllm*.whl + pip install -e build/trtllm*.whl fi } From 107973ff3f41527b2acf1370e08b5b542ae7459a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 00:35:36 +0100 Subject: [PATCH 05/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a85892cc8..e6cffd1fa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -63,7 +63,7 @@ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.d # TRTLLM ARG TRTLLM_VERSION COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -RUN export $TRTLLM_VERSION && \ +RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ reinstall.sh --library trtllm --mode build && \ reinstall.sh --library trtllm --mode install ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ From d0666bea0147285b4f64a9825a6ff92c20842210 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 00:36:58 +0100 Subject: [PATCH 06/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index e6cffd1fa..e160067d2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -54,18 +54,13 @@ RUN pip uninstall -y apex && \ fi && \ pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ -# Git LFS -RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ - apt-get install git-lfs && \ - git lfs install && \ - apt-get clean - # TRTLLM ARG TRTLLM_VERSION COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ - reinstall.sh --library trtllm --mode build && \ - reinstall.sh --library trtllm --mode install + cd /opt/NeMo-Aligner && \ + bash reinstall.sh --library trtllm --mode build && \ + bash reinstall.sh --library trtllm --mode install ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a From 3cfd864604a721768885f9d475d846c19a820e73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 09:28:45 +0100 Subject: [PATCH 07/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 1 + reinstall.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e160067d2..f5ae83f10 100644 --- a/Dockerfile +++ b/Dockerfile @@ -60,6 +60,7 @@ COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinst RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ cd /opt/NeMo-Aligner && \ bash reinstall.sh --library trtllm --mode build && \ + cp /opt/TensorRT-LLM/build/trtllm*.whl /tmp/build/trtllm*.whl && \ bash reinstall.sh --library trtllm --mode install ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ diff --git a/reinstall.sh b/reinstall.sh index df915a899..0875d915b 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -56,7 +56,7 @@ trtllm() { . docker/common/install_tensorrt.sh && python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks else - pip install -e build/trtllm*.whl + pip install /tmp/build/trtllm*.whl fi } From c28d8017591a6b1e010aa7169017625a34e95e0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 10:56:34 +0100 Subject: [PATCH 08/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 ++ reinstall.sh | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f5ae83f10..19294f87e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -60,6 +60,8 @@ COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinst RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ cd /opt/NeMo-Aligner && \ bash reinstall.sh --library trtllm --mode build && \ + ls -al /opt/TensorRT-LLM && \ + mkdir -p /tmp/build && \ cp /opt/TensorRT-LLM/build/trtllm*.whl /tmp/build/trtllm*.whl && \ bash reinstall.sh --library trtllm --mode install ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ diff --git a/reinstall.sh b/reinstall.sh index 0875d915b..2b60cef75 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -28,6 +28,7 @@ ALL_LIBRARIES=( nemo() { local mode="$1" + cd /opt (rm -rf NeMo || true) && git clone https://github.com/NVIDIA/NeMo.git && @@ -40,7 +41,8 @@ nemo() { trtllm() { local mode="$1" - + cd /opt + (rm -rf TensorRT-LLM || true) && git clone https://github.com/NVIDIA/TensorRT-LLM.git && pushd TensorRT-LLM && From 4fbeb2ca9da0195a9fefc2a4ff2584d533ef63d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 12:18:22 +0100 Subject: [PATCH 09/41] fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 +- reinstall.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 19294f87e..8997466cf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,7 +62,7 @@ RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ bash reinstall.sh --library trtllm --mode build && \ ls -al /opt/TensorRT-LLM && \ mkdir -p /tmp/build && \ - cp /opt/TensorRT-LLM/build/trtllm*.whl /tmp/build/trtllm*.whl && \ + cp /opt/TensorRT-LLM/build/tensorrt_llm*.whl /tmp/build/tensorrt_llm*.whl && \ bash reinstall.sh --library trtllm --mode install ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ diff --git a/reinstall.sh b/reinstall.sh index 2b60cef75..913bdc7f7 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -58,7 +58,7 @@ trtllm() { . docker/common/install_tensorrt.sh && python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks else - pip install /tmp/build/trtllm*.whl + pip install /tmp/build/tensorrt_llm*.whl fi } From d1d65f8191258b25610eb0deb357eedadf6d4a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 13:58:10 +0100 Subject: [PATCH 10/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 18 ++++++++++-------- reinstall.sh | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8997466cf..03d5df2a9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,6 +38,14 @@ git pull --rebase || true pip install --no-cache-dir --no-deps -e . EOF +FROM ${BASE_IMAGE} as trtllm-wheel +ARG TRTLLM_VERSION +COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh +RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ + cd /opt/NeMo-Aligner && \ + bash reinstall.sh --library trtllm --mode build && \ + ls -al /opt/TensorRT-LLM + FROM ${BASE_IMAGE} AS final LABEL "nemo.library"="nemo-aligner" WORKDIR /opt @@ -56,14 +64,8 @@ RUN pip uninstall -y apex && \ # TRTLLM ARG TRTLLM_VERSION -COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ - cd /opt/NeMo-Aligner && \ - bash reinstall.sh --library trtllm --mode build && \ - ls -al /opt/TensorRT-LLM && \ - mkdir -p /tmp/build && \ - cp /opt/TensorRT-LLM/build/tensorrt_llm*.whl /tmp/build/tensorrt_llm*.whl && \ - bash reinstall.sh --library trtllm --mode install +COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm +RUN bash reinstall.sh --library trtllm --mode install ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a diff --git a/reinstall.sh b/reinstall.sh index 913bdc7f7..1aa76bde3 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -58,7 +58,7 @@ trtllm() { . docker/common/install_tensorrt.sh && python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks else - pip install /tmp/build/tensorrt_llm*.whl + pip install /tmp/trtllm/tensorrt_llm*.whl fi } From a026bc8aef13161383af52b8af6f5471c617ef43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 15:06:25 +0100 Subject: [PATCH 11/41] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 03d5df2a9..16a620201 100644 --- a/Dockerfile +++ b/Dockerfile @@ -65,8 +65,9 @@ RUN pip uninstall -y apex && \ # TRTLLM ARG TRTLLM_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm -RUN bash reinstall.sh --library trtllm --mode install -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ +COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh +# RUN bash /opt/NeMo-Aligner/reinstall.sh --library trtllm --mode install +# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a # breaking change. The last known working verison is 11.5.3 From e56c2857a2667a80f26c6c0d7f70f66eedc127a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 15:08:23 +0100 Subject: [PATCH 12/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 16a620201..582e00146 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,8 +66,8 @@ RUN pip uninstall -y apex && \ ARG TRTLLM_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -# RUN bash /opt/NeMo-Aligner/reinstall.sh --library trtllm --mode install -# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ +RUN bash /opt/NeMo-Aligner/reinstall.sh --library trtllm --mode install +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a # breaking change. The last known working verison is 11.5.3 From ad1c3b6d958044fd6088f2ddde30bb33a44019e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 15:17:12 +0100 Subject: [PATCH 13/41] TE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 582e00146..d005866fe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,6 +46,22 @@ RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ bash reinstall.sh --library trtllm --mode build && \ ls -al /opt/TensorRT-LLM +# install TransformerEngine +FROM ${BASE_IMAGE} as te-wheel +ARG MAX_JOBS +ARG TE_TAG +RUN cd /opt && \ + git clone https://github.com/NVIDIA/TransformerEngine.git && \ + cd TransformerEngine && \ + if [ ! -z $TE_TAG ]; then \ + git fetch origin $TE_TAG && \ + git checkout FETCH_HEAD; \ + fi && \ + git submodule init && git submodule update && \ + NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip wheel . && \ + ls -al + + FROM ${BASE_IMAGE} AS final LABEL "nemo.library"="nemo-aligner" WORKDIR /opt @@ -74,17 +90,8 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ RUN pip install pynvml==11.5.3 # install TransformerEngine -ARG MAX_JOBS -ARG TE_TAG -RUN pip uninstall -y transformer-engine && \ - git clone https://github.com/NVIDIA/TransformerEngine.git && \ - cd TransformerEngine && \ - if [ ! -z $TE_TAG ]; then \ - git fetch origin $TE_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - git submodule init && git submodule update && \ - NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . +COPY --from=te-wheel /opt/TransformerEngine /opt/TransformerEngine +RUN pip install /opt/TransformerEngine/*.whl # place any util pkgs here ARG PYTRITON_VERSION From aa468e2c28986dc0df70a69870df95fb44f8a1a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 15:27:45 +0100 Subject: [PATCH 14/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index d005866fe..cdbf34e6c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -132,7 +132,7 @@ RUN pip uninstall -y megatron-core && \ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner RUN cd /opt/NeMo-Aligner && \ - bash reinstall.sh + bash reinstall.sh --install aligner RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch From b20261793d81f8443ef4c87106a6b279b28e835f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 15:42:22 +0100 Subject: [PATCH 15/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index cdbf34e6c..56415ff1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -102,16 +102,8 @@ RUN pip install --upgrade-strategy only-if-needed jsonlines # NeMo ARG NEMO_TAG -RUN git clone https://github.com/NVIDIA/NeMo.git && \ - cd NeMo && \ - git pull && \ - if [ ! -z $NEMO_TAG ]; then \ - git fetch origin $NEMO_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - pip uninstall -y nemo_toolkit sacrebleu && \ - pip install -e ".[nlp]" && \ - cd nemo/collections/nlp/data/language_modeling/megatron && make +COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh +RUN bash /opt/NeMo-Aligner/reinstall.sh --library nemo --mode install # TODO: While we are on Pytorch 24.07, we need to downgrade triton since 3.2.0 introduced a breaking change # This un-pinned requirement comes from mamba-ssm, and this pin can be removed once Pytorch base image is @@ -132,7 +124,7 @@ RUN pip uninstall -y megatron-core && \ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner RUN cd /opt/NeMo-Aligner && \ - bash reinstall.sh --install aligner + bash reinstall.sh --library aligner --mode install RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch From a257965b820d04b76e96d6cf4cc3638844c9b923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 15:42:34 +0100 Subject: [PATCH 16/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 56415ff1e..7bb6d0137 100644 --- a/Dockerfile +++ b/Dockerfile @@ -110,18 +110,6 @@ RUN bash /opt/NeMo-Aligner/reinstall.sh --library nemo --mode install # updated. RUN pip install triton==3.1.0 -# MLM -ARG MLM_TAG -RUN pip uninstall -y megatron-core && \ - git clone https://github.com/NVIDIA/Megatron-LM.git && \ - cd Megatron-LM && \ - git pull && \ - if [ ! -z $MLM_TAG ]; then \ - git fetch origin $MLM_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - pip install -e . - COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner RUN cd /opt/NeMo-Aligner && \ bash reinstall.sh --library aligner --mode install From ab19e4882bd03635c5eea951bfbf225615b56889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 16:25:28 +0100 Subject: [PATCH 17/41] remove most dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- setup/requirements.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup/requirements.txt b/setup/requirements.txt index 4aa22afa1..80758d8ad 100644 --- a/setup/requirements.txt +++ b/setup/requirements.txt @@ -1,8 +1,8 @@ Jinja2~=3.1.4 jsonlines -megatron_core>=0.8 -nemo_toolkit[nlp] -nvidia-pytriton +# megatron_core>=0.8 +# nemo_toolkit[nlp] +# nvidia-pytriton # pynvml pin is needed for TRTLLM v0.13.0 since 12.0.0 contains a breaking change. -pynvml==11.5.3 -tensorrt-llm==0.13.0 +# pynvml==11.5.3 +# tensorrt-llm==0.13.0 From b5bb36c879b325adec37839660474add05b0c432 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 16:29:42 +0100 Subject: [PATCH 18/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 ++ reinstall.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 7bb6d0137..701e0a2f4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -111,7 +111,9 @@ RUN bash /opt/NeMo-Aligner/reinstall.sh --library nemo --mode install RUN pip install triton==3.1.0 COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner +ARG ALIGNER_COMMIT RUN cd /opt/NeMo-Aligner && \ + export ALIGNER_COMMIT=$ALIGNER_COMMIT && \ bash reinstall.sh --library aligner --mode install RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch diff --git a/reinstall.sh b/reinstall.sh index 1aa76bde3..fbaaa5410 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -6,7 +6,7 @@ export TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea export PYTRITON_VERSION=0.5.10 export NEMO_TAG=ko3n1g/build/improve-installer # On: main export MLM_TAG= # On: main -export ALIGNER_COMMIT=main +export ALIGNER_COMMIT=${ALIGNER_COMMIT:-main} export APEX_TAG=main export TRTLLM_VERSION=v0.13.0 export PROTOBUF_VERSION=4.24.4 From bfeab250dcb3d33ca4bfe09a99347ee12500cc67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 22:51:14 +0100 Subject: [PATCH 19/41] manifest json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 32 +++++++++++++++++++++++++- Dockerfile | 13 ++++++++--- reinstall.sh | 40 ++++++++++++++++++++++----------- setup/manifest.json | 25 +++++++++++++++++++++ 4 files changed, 93 insertions(+), 17 deletions(-) create mode 100644 setup/manifest.json diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 728f3b6bf..0ac40c0bb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -44,12 +44,14 @@ jobs: test_to_run: ${{ steps.test_to_run.outputs.main }} all: ${{ steps.all.outputs.main }} run_ci: ${{ steps.evaluate.outputs.run_ci }} + build_args: ${{ steps.manifest.outputs.build_args }} steps: - name: Parse test_to_run id: test_to_run run: | parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")') echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT" + - name: Parse all id: all run: | @@ -89,6 +91,33 @@ jobs: # Run CI only (on main or if label is attached) and if it's not only docs echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false" || "$MERGE_GROUP" = "true") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT" + - name: Checkout repository + uses: actions/checkout@v4 + with: + path: ${{ github.run_id }} + + - name: Parse manifest.json + id: manifest + run: | + BUILD_ARGS=$(cat << EOF + BASE_IMAGE=$(cat setup/manifest.json | jq -r '."ngc-pytorch"') + NEMO_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".nemo.repo') + NEMO_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".nemo.ref') + MLM_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".megatron.repo') + MLM_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".megatron.ref') + TE_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".te.repo') + TE_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".te.ref') + TRTLLM_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".trtllm.repo') + TRTLLM_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".trtllm.ref') + PROTOBUF_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".protobuf') + PYTRITON_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".pytriton') + EOF + ) + + echo "BUILD_ARGS<> $GITHUB_ENV + echo "$BUILD_ARGS" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + build-container: if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} needs: [pre-flight] @@ -100,7 +129,8 @@ jobs: build-args: | MAX_JOBS=32 ALIGNER_COMMIT=${{ github.sha }} - + ${{ env.BUILD_ARGS }} + Unit_Tests: name: ${{ matrix.test_case }} needs: [build-container, pre-flight] diff --git a/Dockerfile b/Dockerfile index 701e0a2f4..5679f5124 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,14 +11,21 @@ # if you get errors building TE or Apex, decrease this to 4 ARG MAX_JOBS=8 # Git refs for dependencies -ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea -ARG PYTRITON_VERSION=0.5.10 + +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3 +ARG NEMO_REPO=https://github.com/NVIDIA/NeMo ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main +ARG MLM_REPO=https://github.com/NVIDIA/Megatron-LM ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main +ARG ALIGNER_REPO=https://github.com/NVIDIA/NeMo-Aligner ARG ALIGNER_COMMIT=main +ARG TE_REPO=https://github.com/NVIDIA/TransformerEngine +ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea +ARG TRTLLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git ARG TRTLLM_VERSION=v0.13.0 ARG PROTOBUF_VERSION=4.24.4 -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3 +ARG PYTRITON_VERSION=0.5.10 + FROM ${BASE_IMAGE} AS aligner-bump ARG ALIGNER_COMMIT diff --git a/reinstall.sh b/reinstall.sh index fbaaa5410..4e2e48f30 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -1,20 +1,7 @@ #!/usr/bin/env bash set -ex - -export MAX_JOBS=8 -export TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea -export PYTRITON_VERSION=0.5.10 -export NEMO_TAG=ko3n1g/build/improve-installer # On: main -export MLM_TAG= # On: main -export ALIGNER_COMMIT=${ALIGNER_COMMIT:-main} -export APEX_TAG=main -export TRTLLM_VERSION=v0.13.0 -export PROTOBUF_VERSION=4.24.4 - cd /opt -#!/bin/bash - # List of all supported libraries (update this list when adding new libraries) ALL_LIBRARIES=( "nemo" @@ -62,6 +49,33 @@ trtllm() { fi } +te() { + local mode="$1" + cd /opt + + cd /opt && \ + git clone https://github.com/NVIDIA/TransformerEngine.git && \ + cd TransformerEngine && \ + if [ ! -z $TE_TAG ]; then \ + git fetch origin $TE_TAG && \ + git checkout FETCH_HEAD; \ + fi && \ + + + (rm -rf TransformerEngine || true) && + git clone https://github.com/NVIDIA/TransformerEngine.git && + pushd TransformerEngine && + git checkout ${TE_TAG} + + if [[ "$mode" == "build" ]]; then + git submodule init && git submodule update && \ + NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip wheel . && \ + ls -al + else + pip install /tmp/te/transformerengine*.whl + fi +} + aligner() { local mode="$1" diff --git a/setup/manifest.json b/setup/manifest.json new file mode 100644 index 000000000..f9c647d87 --- /dev/null +++ b/setup/manifest.json @@ -0,0 +1,25 @@ +{ + "ngc-pytorch": "nvcr.io/nvidia/pytorch:24.07-py3", + "vcs-dependencies": { + "aligner": { + "repo": "https://github.com/NVIDIA/NeMo-Aligner", + "ref": "main" + }, + "trtllm": { + "repo": "https://github.com/NVIDIA/TensorRT-LLM", + "ref": "v0.13.0" + }, + "te": { + "repo": "https://github.com/NVIDIA/TransformerEngine", + "ref": "7d576ed25266a17a7b651f2c12e8498f67e0baea" + }, + "nemo": { + "repo": "https://github.com/NVIDIA/NeMo", + "ref": "ko3n1g/build/improve-installer" + } + }, + "pypi-dependencies": { + "protobuf": "4.24.4", + "pytriton": "0.5.10" + } +} \ No newline at end of file From 6f079f4b3f08e4a5fd8b50bddfb95b5ea5eaef3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 22:55:29 +0100 Subject: [PATCH 20/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 0ac40c0bb..b5e3b36ac 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -44,7 +44,7 @@ jobs: test_to_run: ${{ steps.test_to_run.outputs.main }} all: ${{ steps.all.outputs.main }} run_ci: ${{ steps.evaluate.outputs.run_ci }} - build_args: ${{ steps.manifest.outputs.build_args }} + build_args: ${{ steps.manifest.outputs.BUILD_ARGS }} steps: - name: Parse test_to_run id: test_to_run @@ -114,9 +114,9 @@ jobs: EOF ) - echo "BUILD_ARGS<> $GITHUB_ENV - echo "$BUILD_ARGS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV + echo "BUILD_ARGS<> $GITHUB_OUTPUT + echo "$BUILD_ARGS" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT build-container: if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} @@ -129,8 +129,8 @@ jobs: build-args: | MAX_JOBS=32 ALIGNER_COMMIT=${{ github.sha }} - ${{ env.BUILD_ARGS }} - + ${{ needs.pre-flight.outputs.BUILD_ARGS }} + Unit_Tests: name: ${{ matrix.test_case }} needs: [build-container, pre-flight] From 77353ac3f6e609fad24e9dc5a9d398bf1b4f0de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 22:59:33 +0100 Subject: [PATCH 21/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 2 ++ Dockerfile | 15 ++++----------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index b5e3b36ac..ad0881227 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -99,6 +99,8 @@ jobs: - name: Parse manifest.json id: manifest run: | + cd ${{ github.run_id }} + BUILD_ARGS=$(cat << EOF BASE_IMAGE=$(cat setup/manifest.json | jq -r '."ngc-pytorch"') NEMO_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".nemo.repo') diff --git a/Dockerfile b/Dockerfile index 5679f5124..e1f24e5c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,17 +57,10 @@ RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ FROM ${BASE_IMAGE} as te-wheel ARG MAX_JOBS ARG TE_TAG -RUN cd /opt && \ - git clone https://github.com/NVIDIA/TransformerEngine.git && \ - cd TransformerEngine && \ - if [ ! -z $TE_TAG ]; then \ - git fetch origin $TE_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - git submodule init && git submodule update && \ - NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip wheel . && \ - ls -al - +RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ + cd /opt/NeMo-Aligner && \ + bash reinstall.sh --library te --mode build && \ + ls -al /opt/TransformerEngine FROM ${BASE_IMAGE} AS final LABEL "nemo.library"="nemo-aligner" From a0042511e61be14216c3d092568780ee3f18e805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 23:01:16 +0100 Subject: [PATCH 22/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index e1f24e5c2..ae3afc63f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,7 @@ RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ FROM ${BASE_IMAGE} as te-wheel ARG MAX_JOBS ARG TE_TAG +COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ cd /opt/NeMo-Aligner && \ bash reinstall.sh --library te --mode build && \ From ab67cdeb576cc5219c44375fef438dc8f74716bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 23:07:40 +0100 Subject: [PATCH 23/41] fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/cicd-main.yml | 3 ++- Dockerfile | 21 +++++++-------------- reinstall.sh | 2 ++ setup/manifest.json | 3 ++- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index ad0881227..8789a0432 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -100,7 +100,7 @@ jobs: id: manifest run: | cd ${{ github.run_id }} - + BUILD_ARGS=$(cat << EOF BASE_IMAGE=$(cat setup/manifest.json | jq -r '."ngc-pytorch"') NEMO_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".nemo.repo') @@ -113,6 +113,7 @@ jobs: TRTLLM_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".trtllm.ref') PROTOBUF_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".protobuf') PYTRITON_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".pytriton') + PYNVML_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".pynvml') EOF ) diff --git a/Dockerfile b/Dockerfile index ae3afc63f..f6ce0433b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ ARG TRTLLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git ARG TRTLLM_VERSION=v0.13.0 ARG PROTOBUF_VERSION=4.24.4 ARG PYTRITON_VERSION=0.5.10 - +ARG PYNVML_VERSION=11.5.3 FROM ${BASE_IMAGE} AS aligner-bump ARG ALIGNER_COMMIT @@ -81,26 +81,17 @@ RUN pip uninstall -y apex && \ # TRTLLM ARG TRTLLM_VERSION +ARG PYNVML_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -RUN bash /opt/NeMo-Aligner/reinstall.sh --library trtllm --mode install +RUN bash /opt/NeMo-Aligner/reinstall.sh --library trtllm --mode install && \ +pip install --no-cache-dir pynvml==${PYNVML_VERSION} ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ -# TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a -# breaking change. The last known working verison is 11.5.3 -RUN pip install pynvml==11.5.3 - -# install TransformerEngine +# TransformerEngine COPY --from=te-wheel /opt/TransformerEngine /opt/TransformerEngine RUN pip install /opt/TransformerEngine/*.whl -# place any util pkgs here -ARG PYTRITON_VERSION -RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION -ARG PROTOBUF_VERSION -RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION -RUN pip install --upgrade-strategy only-if-needed jsonlines - # NeMo ARG NEMO_TAG COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh @@ -113,6 +104,8 @@ RUN pip install triton==3.1.0 COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG ALIGNER_COMMIT +ARG PYTRITON_VERSION +ARG PROTOBUF_VERSION RUN cd /opt/NeMo-Aligner && \ export ALIGNER_COMMIT=$ALIGNER_COMMIT && \ bash reinstall.sh --library aligner --mode install diff --git a/reinstall.sh b/reinstall.sh index 4e2e48f30..b1f579634 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -87,6 +87,8 @@ aligner() { # case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it # case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail git pull --rebase || true && + pip install --no-cache-dir --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION && + pip install --no-cache-dir -U --no-deps protobuf==$PROTOBUF_VERSION && pip install --no-cache-dir -e . && popd } diff --git a/setup/manifest.json b/setup/manifest.json index f9c647d87..96efe34cb 100644 --- a/setup/manifest.json +++ b/setup/manifest.json @@ -20,6 +20,7 @@ }, "pypi-dependencies": { "protobuf": "4.24.4", - "pytriton": "0.5.10" + "pytriton": "0.5.10", + "pynvml": "11.5.3" } } \ No newline at end of file From 1a1c2282ae2d20895a54f2cba3261d829857acd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 23:14:05 +0100 Subject: [PATCH 24/41] apex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 39 ++++++++++++++++----------------------- reinstall.sh | 29 ++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/Dockerfile b/Dockerfile index f6ce0433b..e64f82293 100644 --- a/Dockerfile +++ b/Dockerfile @@ -48,44 +48,43 @@ EOF FROM ${BASE_IMAGE} as trtllm-wheel ARG TRTLLM_VERSION COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ - cd /opt/NeMo-Aligner && \ +RUN cd /opt/NeMo-Aligner && \ bash reinstall.sh --library trtllm --mode build && \ ls -al /opt/TensorRT-LLM -# install TransformerEngine FROM ${BASE_IMAGE} as te-wheel ARG MAX_JOBS ARG TE_TAG COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -RUN export TRTLLM_VERSION=$TRTLLM_VERSION && \ - cd /opt/NeMo-Aligner && \ +RUN cd /opt/NeMo-Aligner && \ bash reinstall.sh --library te --mode build && \ ls -al /opt/TransformerEngine +FROM ${BASE_IMAGE} as apex-wheel +ARG APEX_TAG +ARG MAX_JOBS +COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh +RUN cd /opt/NeMo-Aligner && \ + bash reinstall.sh --library apex --mode build && \ + ls -al /opt/Apex + FROM ${BASE_IMAGE} AS final LABEL "nemo.library"="nemo-aligner" WORKDIR /opt # needed in case git complains that it can't detect a valid email, this email is fake but works RUN git config --global user.email "worker@nvidia.com" -# install latest apex -ARG APEX_TAG -RUN pip uninstall -y apex && \ - git clone https://github.com/NVIDIA/apex && \ - cd apex && \ - if [ ! -z $APEX_TAG ]; then \ - git fetch origin $APEX_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ + +# Apex +COPY --from=apex-wheel /opt/Apex /tmp/apex +COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh +RUN bash /opt/NeMo-Aligner/reinstall.sh --library apex --mode install # TRTLLM -ARG TRTLLM_VERSION ARG PYNVML_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh RUN bash /opt/NeMo-Aligner/reinstall.sh --library trtllm --mode install && \ -pip install --no-cache-dir pynvml==${PYNVML_VERSION} + pip install --no-cache-dir pynvml==${PYNVML_VERSION} ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TransformerEngine @@ -97,17 +96,11 @@ ARG NEMO_TAG COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh RUN bash /opt/NeMo-Aligner/reinstall.sh --library nemo --mode install -# TODO: While we are on Pytorch 24.07, we need to downgrade triton since 3.2.0 introduced a breaking change -# This un-pinned requirement comes from mamba-ssm, and this pin can be removed once Pytorch base image is -# updated. -RUN pip install triton==3.1.0 - COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG ALIGNER_COMMIT ARG PYTRITON_VERSION ARG PROTOBUF_VERSION RUN cd /opt/NeMo-Aligner && \ - export ALIGNER_COMMIT=$ALIGNER_COMMIT && \ bash reinstall.sh --library aligner --mode install RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch diff --git a/reinstall.sh b/reinstall.sh index b1f579634..cacea6b1f 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -6,6 +6,8 @@ cd /opt ALL_LIBRARIES=( "nemo" "trtllm" + "te" + "apex" "aligner" ) @@ -52,15 +54,6 @@ trtllm() { te() { local mode="$1" cd /opt - - cd /opt && \ - git clone https://github.com/NVIDIA/TransformerEngine.git && \ - cd TransformerEngine && \ - if [ ! -z $TE_TAG ]; then \ - git fetch origin $TE_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - (rm -rf TransformerEngine || true) && git clone https://github.com/NVIDIA/TransformerEngine.git && @@ -76,6 +69,24 @@ te() { fi } +apex() { + local mode="$1" + cd /opt + + (rm -rf Apex || true) && + git clone https://github.com/NVIDIA/Apex.git && + pushd Apex && + git checkout ${APEX_TAG} + + if [[ "$mode" == "build" ]]; then + pip wheel -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ && \ + ls -al + else + pip install /tmp/apex/apex*.whl + fi +} + + aligner() { local mode="$1" From 4eb1403c9fe78d840a4b3040ab9465a34b74203a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 29 Jan 2025 23:36:23 +0100 Subject: [PATCH 25/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 20 ++++++-------------- reinstall.sh | 5 ++++- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/Dockerfile b/Dockerfile index e64f82293..f2ca9b45e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -77,33 +77,25 @@ RUN git config --global user.email "worker@nvidia.com" # Apex COPY --from=apex-wheel /opt/Apex /tmp/apex COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -RUN bash /opt/NeMo-Aligner/reinstall.sh --library apex --mode install # TRTLLM ARG PYNVML_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -RUN bash /opt/NeMo-Aligner/reinstall.sh --library trtllm --mode install && \ - pip install --no-cache-dir pynvml==${PYNVML_VERSION} ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TransformerEngine COPY --from=te-wheel /opt/TransformerEngine /opt/TransformerEngine -RUN pip install /opt/TransformerEngine/*.whl - -# NeMo -ARG NEMO_TAG -COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh -RUN bash /opt/NeMo-Aligner/reinstall.sh --library nemo --mode install COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner -ARG ALIGNER_COMMIT -ARG PYTRITON_VERSION +ARG NEMO_REPO +ARG NEMO_TAG ARG PROTOBUF_VERSION +ARG PYTRITON_VERSION +ARG PYNVML_VERSION RUN cd /opt/NeMo-Aligner && \ - bash reinstall.sh --library aligner --mode install - -RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch + bash reinstall.sh --library all --mode install && \ + cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs RUN <<"EOF" bash -exu diff --git a/reinstall.sh b/reinstall.sh index cacea6b1f..7e27df844 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -3,6 +3,7 @@ set -ex cd /opt # List of all supported libraries (update this list when adding new libraries) +# This also defines the order in which they will be installed by --libraries "all" ALL_LIBRARIES=( "nemo" "trtllm" @@ -47,7 +48,8 @@ trtllm() { . docker/common/install_tensorrt.sh && python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks else - pip install /tmp/trtllm/tensorrt_llm*.whl + pip install --no-cache-dir /tmp/trtllm/tensorrt_llm*.whl + pip install --no-cache-dir pynvml==${PYNVML_VERSION} fi } @@ -89,6 +91,7 @@ apex() { aligner() { local mode="$1" + cd /opt (rm -rf NeMo-Aligner || true) && git clone https://github.com/NVIDIA/NeMo-Aligner.git && From 36b90023765df953f4b3de4683ee3c6707e4ceac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 17:33:57 +0100 Subject: [PATCH 26/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index f2ca9b45e..6ef3bce44 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,8 +94,9 @@ ARG PROTOBUF_VERSION ARG PYTRITON_VERSION ARG PYNVML_VERSION RUN cd /opt/NeMo-Aligner && \ - bash reinstall.sh --library all --mode install && \ - cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch + bash reinstall.sh --library all --mode install + #&& \ + #cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs RUN <<"EOF" bash -exu From 272036d21a14a7f935dfe4ca74c121b32d60ade2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 19:34:23 +0100 Subject: [PATCH 27/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6ef3bce44..11567cecb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -85,7 +85,7 @@ COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinst ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TransformerEngine -COPY --from=te-wheel /opt/TransformerEngine /opt/TransformerEngine +COPY --from=te-wheel /opt/TransformerEngine /tmp/te COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG NEMO_REPO From d1466cee6d5d95e60201c065b8454bc5c8bc57d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 19:50:00 +0100 Subject: [PATCH 28/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 11567cecb..b4115d390 100644 --- a/Dockerfile +++ b/Dockerfile @@ -59,6 +59,7 @@ COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinst RUN cd /opt/NeMo-Aligner && \ bash reinstall.sh --library te --mode build && \ ls -al /opt/TransformerEngine +RUN ls -al /opt/TransformerEngine FROM ${BASE_IMAGE} as apex-wheel ARG APEX_TAG @@ -74,18 +75,22 @@ WORKDIR /opt # needed in case git complains that it can't detect a valid email, this email is fake but works RUN git config --global user.email "worker@nvidia.com" +# Copy installer script +COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh + # Apex COPY --from=apex-wheel /opt/Apex /tmp/apex -COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh +RUN /opt/NeMo-Aligner/reinstall.sh --mode install --library apex # TRTLLM ARG PYNVML_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm -COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh +RUN /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TransformerEngine COPY --from=te-wheel /opt/TransformerEngine /tmp/te +RUN /opt/NeMo-Aligner/reinstall.sh --mode install --library te COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG NEMO_REPO From 29da950a03d972e04b0debc2baa8d18cc1102c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 19:51:28 +0100 Subject: [PATCH 29/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 1 - reinstall.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index b4115d390..babac1798 100644 --- a/Dockerfile +++ b/Dockerfile @@ -59,7 +59,6 @@ COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinst RUN cd /opt/NeMo-Aligner && \ bash reinstall.sh --library te --mode build && \ ls -al /opt/TransformerEngine -RUN ls -al /opt/TransformerEngine FROM ${BASE_IMAGE} as apex-wheel ARG APEX_TAG diff --git a/reinstall.sh b/reinstall.sh index 7e27df844..1cd4913d8 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -67,7 +67,7 @@ te() { NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip wheel . && \ ls -al else - pip install /tmp/te/transformerengine*.whl + pip install /tmp/te/transformer_engine*.whl fi } From 028d5d03dabf59ba7a8dd2c50eb31686729682d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 20:44:13 +0100 Subject: [PATCH 30/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index babac1798..c1f497fab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -97,8 +97,7 @@ ARG NEMO_TAG ARG PROTOBUF_VERSION ARG PYTRITON_VERSION ARG PYNVML_VERSION -RUN cd /opt/NeMo-Aligner && \ - bash reinstall.sh --library all --mode install +RUN /opt/NeMo-Aligner/reinstall.sh --library all --mode install #&& \ #cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch From 372c8a254b9e6e7aace90917fcd66a5ebd28349f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 20:53:03 +0100 Subject: [PATCH 31/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index c1f497fab..9ba87569c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -97,9 +97,9 @@ ARG NEMO_TAG ARG PROTOBUF_VERSION ARG PYTRITON_VERSION ARG PYNVML_VERSION -RUN /opt/NeMo-Aligner/reinstall.sh --library all --mode install - #&& \ - #cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch +RUN cd /opt/NeMo-Aligner && \ + bash reinstall.sh --library all --mode install && \ + cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs RUN <<"EOF" bash -exu From 0e0224d02c71fc6cfe7041ba777f3eff47d3708f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 20:55:55 +0100 Subject: [PATCH 32/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9ba87569c..ce9f1be18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,17 +79,17 @@ COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinst # Apex COPY --from=apex-wheel /opt/Apex /tmp/apex -RUN /opt/NeMo-Aligner/reinstall.sh --mode install --library apex +RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library apex # TRTLLM ARG PYNVML_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm -RUN /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm +RUN bash reinstall.sh /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TransformerEngine COPY --from=te-wheel /opt/TransformerEngine /tmp/te -RUN /opt/NeMo-Aligner/reinstall.sh --mode install --library te +RUN bash reinstall.sh /opt/NeMo-Aligner/reinstall.sh --mode install --library te COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG NEMO_REPO @@ -97,8 +97,7 @@ ARG NEMO_TAG ARG PROTOBUF_VERSION ARG PYTRITON_VERSION ARG PYNVML_VERSION -RUN cd /opt/NeMo-Aligner && \ - bash reinstall.sh --library all --mode install && \ +RUN bash /opt/NeMo-Aligner/reinstall.sh --library all --mode install && \ cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs From 50a5cc4c1ee2d7f2093508c7eb60120422785b34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 22:11:08 +0100 Subject: [PATCH 33/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ce9f1be18..356aefb4f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -84,7 +84,7 @@ RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library apex # TRTLLM ARG PYNVML_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm -RUN bash reinstall.sh /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm +RUN bash /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TransformerEngine From 8d30156e317b908b695179ba057171084bc30f5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 22:17:26 +0100 Subject: [PATCH 34/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 356aefb4f..e91f15acd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -84,12 +84,12 @@ RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library apex # TRTLLM ARG PYNVML_VERSION COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm -RUN bash /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm +RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TransformerEngine COPY --from=te-wheel /opt/TransformerEngine /tmp/te -RUN bash reinstall.sh /opt/NeMo-Aligner/reinstall.sh --mode install --library te +RUN bash /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh --mode install --library te COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG NEMO_REPO From 7858e0ce1cc843de2b987d3b0ac854bb9e010473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 22:24:57 +0100 Subject: [PATCH 35/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e91f15acd..41d23c07c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -89,7 +89,7 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ # TransformerEngine COPY --from=te-wheel /opt/TransformerEngine /tmp/te -RUN bash /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh --mode install --library te +RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library te COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG NEMO_REPO From 1b0c298c87c88d425e57732149ddaebcda0ae198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 22:34:52 +0100 Subject: [PATCH 36/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 41d23c07c..3495900ee 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,6 +94,7 @@ RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library te COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG NEMO_REPO ARG NEMO_TAG +ARG ALIGNER_COMMIT ARG PROTOBUF_VERSION ARG PYTRITON_VERSION ARG PYNVML_VERSION From 6f4d460ef8b6bcab1416a57e4710a61997e47c9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 22:44:50 +0100 Subject: [PATCH 37/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3495900ee..33848f51f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -98,8 +98,9 @@ ARG ALIGNER_COMMIT ARG PROTOBUF_VERSION ARG PYTRITON_VERSION ARG PYNVML_VERSION -RUN bash /opt/NeMo-Aligner/reinstall.sh --library all --mode install && \ - cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch +RUN bash /opt/NeMo-Aligner/reinstall.sh --library all --mode install +# && \ +# cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs RUN <<"EOF" bash -exu From 8b4774eb92cefbfef6af0180392b6ca4ac25636d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 22:49:22 +0100 Subject: [PATCH 38/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 33848f51f..3a20a4a2b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -91,9 +91,11 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ COPY --from=te-wheel /opt/TransformerEngine /tmp/te RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library te -COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG NEMO_REPO ARG NEMO_TAG +RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library nemo + +COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner ARG ALIGNER_COMMIT ARG PROTOBUF_VERSION ARG PYTRITON_VERSION From a916b274fe6937675011f139392c7003b64f7318 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 1 Feb 2025 23:06:04 +0100 Subject: [PATCH 39/41] f MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- reinstall.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/reinstall.sh b/reinstall.sh index 1cd4913d8..38c23c5b2 100644 --- a/reinstall.sh +++ b/reinstall.sh @@ -97,6 +97,7 @@ aligner() { git clone https://github.com/NVIDIA/NeMo-Aligner.git && pushd NeMo-Aligner && git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge' && + git fetch origin $ALIGNER_COMMIT && git checkout -f $ALIGNER_COMMIT && # case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it # case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail From 0aa022d9abe65e24b2fe5eda98a216a52028d9af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 2 Feb 2025 00:31:52 +0100 Subject: [PATCH 40/41] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3a20a4a2b..5a4906aef 100644 --- a/Dockerfile +++ b/Dockerfile @@ -105,25 +105,25 @@ RUN bash /opt/NeMo-Aligner/reinstall.sh --library all --mode install # cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs -RUN <<"EOF" bash -exu -cd NeMo -# Ensures we don't cherry-pick "future" origin/main commits -git fetch -a -# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651 -# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652 -# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863 -# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654 -for pr_and_commit in \ - "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \ - "10652 60e677423667c029dd05875da72bf0719774f844" \ - "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \ -; do - pr=$(cut -f1 -d' ' <<<"$pr_and_commit") - head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit") - git fetch origin $head_pr_commit:PR-${pr} - # cherry-picks all commits between main and the top of the PR - git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr} - # Tag cherry-picks to help - git tag cherry-pick-PR-${pr} -done -EOF +# RUN <<"EOF" bash -exu +# cd NeMo +# # Ensures we don't cherry-pick "future" origin/main commits +# git fetch -a +# # 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651 +# # 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652 +# # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863 +# # (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654 +# for pr_and_commit in \ +# "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \ +# "10652 60e677423667c029dd05875da72bf0719774f844" \ +# "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \ +# ; do +# pr=$(cut -f1 -d' ' <<<"$pr_and_commit") +# head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit") +# git fetch origin $head_pr_commit:PR-${pr} +# # cherry-picks all commits between main and the top of the PR +# git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr} +# # Tag cherry-picks to help +# git tag cherry-pick-PR-${pr} +# done +# EOF From 7bab11167595b26af19e648d83448b7c4cecb8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 2 Feb 2025 00:51:29 +0100 Subject: [PATCH 41/41] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 5a4906aef..1635975c3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -104,6 +104,7 @@ RUN bash /opt/NeMo-Aligner/reinstall.sh --library all --mode install # && \ # cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch + # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs # RUN <<"EOF" bash -exu # cd NeMo