Build: drive container dependency pins from setup/manifest.json

* .github/workflows/cicd-main.yml: the pre-flight job checks out the repository,
  reads setup/manifest.json with jq and publishes the values as the multi-line
  `build_args` job output, which build-container appends to its build-args.
* Dockerfile: TensorRT-LLM, TransformerEngine and Apex are built in dedicated
  stages through reinstall.sh; the final stage copies their output and installs it.
* reinstall.sh: new installer script (--library <names|all> --mode <build|install>).
* setup/manifest.json: new file holding the pinned image, refs and versions.
* setup/requirements.txt: packages now handled by reinstall.sh are commented out.

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 728f3b6bf..8789a0432 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -44,12 +44,14 @@ jobs:
       test_to_run: ${{ steps.test_to_run.outputs.main }}
       all: ${{ steps.all.outputs.main }}
       run_ci: ${{ steps.evaluate.outputs.run_ci }}
+      build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
     steps:
       - name: Parse test_to_run
         id: test_to_run
         run: |
           parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")')
           echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"
+
       - name: Parse all
         id: all
         run: |
@@ -89,6 +91,36 @@ jobs:
           # Run CI only (on main or if label is attached) and if it's not only docs
           echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false" || "$MERGE_GROUP" = "true") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT"
 
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+
+      - name: Parse manifest.json
+        id: manifest
+        run: |
+          cd ${{ github.run_id }}
+
+          BUILD_ARGS=$(cat << EOF
+          BASE_IMAGE=$(cat setup/manifest.json | jq -r '."ngc-pytorch"')
+          NEMO_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".nemo.repo')
+          NEMO_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".nemo.ref')
+          MLM_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".megatron.repo')
+          MLM_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".megatron.ref')
+          TE_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".te.repo')
+          TE_TAG=$(cat setup/manifest.json | jq -r '."vcs-dependencies".te.ref')
+          TRTLLM_REPO=$(cat setup/manifest.json | jq -r '."vcs-dependencies".trtllm.repo')
+          TRTLLM_VERSION=$(cat setup/manifest.json | jq -r '."vcs-dependencies".trtllm.ref')
+          PROTOBUF_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".protobuf')
+          PYTRITON_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".pytriton')
+          PYNVML_VERSION=$(cat setup/manifest.json | jq -r '."pypi-dependencies".pynvml')
+          EOF
+          )
+
+          echo "BUILD_ARGS<<EOF" >> $GITHUB_OUTPUT
+          echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
+          echo "EOF" >> $GITHUB_OUTPUT
+
   build-container:
     if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
     needs: [pre-flight]
@@ -100,7 +132,8 @@ jobs:
       build-args: |
         MAX_JOBS=32
         ALIGNER_COMMIT=${{ github.sha }}
-
+        ${{ needs.pre-flight.outputs.build_args }}
+
   Unit_Tests:
     name: ${{ matrix.test_case }}
     needs: [build-container, pre-flight]
diff --git a/Dockerfile b/Dockerfile
index 6eb6aad40..1635975c3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,14 +11,21 @@
 # if you get errors building TE or Apex, decrease this to 4
 ARG MAX_JOBS=8
 # Git refs for dependencies
-ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG PYTRITON_VERSION=0.5.10
+
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
+ARG NEMO_REPO=https://github.com/NVIDIA/NeMo
 ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634  # On: main
+ARG MLM_REPO=https://github.com/NVIDIA/Megatron-LM
 ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3  # On: main
+ARG ALIGNER_REPO=https://github.com/NVIDIA/NeMo-Aligner
 ARG ALIGNER_COMMIT=main
+ARG TE_REPO=https://github.com/NVIDIA/TransformerEngine
+ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
+ARG TRTLLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git
 ARG TRTLLM_VERSION=v0.13.0
 ARG PROTOBUF_VERSION=4.24.4
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
+ARG PYTRITON_VERSION=0.5.10
+ARG PYNVML_VERSION=11.5.3
 
 FROM ${BASE_IMAGE} AS aligner-bump
 ARG ALIGNER_COMMIT
@@ -38,118 +45,86 @@ git pull --rebase || true
 pip install --no-cache-dir --no-deps -e .
 EOF
 
+FROM ${BASE_IMAGE} AS trtllm-wheel
+ARG TRTLLM_VERSION
+COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
+RUN cd /opt/NeMo-Aligner && \
+    bash reinstall.sh --library trtllm --mode build && \
+    ls -al /opt/TensorRT-LLM
+
+FROM ${BASE_IMAGE} AS te-wheel
+ARG MAX_JOBS
+ARG TE_TAG
+COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
+RUN cd /opt/NeMo-Aligner && \
+    bash reinstall.sh --library te --mode build && \
+    ls -al /opt/TransformerEngine
+
+FROM ${BASE_IMAGE} AS apex-wheel
+ARG APEX_TAG
+ARG MAX_JOBS
+COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
+RUN cd /opt/NeMo-Aligner && \
+    bash reinstall.sh --library apex --mode build && \
+    ls -al /opt/Apex
+
 FROM ${BASE_IMAGE} AS final
 LABEL "nemo.library"="nemo-aligner"
 WORKDIR /opt
 # needed in case git complains that it can't detect a valid email, this email is fake but works
 RUN git config --global user.email "worker@nvidia.com"
-# install latest apex
-ARG APEX_TAG
-RUN pip uninstall -y apex && \
-    git clone https://github.com/NVIDIA/apex && \
-    cd apex && \
-    if [ ! -z $APEX_TAG ]; then \
-        git fetch origin $APEX_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
-
-# Git LFS
-RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
-    apt-get install git-lfs && \
-    git lfs install && \
-    apt-get clean
+
+# Copy installer script
+COPY --from=aligner-bump /opt/NeMo-Aligner/reinstall.sh /opt/NeMo-Aligner/reinstall.sh
+
+# Apex
+COPY --from=apex-wheel /opt/Apex /tmp/apex
+RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library apex
 
 # TRTLLM
-ARG TRTLLM_VERSION
-RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
-    cd TensorRT-LLM && \
-    git checkout ${TRTLLM_VERSION} && \
-    . docker/common/install_tensorrt.sh && \
-    python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \
-    pip install -e .
+ARG PYNVML_VERSION
+COPY --from=trtllm-wheel /opt/TensorRT-LLM/build/ /tmp/trtllm
+RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library trtllm
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/
 
-# TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a
-# breaking change. The last known working verison is 11.5.3
-RUN pip install pynvml==11.5.3
-
-# install TransformerEngine
-ARG MAX_JOBS
-ARG TE_TAG
-RUN pip uninstall -y transformer-engine && \
-    git clone https://github.com/NVIDIA/TransformerEngine.git && \
-    cd TransformerEngine && \
-    if [ ! -z $TE_TAG ]; then \
-        git fetch origin $TE_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    git submodule init && git submodule update && \
-    NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
-
-# place any util pkgs here
-ARG PYTRITON_VERSION
-RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION
-ARG PROTOBUF_VERSION
-RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION
-RUN pip install --upgrade-strategy only-if-needed jsonlines
+# TransformerEngine
+COPY --from=te-wheel /opt/TransformerEngine /tmp/te
+RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library te
 
-# NeMo
+ARG NEMO_REPO
 ARG NEMO_TAG
-RUN git clone https://github.com/NVIDIA/NeMo.git && \
-    cd NeMo && \
-    git pull && \
-    if [ ! -z $NEMO_TAG ]; then \
-        git fetch origin $NEMO_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    pip uninstall -y nemo_toolkit sacrebleu && \
-    pip install -e ".[nlp]" && \
-    cd nemo/collections/nlp/data/language_modeling/megatron && make
-
-# TODO: While we are on Pytorch 24.07, we need to downgrade triton since 3.2.0 introduced a breaking change
-# This un-pinned requirement comes from mamba-ssm, and this pin can be removed once Pytorch base image is
-# updated.
-RUN pip install triton==3.1.0
-
-# MLM
-ARG MLM_TAG
-RUN pip uninstall -y megatron-core && \
-    git clone https://github.com/NVIDIA/Megatron-LM.git && \
-    cd Megatron-LM && \
-    git pull && \
-    if [ ! -z $MLM_TAG ]; then \
-        git fetch origin $MLM_TAG && \
-        git checkout FETCH_HEAD; \
-    fi && \
-    pip install -e .
+RUN bash /opt/NeMo-Aligner/reinstall.sh --mode install --library nemo
 
 COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
-RUN cd /opt/NeMo-Aligner && \
-    pip install --no-deps -e .
+ARG ALIGNER_COMMIT
+ARG PROTOBUF_VERSION
+ARG PYTRITON_VERSION
+ARG PYNVML_VERSION
+RUN bash /opt/NeMo-Aligner/reinstall.sh --library all --mode install
+# && \
+#     cd /opt/TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
 
-RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
 
 # TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
-RUN <<"EOF" bash -exu
-cd NeMo
-# Ensures we don't cherry-pick "future" origin/main commits
-git fetch -a
-# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
-# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
-# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
-# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
-for pr_and_commit in \
-  "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
-  "10652 60e677423667c029dd05875da72bf0719774f844" \
-  "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
-; do
-  pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
-  head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
-  git fetch origin $head_pr_commit:PR-${pr}
-  # cherry-picks all commits between main and the top of the PR
-  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
-  # Tag cherry-picks to help
-  git tag cherry-pick-PR-${pr}
-done
-EOF
+# RUN <<"EOF" bash -exu
+# cd NeMo
+# # Ensures we don't cherry-pick "future" origin/main commits
+# git fetch -a
+# # 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
+# # 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
+# # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
+# # (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
+# for pr_and_commit in \
+#   "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
+#   "10652 60e677423667c029dd05875da72bf0719774f844" \
+#   "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
+# ; do
+#   pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
+#   head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
+#   git fetch origin $head_pr_commit:PR-${pr}
+#   # cherry-picks all commits between main and the top of the PR
+#   git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
+#   # Tag cherry-picks to help
+#   git tag cherry-pick-PR-${pr}
+# done
+# EOF
diff --git a/reinstall.sh b/reinstall.sh
new file mode 100644
index 000000000..38c23c5b2
--- /dev/null
+++ b/reinstall.sh
@@ -0,0 +1,208 @@
+#!/usr/bin/env bash
+# Builds or installs the pinned NeMo-Aligner dependencies under /opt.
+# Usage: reinstall.sh --library <name[,name...]|all> --mode <build|install>
+# Refs and versions are read from the environment: NEMO_TAG, TRTLLM_VERSION, TE_TAG,
+# APEX_TAG, ALIGNER_COMMIT, PYTRITON_VERSION, PROTOBUF_VERSION, PYNVML_VERSION.
+set -ex
+cd /opt
+
+# List of all supported libraries (update this list when adding new libraries)
+# This also defines the order in which they will be installed by --library "all"
+ALL_LIBRARIES=(
+  "nemo"
+  "trtllm"
+  "te"
+  "apex"
+  "aligner"
+)
+
+# --------------------------
+# Library Functions (Implement your logic here)
+# --------------------------
+
+# Clone NeMo at NEMO_TAG and run the reinstall.sh found in that checkout ($mode is unused).
+nemo() {
+  local mode="$1"
+  cd /opt
+
+  (rm -rf NeMo || true) &&
+    git clone https://github.com/NVIDIA/NeMo.git &&
+    pushd NeMo &&
+    git fetch &&
+    git checkout ${NEMO_TAG} &&
+    bash reinstall.sh &&
+    popd
+}
+
+# Clone TensorRT-LLM at TRTLLM_VERSION, then either build its wheel (build) or
+# pip-install /tmp/trtllm/tensorrt_llm*.whl together with the pynvml pin (install).
+trtllm() {
+  local mode="$1"
+  cd /opt
+
+  (rm -rf TensorRT-LLM || true) &&
+    git clone https://github.com/NVIDIA/TensorRT-LLM.git &&
+    pushd TensorRT-LLM &&
+    git checkout ${TRTLLM_VERSION}
+
+  if [[ "$mode" == "build" ]]; then
+    curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash &&
+      apt-get install -y git-lfs &&
+      git lfs install &&
+      git lfs pull &&
+      apt-get clean
+
+    . docker/common/install_tensorrt.sh &&
+      python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks
+  else
+    pip install --no-cache-dir /tmp/trtllm/tensorrt_llm*.whl
+    pip install --no-cache-dir pynvml==${PYNVML_VERSION}
+  fi
+}
+
+# Clone TransformerEngine at TE_TAG, then either build a wheel inside the checkout
+# (build) or pip-install /tmp/te/transformer_engine*.whl (install).
+te() {
+  local mode="$1"
+  cd /opt
+
+  (rm -rf TransformerEngine || true) &&
+    git clone https://github.com/NVIDIA/TransformerEngine.git &&
+    pushd TransformerEngine &&
+    git checkout ${TE_TAG}
+
+  if [[ "$mode" == "build" ]]; then
+    git submodule init && git submodule update && \
+      NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip wheel . && \
+      ls -al
+  else
+    pip install /tmp/te/transformer_engine*.whl
+  fi
+}
+
+# Clone Apex at APEX_TAG, then either build a wheel inside the checkout (build)
+# or pip-install /tmp/apex/apex*.whl (install).
+apex() {
+  local mode="$1"
+  cd /opt
+
+  (rm -rf Apex || true) &&
+    git clone https://github.com/NVIDIA/Apex.git &&
+    pushd Apex &&
+    git checkout ${APEX_TAG}
+
+  if [[ "$mode" == "build" ]]; then
+    pip wheel -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ && \
+      ls -al
+  else
+    pip install /tmp/apex/apex*.whl
+  fi
+}
+
+
+# Clone NeMo-Aligner at ALIGNER_COMMIT, install the pinned nvidia-pytriton and
+# protobuf, then install Aligner itself in editable mode ($mode is unused).
+aligner() {
+  local mode="$1"
+  cd /opt
+
+  (rm -rf NeMo-Aligner || true) &&
+    git clone https://github.com/NVIDIA/NeMo-Aligner.git &&
+    pushd NeMo-Aligner &&
+    git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge' &&
+    git fetch origin $ALIGNER_COMMIT &&
+    git checkout -f $ALIGNER_COMMIT &&
+    # case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it
+    # case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail
+    (git pull --rebase || true) &&
+    pip install --no-cache-dir --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION &&
+    pip install --no-cache-dir -U --no-deps protobuf==$PROTOBUF_VERSION &&
+    pip install --no-cache-dir -e . &&
+    popd
+}
+
+# --------------------------
+# Argument Parsing & Validation
+# --------------------------
+
+# Parse command-line arguments
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+  --library)
+    LIBRARY_ARG="$2"
+    shift 2
+    ;;
+  --mode)
+    MODE="$2"
+    shift 2
+    ;;
+  *)
+    echo "Unknown option: $1"
+    exit 1
+    ;;
+  esac
+done
+
+# Validate required arguments
+if [[ -z "$LIBRARY_ARG" ]]; then
+  echo "Error: --library argument is required"
+  exit 1
+fi
+
+if [[ -z "$MODE" ]]; then
+  echo "Error: --mode argument is required"
+  exit 1
+fi
+
+# Validate mode
+if [[ "$MODE" != "build" && "$MODE" != "install" ]]; then
+  echo "Error: Invalid mode. Must be 'build' or 'install'"
+  exit 1
+fi
+
+# Process library argument
+declare -a LIBRARIES
+if [[ "$LIBRARY_ARG" == "all" ]]; then
+  LIBRARIES=("${ALL_LIBRARIES[@]}")
+else
+  IFS=',' read -ra TEMP_ARRAY <<<"$LIBRARY_ARG"
+  for lib in "${TEMP_ARRAY[@]}"; do
+    trimmed_lib=$(echo "$lib" | xargs)
+    if [[ -n "$trimmed_lib" ]]; then
+      LIBRARIES+=("$trimmed_lib")
+    fi
+  done
+fi
+
+# Validate libraries array
+if [[ ${#LIBRARIES[@]} -eq 0 ]]; then
+  echo "Error: No valid libraries specified"
+  exit 1
+fi
+
+# Validate each library is supported
+for lib in "${LIBRARIES[@]}"; do
+  if [[ ! " ${ALL_LIBRARIES[@]} " =~ " ${lib} " ]]; then
+    echo "Error: Unsupported library '$lib'"
+    exit 1
+  fi
+done
+
+# --------------------------
+# Execution Logic
+# --------------------------
+
+# Run operations for each library
+for library in "${LIBRARIES[@]}"; do
+  echo "Processing $library ($MODE)..."
+  "$library" "$MODE"
+
+  # NOTE: `set -e` already aborts the script when the function fails; this is only a safeguard
+  if [[ $? -ne 0 ]]; then
+    echo "Error: Operation failed for $library"
+    exit 1
+  fi
+done
+
+echo "All operations completed successfully"
+exit 0
diff --git a/setup/manifest.json b/setup/manifest.json
new file mode 100644
index 000000000..96efe34cb
--- /dev/null
+++ b/setup/manifest.json
@@ -0,0 +1,26 @@
+{
+  "ngc-pytorch": "nvcr.io/nvidia/pytorch:24.07-py3",
+  "vcs-dependencies": {
+    "aligner": {
+      "repo": "https://github.com/NVIDIA/NeMo-Aligner",
+      "ref": "main"
+    },
+    "trtllm": {
+      "repo": "https://github.com/NVIDIA/TensorRT-LLM",
+      "ref": "v0.13.0"
+    },
+    "te": {
+      "repo": "https://github.com/NVIDIA/TransformerEngine",
+      "ref": "7d576ed25266a17a7b651f2c12e8498f67e0baea"
+    },
+    "nemo": {
+      "repo": "https://github.com/NVIDIA/NeMo",
+      "ref": "ko3n1g/build/improve-installer"
+    }
+  },
+  "pypi-dependencies": {
+    "protobuf": "4.24.4",
+    "pytriton": "0.5.10",
+    "pynvml": "11.5.3"
+  }
+}
\ No newline at end of file
diff --git a/setup/requirements.txt b/setup/requirements.txt
index 4aa22afa1..80758d8ad 100644
--- a/setup/requirements.txt
+++ b/setup/requirements.txt
@@ -1,8 +1,8 @@
 Jinja2~=3.1.4
 jsonlines
-megatron_core>=0.8
-nemo_toolkit[nlp]
-nvidia-pytriton
+# megatron_core>=0.8
+# nemo_toolkit[nlp]
+# nvidia-pytriton
 # pynvml pin is needed for TRTLLM v0.13.0 since 12.0.0 contains a breaking change.
-pynvml==11.5.3
-tensorrt-llm==0.13.0
+# pynvml==11.5.3
+# tensorrt-llm==0.13.0