From ee06038dd0561560e360d1d49a97e519662eb535 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 3 Sep 2024 13:50:02 -0700 Subject: [PATCH 01/14] update pad_sequence_length_to_mult for context parallel Signed-off-by: ashors1 --- nemo_aligner/data/nlp/builders.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index 89a967fe1..eab920602 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -265,12 +265,21 @@ def build_dataset(index, name): def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None): + # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 + # When using sequence parallel, sequence will further be split by TP size + # When using context parallel, sequence is split by CP size as well + pad_seq_length_to_mult = ( + 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 + ) + pad_seq_length_to_mult *= self.cfg.get('context_parallel_size', 1) + dataset_cls = GPTSFTChatDataset if is_chat else GPTSFTDataset dataset = dataset_cls( file_path=data_cfg.file_path, tokenizer=tokenizer, max_seq_length=data_cfg.max_seq_length, min_seq_length=data_cfg.min_seq_length, + pad_seq_length_to_mult=pad_seq_length_to_mult, add_bos=data_cfg.get("add_bos", False), add_eos=data_cfg.get("add_eos", True), add_sep=data_cfg.get("add_sep", False), From 5b26ab3723cf6cc6a49d4eb6b3f23b48c8e4dfdc Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 13 Nov 2024 14:41:12 -0800 Subject: [PATCH 02/14] bug fix Signed-off-by: ashors1 --- nemo_aligner/models/nlp/gpt/gpt_sft_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py index d3a615500..6acccea5b 100644 --- a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py +++ b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py @@ -88,7 +88,7 @@ def get_loss_and_metrics(self, batch, forward_only): set_sync_funcs(self, forward_only) fwd_bwd_function = get_forward_backward_func() - fwd_loss_fn = self.get_forward_output_and_loss_func(forward_only) + fwd_loss_fn = self.get_forward_output_and_loss_func(forward_only, tuning=True) losses_reduced = fwd_bwd_function( forward_step_func=fwd_loss_fn, From 2919f2238b50405c14ad67f067c2f709e14cfef5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Nov 2024 06:24:12 +0000 Subject: [PATCH 03/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: NeMo-Aligner CI --- nemo_aligner/data/nlp/builders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index 4a320ad2e..86e0244e9 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -274,9 +274,9 @@ def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, i # When using sequence parallel, sequence will further be split by TP size # When using context parallel, sequence is split by CP size as well pad_seq_length_to_mult = ( - 8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16 + 8 * self.cfg.get("tensor_model_parallel_size", 1) if self.cfg.get("sequence_parallel", False) else 16 ) - pad_seq_length_to_mult *= self.cfg.get('context_parallel_size', 1) + pad_seq_length_to_mult *= self.cfg.get("context_parallel_size", 1) if is_chat: assert not packed_sequence, "Sequence packing is currently not supported with chat datasets." From 35f8be9a9c01740a1067df15a00da9795e1f1465 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 27 Nov 2024 12:43:34 -0800 Subject: [PATCH 04/14] make cp size and TE configurable Signed-off-by: ashors1 --- examples/nlp/gpt/conf/gpt_sft.yaml | 2 ++ examples/nlp/gpt/train_gpt_sft.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml index bdd757f31..fd79528c6 100644 --- a/examples/nlp/gpt/conf/gpt_sft.yaml +++ b/examples/nlp/gpt/conf/gpt_sft.yaml @@ -57,12 +57,14 @@ model: seed: 1234 tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism + context_parallel_size: 1 # parallelism along sequence length restore_from_path: ??? # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. sync_batch_comm: False megatron_amp_O2: False encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwriten by loaded GPT model + transformer_engine: False ## Sequence Parallelism # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py index f52445637..b90cd04c8 100644 --- a/examples/nlp/gpt/train_gpt_sft.py +++ b/examples/nlp/gpt/train_gpt_sft.py @@ -85,7 +85,10 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1) if cfg.model.get("pipeline_model_parallel_size", 1) > 0: gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1) + if cfg.model.get("context_parallel_size", 1) > 0: + gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1) gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0) + gpt_cfg.transformer_engine = cfg.model.get("transformer_engine", False) if cfg.model.data.get("chat", False): # chat model, overwrite the prompt template From c17c489e31b11c81d1aa5041d14f8d265379d8a7 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Wed, 27 Nov 2024 12:47:36 -0800 Subject: [PATCH 05/14] update changelog Signed-off-by: ashors1 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63cd9ba5c..d637bfe3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [Next Version] ### New Features and Optimizations +- Added context parallel support for SFT. CP can be enabled by setting `model.context_parallel_size` in your config. - Added support for Knowledge Distillation with SFT. See the [tutorial](docs/user-guide/knowledge-distillation.rst) for details. - Added support for Megatron Core’s distributed optimizer, which can be configured using `++model.optim.name=mcore_distributed_optim`. - Introduced `ScopedTimer` as a successor to `SyncedTimer`. `SyncedTimer` is marked for deprecation and will be removed in the next version. From 82b51666cbd89f5305b5780bca26d819a00d329a Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 29 Nov 2024 21:59:09 -0800 Subject: [PATCH 06/14] fixes Signed-off-by: ashors1 --- examples/nlp/gpt/train_gpt_sft.py | 5 +++++ nemo_aligner/data/nlp/builders.py | 12 +++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py index e5d34265d..e41a1460d 100644 --- a/examples/nlp/gpt/train_gpt_sft.py +++ b/examples/nlp/gpt/train_gpt_sft.py @@ -88,6 +88,9 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): if cfg.model.get("context_parallel_size", 1) > 0: gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1) gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0) + if cfg.model.get("dist_ckpt_load_strictness", None) is not None: + gpt_cfg.dist_ckpt_load_strictness = cfg.model.get("dist_ckpt_load_strictness", None) + gpt_cfg.transformer_engine = cfg.model.get("transformer_engine", False) if cfg.model.data.get("chat", False): @@ -179,6 +182,7 @@ def main(cfg) -> None: answer_only_loss=True, is_chat=cfg.model.data.chat, special_tokens=cfg.model.data.chat_prompt_tokens, + model_config=cfg.model, ) if cfg.model.data.get("sample", False): num_samples = cfg.trainer.sft.limit_val_batches * val_data_cfg.global_batch_size @@ -191,6 +195,7 @@ def main(cfg) -> None: answer_only_loss=True, is_chat=cfg.model.data.chat, special_tokens=cfg.model.data.chat_prompt_tokens, + model_config=cfg.model, ) train_dataloader = build_dataloader( diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index f7d130e5a..99ab7c010 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -379,17 +379,19 @@ def build_dataset(index, name): ) -def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None): +def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None): packed_sequence = data_cfg.get("packed_sequence", False) dataset_kwargs = {} # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 # When using sequence parallel, sequence will further be split by TP size # When using context parallel, sequence is split by CP size as well - pad_seq_length_to_mult = ( - 8 * self.cfg.get("tensor_model_parallel_size", 1) if self.cfg.get("sequence_parallel", False) else 16 - ) - pad_seq_length_to_mult *= self.cfg.get("context_parallel_size", 1) + pad_seq_length_to_mult=16 + if model_cfg is not None: + pad_seq_length_to_mult = ( + 8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16 + ) + pad_seq_length_to_mult *= model_cfg.get("context_parallel_size", 1) if is_chat: assert not packed_sequence, "Sequence packing is currently not supported with chat datasets." From 8b31d382caa57cfeb4b80d08b9a87f70d03729a0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 30 Nov 2024 05:59:26 +0000 Subject: [PATCH 07/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: NeMo-Aligner CI --- nemo_aligner/data/nlp/builders.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index 99ab7c010..711d72b99 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -379,14 +379,16 @@ def build_dataset(index, name): ) -def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None): +def build_sft_dataset( + data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None +): packed_sequence = data_cfg.get("packed_sequence", False) dataset_kwargs = {} # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 # When using sequence parallel, sequence will further be split by TP size # When using context parallel, sequence is split by CP size as well - pad_seq_length_to_mult=16 + pad_seq_length_to_mult = 16 if model_cfg is not None: pad_seq_length_to_mult = ( 8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16 From f8b51305b2bcdbf24b58836d721c1e0fc087c3db Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 29 Nov 2024 22:42:14 -0800 Subject: [PATCH 08/14] fix Signed-off-by: Anna Shors --- examples/nlp/gpt/train_gpt_sft.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py index e41a1460d..1986bf3ba 100644 --- a/examples/nlp/gpt/train_gpt_sft.py +++ b/examples/nlp/gpt/train_gpt_sft.py @@ -85,7 +85,7 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1) if cfg.model.get("pipeline_model_parallel_size", 1) > 0: gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1) - if cfg.model.get("context_parallel_size", 1) > 0: + if cfg.model.get("ontext_parallel_size", 1) > 0: gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1) gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0) if cfg.model.get("dist_ckpt_load_strictness", None) is not None: @@ -182,7 +182,7 @@ def main(cfg) -> None: answer_only_loss=True, is_chat=cfg.model.data.chat, special_tokens=cfg.model.data.chat_prompt_tokens, - model_config=cfg.model, + model_cfg=cfg.model, ) if cfg.model.data.get("sample", False): num_samples = cfg.trainer.sft.limit_val_batches * val_data_cfg.global_batch_size @@ -195,7 +195,7 @@ def main(cfg) -> None: answer_only_loss=True, is_chat=cfg.model.data.chat, special_tokens=cfg.model.data.chat_prompt_tokens, - model_config=cfg.model, + model_cfg=cfg.model, ) train_dataloader = build_dataloader( From 19467e8e12a5c5f972e5eaf90977929ec1b5dcc1 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 2 Dec 2024 14:09:01 -0800 Subject: [PATCH 09/14] enable te in sft config Signed-off-by: ashors1 --- examples/nlp/gpt/conf/gpt_sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml index fd79528c6..1901b356a 100644 --- a/examples/nlp/gpt/conf/gpt_sft.yaml +++ b/examples/nlp/gpt/conf/gpt_sft.yaml @@ -64,7 +64,7 @@ model: sync_batch_comm: False megatron_amp_O2: False encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwriten by loaded GPT model - transformer_engine: False + transformer_engine: True ## Sequence Parallelism # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially From ea3b5baa5c9147e886f97989e62d1c100ecb9235 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 2 Dec 2024 22:29:53 -0800 Subject: [PATCH 10/14] update build_sft_dataset Signed-off-by: ashors1 --- nemo_aligner/data/nlp/builders.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index 711d72b99..5d2dcd5db 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -380,8 +380,11 @@ def build_dataset(index, name): def build_sft_dataset( - data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None + cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, validation=False ): + + data_cfg = cfg.model.data.validation_ds if validation else cfg.model.data.train_ds + packed_sequence = data_cfg.get("packed_sequence", False) dataset_kwargs = {} @@ -389,11 +392,10 @@ def build_sft_dataset( # When using sequence parallel, sequence will further be split by TP size # When using context parallel, sequence is split by CP size as well pad_seq_length_to_mult = 16 - if model_cfg is not None: - pad_seq_length_to_mult = ( - 8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16 - ) - pad_seq_length_to_mult *= model_cfg.get("context_parallel_size", 1) + pad_seq_length_to_mult = ( + 8 * cfg.get("tensor_model_parallel_size", 1) if cfg.get("sequence_parallel", False) else 16 + ) + pad_seq_length_to_mult *= cfg.get("context_parallel_size", 1) if is_chat: assert not packed_sequence, "Sequence packing is currently not supported with chat datasets." From aa717f1c56d971c426c413f34d6907f647435189 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Mon, 2 Dec 2024 22:32:21 -0800 Subject: [PATCH 11/14] Revert "update build_sft_dataset" This reverts commit ea3b5baa5c9147e886f97989e62d1c100ecb9235. --- nemo_aligner/data/nlp/builders.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index 5d2dcd5db..711d72b99 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -380,11 +380,8 @@ def build_dataset(index, name): def build_sft_dataset( - cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, validation=False + data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None ): - - data_cfg = cfg.model.data.validation_ds if validation else cfg.model.data.train_ds - packed_sequence = data_cfg.get("packed_sequence", False) dataset_kwargs = {} @@ -392,10 +389,11 @@ def build_sft_dataset( # When using sequence parallel, sequence will further be split by TP size # When using context parallel, sequence is split by CP size as well pad_seq_length_to_mult = 16 - pad_seq_length_to_mult = ( - 8 * cfg.get("tensor_model_parallel_size", 1) if cfg.get("sequence_parallel", False) else 16 - ) - pad_seq_length_to_mult *= cfg.get("context_parallel_size", 1) + if model_cfg is not None: + pad_seq_length_to_mult = ( + 8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16 + ) + pad_seq_length_to_mult *= model_cfg.get("context_parallel_size", 1) if is_chat: assert not packed_sequence, "Sequence packing is currently not supported with chat datasets." From 165bcb6376dfda3b49ff8625935d6f1072e48061 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 3 Dec 2024 14:42:18 -0800 Subject: [PATCH 12/14] bump nemo and mcore versions Signed-off-by: ashors1 --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 44a9f8651..062f837bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,8 +13,8 @@ ARG MAX_JOBS=8 # Git refs for dependencies ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG PYTRITON_VERSION=0.5.10 -ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main -ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main +ARG NEMO_TAG=8c921dc19a905d8b5a0f90f6e2a34607c2e0660d # On: main +ARG MLM_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3 # On: main ARG ALIGNER_COMMIT=main ARG TRTLLM_VERSION=v0.13.0 ARG PROTOBUF_VERSION=4.24.4 From 480dd164d56a78018eba8673d4acb1c5b3de1253 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 3 Dec 2024 14:54:10 -0800 Subject: [PATCH 13/14] remove old cherry-picks Signed-off-by: ashors1 --- Dockerfile | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/Dockerfile b/Dockerfile index 062f837bc..7dcb4fb59 100644 --- a/Dockerfile +++ b/Dockerfile @@ -119,28 +119,4 @@ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner RUN cd /opt/NeMo-Aligner && \ pip install --no-deps -e . -RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch - -# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs -RUN <<"EOF" bash -exu -cd NeMo -# Ensures we don't cherry-pick "future" origin/main commits -git fetch -a -# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651 -# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652 -# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863 -# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654 -for pr_and_commit in \ - "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \ - "10652 60e677423667c029dd05875da72bf0719774f844" \ - "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \ -; do - pr=$(cut -f1 -d' ' <<<"$pr_and_commit") - head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit") - git fetch origin $head_pr_commit:PR-${pr} - # cherry-picks all commits between main and the top of the PR - git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr} - # Tag cherry-picks to help - git tag cherry-pick-PR-${pr} -done -EOF +RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch \ No newline at end of file From 77436c9f5fe0402991feffb11c0751821d165cd2 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Tue, 3 Dec 2024 15:10:09 -0800 Subject: [PATCH 14/14] fix typo Signed-off-by: ashors1 --- examples/nlp/gpt/train_gpt_sft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py index 1986bf3ba..4761067b6 100644 --- a/examples/nlp/gpt/train_gpt_sft.py +++ b/examples/nlp/gpt/train_gpt_sft.py @@ -85,7 +85,7 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1) if cfg.model.get("pipeline_model_parallel_size", 1) > 0: gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1) - if cfg.model.get("ontext_parallel_size", 1) > 0: + if cfg.model.get("context_parallel_size", 1) > 0: gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1) gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0) if cfg.model.get("dist_ckpt_load_strictness", None) is not None: