From ee06038dd0561560e360d1d49a97e519662eb535 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 3 Sep 2024 13:50:02 -0700
Subject: [PATCH 01/14] update pad_seq_length_to_mult for context parallel

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 nemo_aligner/data/nlp/builders.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py
index 89a967fe1..eab920602 100644
--- a/nemo_aligner/data/nlp/builders.py
+++ b/nemo_aligner/data/nlp/builders.py
@@ -265,12 +265,21 @@ def build_dataset(index, name):
 
 
 def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None):
+    # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8
+    # When using sequence parallel, sequence will further be split by TP size
+    # When using context parallel, sequence is split by CP size as well
+    pad_seq_length_to_mult = (
+        8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16
+    )
+    pad_seq_length_to_mult *= self.cfg.get('context_parallel_size', 1)
+
     dataset_cls = GPTSFTChatDataset if is_chat else GPTSFTDataset
     dataset = dataset_cls(
         file_path=data_cfg.file_path,
         tokenizer=tokenizer,
         max_seq_length=data_cfg.max_seq_length,
         min_seq_length=data_cfg.min_seq_length,
+        pad_seq_length_to_mult=pad_seq_length_to_mult,
         add_bos=data_cfg.get("add_bos", False),
         add_eos=data_cfg.get("add_eos", True),
         add_sep=data_cfg.get("add_sep", False),

From 5b26ab3723cf6cc6a49d4eb6b3f23b48c8e4dfdc Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 13 Nov 2024 14:41:12 -0800
Subject: [PATCH 02/14] bug fix: pass tuning=True to get_forward_output_and_loss_func

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 nemo_aligner/models/nlp/gpt/gpt_sft_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py
index d3a615500..6acccea5b 100644
--- a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py
+++ b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py
@@ -88,7 +88,7 @@ def get_loss_and_metrics(self, batch, forward_only):
         set_sync_funcs(self, forward_only)
 
         fwd_bwd_function = get_forward_backward_func()
-        fwd_loss_fn = self.get_forward_output_and_loss_func(forward_only)
+        fwd_loss_fn = self.get_forward_output_and_loss_func(forward_only, tuning=True)
 
         losses_reduced = fwd_bwd_function(
             forward_step_func=fwd_loss_fn,

From 2919f2238b50405c14ad67f067c2f709e14cfef5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 27 Nov 2024 06:24:12 +0000
Subject: [PATCH 03/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Signed-off-by: NeMo-Aligner CI <nemo-aligner-ci@nvidia.com>
---
 nemo_aligner/data/nlp/builders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py
index 4a320ad2e..86e0244e9 100644
--- a/nemo_aligner/data/nlp/builders.py
+++ b/nemo_aligner/data/nlp/builders.py
@@ -274,9 +274,9 @@ def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, i
     # When using sequence parallel, sequence will further be split by TP size
     # When using context parallel, sequence is split by CP size as well
     pad_seq_length_to_mult = (
-        8 * self.cfg.get('tensor_model_parallel_size', 1) if self.cfg.get('sequence_parallel', False) else 16
+        8 * self.cfg.get("tensor_model_parallel_size", 1) if self.cfg.get("sequence_parallel", False) else 16
     )
-    pad_seq_length_to_mult *= self.cfg.get('context_parallel_size', 1)
+    pad_seq_length_to_mult *= self.cfg.get("context_parallel_size", 1)
 
     if is_chat:
         assert not packed_sequence, "Sequence packing is currently not supported with chat datasets."

From 35f8be9a9c01740a1067df15a00da9795e1f1465 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 27 Nov 2024 12:43:34 -0800
Subject: [PATCH 04/14] make cp size and TE configurable

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 examples/nlp/gpt/conf/gpt_sft.yaml | 2 ++
 examples/nlp/gpt/train_gpt_sft.py  | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml
index bdd757f31..fd79528c6 100644
--- a/examples/nlp/gpt/conf/gpt_sft.yaml
+++ b/examples/nlp/gpt/conf/gpt_sft.yaml
@@ -57,12 +57,14 @@ model:
   seed: 1234
   tensor_model_parallel_size: 1 # intra-layer model parallelism
   pipeline_model_parallel_size: 1 # inter-layer model parallelism
+  context_parallel_size: 1 # parallelism along sequence length
   restore_from_path: ??? # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
   resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
   save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
   sync_batch_comm: False
   megatron_amp_O2: False
   encoder_seq_length: 4096  # the sequence length of the encoder model, it will be overwriten by loaded GPT model
+  transformer_engine: False
 
   ## Sequence Parallelism
   # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py
index f52445637..b90cd04c8 100644
--- a/examples/nlp/gpt/train_gpt_sft.py
+++ b/examples/nlp/gpt/train_gpt_sft.py
@@ -85,7 +85,10 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
             gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1)
         if cfg.model.get("pipeline_model_parallel_size", 1) > 0:
             gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1)
+        if cfg.model.get("context_parallel_size", 1) > 0:
+            gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1)
         gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0)
+        gpt_cfg.transformer_engine = cfg.model.get("transformer_engine", False)
 
         if cfg.model.data.get("chat", False):
             # chat model, overwrite the prompt template

From c17c489e31b11c81d1aa5041d14f8d265379d8a7 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Wed, 27 Nov 2024 12:47:36 -0800
Subject: [PATCH 05/14] update changelog

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 63cd9ba5c..d637bfe3f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 ## [Next Version]
 
 ### New Features and Optimizations
+- Added context parallel support for SFT. CP can be enabled by setting `model.context_parallel_size` in your config.
 - Added support for Knowledge Distillation with SFT. See the [tutorial](docs/user-guide/knowledge-distillation.rst) for details.
 - Added support for Megatron Core’s distributed optimizer, which can be configured using `++model.optim.name=mcore_distributed_optim`.
 - Introduced `ScopedTimer` as a successor to `SyncedTimer`. `SyncedTimer` is marked for deprecation and will be removed in the next version.

From 82b51666cbd89f5305b5780bca26d819a00d329a Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Fri, 29 Nov 2024 21:59:09 -0800
Subject: [PATCH 06/14] fixes: pass model_cfg to build_sft_dataset; forward dist_ckpt_load_strictness

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 examples/nlp/gpt/train_gpt_sft.py |  5 +++++
 nemo_aligner/data/nlp/builders.py | 12 +++++++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py
index e5d34265d..e41a1460d 100644
--- a/examples/nlp/gpt/train_gpt_sft.py
+++ b/examples/nlp/gpt/train_gpt_sft.py
@@ -88,6 +88,9 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
         if cfg.model.get("context_parallel_size", 1) > 0:
             gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1)
         gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0)
+        if cfg.model.get("dist_ckpt_load_strictness", None) is not None:
+            gpt_cfg.dist_ckpt_load_strictness = cfg.model.get("dist_ckpt_load_strictness", None)
+
         gpt_cfg.transformer_engine = cfg.model.get("transformer_engine", False)
 
         if cfg.model.data.get("chat", False):
@@ -179,6 +182,7 @@ def main(cfg) -> None:
         answer_only_loss=True,
         is_chat=cfg.model.data.chat,
         special_tokens=cfg.model.data.chat_prompt_tokens,
+        model_config=cfg.model,
     )
     if cfg.model.data.get("sample", False):
         num_samples = cfg.trainer.sft.limit_val_batches * val_data_cfg.global_batch_size
@@ -191,6 +195,7 @@ def main(cfg) -> None:
         answer_only_loss=True,
         is_chat=cfg.model.data.chat,
         special_tokens=cfg.model.data.chat_prompt_tokens,
+        model_config=cfg.model,
     )
 
     train_dataloader = build_dataloader(
diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py
index f7d130e5a..99ab7c010 100644
--- a/nemo_aligner/data/nlp/builders.py
+++ b/nemo_aligner/data/nlp/builders.py
@@ -379,17 +379,19 @@ def build_dataset(index, name):
 )
 
 
-def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None):
+def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None):
     packed_sequence = data_cfg.get("packed_sequence", False)
     dataset_kwargs = {}
 
     # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8
     # When using sequence parallel, sequence will further be split by TP size
     # When using context parallel, sequence is split by CP size as well
-    pad_seq_length_to_mult = (
-        8 * self.cfg.get("tensor_model_parallel_size", 1) if self.cfg.get("sequence_parallel", False) else 16
-    )
-    pad_seq_length_to_mult *= self.cfg.get("context_parallel_size", 1)
+    pad_seq_length_to_mult=16
+    if model_cfg is not None:
+        pad_seq_length_to_mult = (
+            8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16
+        )
+        pad_seq_length_to_mult *= model_cfg.get("context_parallel_size", 1)
 
     if is_chat:
         assert not packed_sequence, "Sequence packing is currently not supported with chat datasets."

From 8b31d382caa57cfeb4b80d08b9a87f70d03729a0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 30 Nov 2024 05:59:26 +0000
Subject: [PATCH 07/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Signed-off-by: NeMo-Aligner CI <nemo-aligner-ci@nvidia.com>
---
 nemo_aligner/data/nlp/builders.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py
index 99ab7c010..711d72b99 100644
--- a/nemo_aligner/data/nlp/builders.py
+++ b/nemo_aligner/data/nlp/builders.py
@@ -379,14 +379,16 @@ def build_dataset(index, name):
 )
 
 
-def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None):
+def build_sft_dataset(
+    data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None
+):
     packed_sequence = data_cfg.get("packed_sequence", False)
     dataset_kwargs = {}
 
     # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8
     # When using sequence parallel, sequence will further be split by TP size
     # When using context parallel, sequence is split by CP size as well
-    pad_seq_length_to_mult=16
+    pad_seq_length_to_mult = 16
     if model_cfg is not None:
         pad_seq_length_to_mult = (
             8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16

From f8b51305b2bcdbf24b58836d721c1e0fc087c3db Mon Sep 17 00:00:00 2001
From: Anna Shors <ashors@nvidia.com>
Date: Fri, 29 Nov 2024 22:42:14 -0800
Subject: [PATCH 08/14] fix: rename model_config kwarg to model_cfg in build_sft_dataset calls

Signed-off-by: Anna Shors <ashors@nvidia.com>
---
 examples/nlp/gpt/train_gpt_sft.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py
index e41a1460d..1986bf3ba 100644
--- a/examples/nlp/gpt/train_gpt_sft.py
+++ b/examples/nlp/gpt/train_gpt_sft.py
@@ -85,7 +85,7 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
             gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1)
         if cfg.model.get("pipeline_model_parallel_size", 1) > 0:
             gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1)
-        if cfg.model.get("context_parallel_size", 1) > 0:
+        if cfg.model.get("ontext_parallel_size", 1) > 0:
             gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1)
         gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0)
         if cfg.model.get("dist_ckpt_load_strictness", None) is not None:
@@ -182,7 +182,7 @@ def main(cfg) -> None:
         answer_only_loss=True,
         is_chat=cfg.model.data.chat,
         special_tokens=cfg.model.data.chat_prompt_tokens,
-        model_config=cfg.model,
+        model_cfg=cfg.model,
     )
     if cfg.model.data.get("sample", False):
         num_samples = cfg.trainer.sft.limit_val_batches * val_data_cfg.global_batch_size
@@ -195,7 +195,7 @@ def main(cfg) -> None:
         answer_only_loss=True,
         is_chat=cfg.model.data.chat,
         special_tokens=cfg.model.data.chat_prompt_tokens,
-        model_config=cfg.model,
+        model_cfg=cfg.model,
     )
 
     train_dataloader = build_dataloader(

From 19467e8e12a5c5f972e5eaf90977929ec1b5dcc1 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Mon, 2 Dec 2024 14:09:01 -0800
Subject: [PATCH 09/14] enable te in sft config

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 examples/nlp/gpt/conf/gpt_sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml
index fd79528c6..1901b356a 100644
--- a/examples/nlp/gpt/conf/gpt_sft.yaml
+++ b/examples/nlp/gpt/conf/gpt_sft.yaml
@@ -64,7 +64,7 @@ model:
   sync_batch_comm: False
   megatron_amp_O2: False
   encoder_seq_length: 4096  # the sequence length of the encoder model, it will be overwriten by loaded GPT model
-  transformer_engine: False
+  transformer_engine: True
 
   ## Sequence Parallelism
   # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially

From ea3b5baa5c9147e886f97989e62d1c100ecb9235 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Mon, 2 Dec 2024 22:29:53 -0800
Subject: [PATCH 10/14] update build_sft_dataset

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 nemo_aligner/data/nlp/builders.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py
index 711d72b99..5d2dcd5db 100644
--- a/nemo_aligner/data/nlp/builders.py
+++ b/nemo_aligner/data/nlp/builders.py
@@ -380,8 +380,11 @@ def build_dataset(index, name):
 
 
 def build_sft_dataset(
-    data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None
+    cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, validation=False
 ):
+
+    data_cfg = cfg.model.data.validation_ds if validation else cfg.model.data.train_ds
+
     packed_sequence = data_cfg.get("packed_sequence", False)
     dataset_kwargs = {}
 
@@ -389,11 +392,10 @@ def build_sft_dataset(
     # When using sequence parallel, sequence will further be split by TP size
     # When using context parallel, sequence is split by CP size as well
     pad_seq_length_to_mult = 16
-    if model_cfg is not None:
-        pad_seq_length_to_mult = (
-            8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16
-        )
-        pad_seq_length_to_mult *= model_cfg.get("context_parallel_size", 1)
+    pad_seq_length_to_mult = (
+        8 * cfg.get("tensor_model_parallel_size", 1) if cfg.get("sequence_parallel", False) else 16
+    )
+    pad_seq_length_to_mult *= cfg.get("context_parallel_size", 1)
 
     if is_chat:
         assert not packed_sequence, "Sequence packing is currently not supported with chat datasets."

From aa717f1c56d971c426c413f34d6907f647435189 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Mon, 2 Dec 2024 22:32:21 -0800
Subject: [PATCH 11/14] Revert "update build_sft_dataset"

This reverts commit ea3b5baa5c9147e886f97989e62d1c100ecb9235.
---
 nemo_aligner/data/nlp/builders.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py
index 5d2dcd5db..711d72b99 100644
--- a/nemo_aligner/data/nlp/builders.py
+++ b/nemo_aligner/data/nlp/builders.py
@@ -380,11 +380,8 @@ def build_dataset(index, name):
 
 
 def build_sft_dataset(
-    cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, validation=False
+    data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None
 ):
-
-    data_cfg = cfg.model.data.validation_ds if validation else cfg.model.data.train_ds
-
     packed_sequence = data_cfg.get("packed_sequence", False)
     dataset_kwargs = {}
 
@@ -392,10 +389,11 @@ def build_sft_dataset(
     # When using sequence parallel, sequence will further be split by TP size
     # When using context parallel, sequence is split by CP size as well
     pad_seq_length_to_mult = 16
-    pad_seq_length_to_mult = (
-        8 * cfg.get("tensor_model_parallel_size", 1) if cfg.get("sequence_parallel", False) else 16
-    )
-    pad_seq_length_to_mult *= cfg.get("context_parallel_size", 1)
+    if model_cfg is not None:
+        pad_seq_length_to_mult = (
+            8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16
+        )
+        pad_seq_length_to_mult *= model_cfg.get("context_parallel_size", 1)
 
     if is_chat:
         assert not packed_sequence, "Sequence packing is currently not supported with chat datasets."

From 165bcb6376dfda3b49ff8625935d6f1072e48061 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 3 Dec 2024 14:42:18 -0800
Subject: [PATCH 12/14] bump nemo and mcore versions

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 44a9f8651..062f837bc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,8 +13,8 @@ ARG MAX_JOBS=8
 # Git refs for dependencies
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG PYTRITON_VERSION=0.5.10
-ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634  # On: main
-ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3   # On: main
+ARG NEMO_TAG=8c921dc19a905d8b5a0f90f6e2a34607c2e0660d  # On: main
+ARG MLM_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3 # On: main
 ARG ALIGNER_COMMIT=main
 ARG TRTLLM_VERSION=v0.13.0
 ARG PROTOBUF_VERSION=4.24.4

From 480dd164d56a78018eba8673d4acb1c5b3de1253 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 3 Dec 2024 14:54:10 -0800
Subject: [PATCH 13/14] remove old cherry-picks

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 Dockerfile | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 062f837bc..7dcb4fb59 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -119,28 +119,4 @@ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
 RUN cd /opt/NeMo-Aligner && \
     pip install --no-deps -e .
 
-RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
-
-# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
-RUN <<"EOF" bash -exu
-cd NeMo
-# Ensures we don't cherry-pick "future" origin/main commits
-git fetch -a
-# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
-# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
-# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
-# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
-for pr_and_commit in \
-  "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
-  "10652 60e677423667c029dd05875da72bf0719774f844" \
-  "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
-; do
-  pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
-  head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
-  git fetch origin $head_pr_commit:PR-${pr}
-  # cherry-picks all commits between main and the top of the PR
-  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
-  # Tag cherry-picks to help
-  git tag cherry-pick-PR-${pr}
-done
-EOF
+RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
\ No newline at end of file

From 77436c9f5fe0402991feffb11c0751821d165cd2 Mon Sep 17 00:00:00 2001
From: ashors1 <ashors@nvidia.com>
Date: Tue, 3 Dec 2024 15:10:09 -0800
Subject: [PATCH 14/14] fix typo in context_parallel_size config key

Signed-off-by: ashors1 <ashors@nvidia.com>
---
 examples/nlp/gpt/train_gpt_sft.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py
index 1986bf3ba..4761067b6 100644
--- a/examples/nlp/gpt/train_gpt_sft.py
+++ b/examples/nlp/gpt/train_gpt_sft.py
@@ -85,7 +85,7 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
             gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1)
         if cfg.model.get("pipeline_model_parallel_size", 1) > 0:
             gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1)
-        if cfg.model.get("ontext_parallel_size", 1) > 0:
+        if cfg.model.get("context_parallel_size", 1) > 0:
             gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1)
         gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0)
         if cfg.model.get("dist_ckpt_load_strictness", None) is not None: