diff --git a/CHANGELOG.md b/CHANGELOG.md
index 63cd9ba5c..d637bfe3f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 ## [Next Version]
 
 ### New Features and Optimizations
+- Added context parallel (CP) support for SFT. CP can be enabled by setting `model.context_parallel_size` to a value greater than 1 in your config.
 - Added support for Knowledge Distillation with SFT. See the [tutorial](docs/user-guide/knowledge-distillation.rst) for details.
 - Added support for Megatron Core’s distributed optimizer, which can be configured using `++model.optim.name=mcore_distributed_optim`.
 - Introduced `ScopedTimer` as a successor to `SyncedTimer`. `SyncedTimer` is marked for deprecation and will be removed in the next version.
diff --git a/Dockerfile b/Dockerfile
index 44a9f8651..7dcb4fb59 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,8 +13,8 @@ ARG MAX_JOBS=8
 # Git refs for dependencies
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
 ARG PYTRITON_VERSION=0.5.10
-ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634  # On: main
-ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3   # On: main
+ARG NEMO_TAG=8c921dc19a905d8b5a0f90f6e2a34607c2e0660d  # On: main
+ARG MLM_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3 # On: main
 ARG ALIGNER_COMMIT=main
 ARG TRTLLM_VERSION=v0.13.0
 ARG PROTOBUF_VERSION=4.24.4
@@ -119,28 +119,4 @@ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
 RUN cd /opt/NeMo-Aligner && \
     pip install --no-deps -e .
 
-RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
-
-# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
-RUN <<"EOF" bash -exu
-cd NeMo
-# Ensures we don't cherry-pick "future" origin/main commits
-git fetch -a
-# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
-# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
-# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
-# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
-for pr_and_commit in \
-  "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
-  "10652 60e677423667c029dd05875da72bf0719774f844" \
-  "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
-; do
-  pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
-  head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
-  git fetch origin $head_pr_commit:PR-${pr}
-  # cherry-picks all commits between main and the top of the PR
-  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
-  # Tag cherry-picks to help
-  git tag cherry-pick-PR-${pr}
-done
-EOF
+RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
\ No newline at end of file
diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml
index bdd757f31..1901b356a 100644
--- a/examples/nlp/gpt/conf/gpt_sft.yaml
+++ b/examples/nlp/gpt/conf/gpt_sft.yaml
@@ -57,12 +57,14 @@ model:
   seed: 1234
   tensor_model_parallel_size: 1 # intra-layer model parallelism
   pipeline_model_parallel_size: 1 # inter-layer model parallelism
+  context_parallel_size: 1 # parallelism along sequence length
   restore_from_path: ??? # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
   resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
   save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
   sync_batch_comm: False
   megatron_amp_O2: False
   encoder_seq_length: 4096  # the sequence length of the encoder model, it will be overwriten by loaded GPT model
+  transformer_engine: True
 
   ## Sequence Parallelism
   # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py
index 371c0f5aa..4761067b6 100644
--- a/examples/nlp/gpt/train_gpt_sft.py
+++ b/examples/nlp/gpt/train_gpt_sft.py
@@ -85,7 +85,13 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
             gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1)
         if cfg.model.get("pipeline_model_parallel_size", 1) > 0:
             gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1)
+        if cfg.model.get("context_parallel_size", 1) > 0:
+            gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1)
         gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0)
+        if cfg.model.get("dist_ckpt_load_strictness", None) is not None:
+            gpt_cfg.dist_ckpt_load_strictness = cfg.model.get("dist_ckpt_load_strictness", None)
+
+        gpt_cfg.transformer_engine = cfg.model.get("transformer_engine", False)
 
         if cfg.model.data.get("chat", False):
             # chat model, overwrite the prompt template
@@ -176,6 +182,7 @@ def main(cfg) -> None:
         answer_only_loss=True,
         is_chat=cfg.model.data.chat,
         special_tokens=cfg.model.data.chat_prompt_tokens,
+        model_cfg=cfg.model,
     )
     if cfg.model.data.get("sample", False):
         num_samples = cfg.trainer.sft.limit_val_batches * val_data_cfg.global_batch_size
@@ -188,6 +195,7 @@ def main(cfg) -> None:
         answer_only_loss=True,
         is_chat=cfg.model.data.chat,
         special_tokens=cfg.model.data.chat_prompt_tokens,
+        model_cfg=cfg.model,
     )
 
     train_dataloader = build_dataloader(
diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py
index 97b68ffe4..711d72b99 100644
--- a/nemo_aligner/data/nlp/builders.py
+++ b/nemo_aligner/data/nlp/builders.py
@@ -379,10 +379,22 @@ def build_dataset(index, name):
 )
 
 
-def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None):
+def build_sft_dataset(
+    data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None
+):
     packed_sequence = data_cfg.get("packed_sequence", False)
     dataset_kwargs = {}
 
+    # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8.
+    # With sequence parallel, the sequence is further split by TP size, so pad to 8 * TP instead of 16.
+    # With context parallel, the sequence is also split by CP size, so the multiple is scaled by CP.
+    pad_seq_length_to_mult = 16
+    if model_cfg is not None:
+        pad_seq_length_to_mult = (
+            8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16
+        )
+        pad_seq_length_to_mult *= model_cfg.get("context_parallel_size", 1)
+
     if is_chat:
         assert not packed_sequence, "Sequence packing is currently not supported with chat datasets."
         dataset_cls = GPTSFTChatDataset
@@ -401,6 +413,7 @@ def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, i
         tokenizer=tokenizer,
         max_seq_length=data_cfg.max_seq_length,
         min_seq_length=data_cfg.min_seq_length,
+        pad_seq_length_to_mult=pad_seq_length_to_mult,
         add_bos=data_cfg.get("add_bos", False),
         add_eos=data_cfg.get("add_eos", True),
         add_sep=data_cfg.get("add_sep", False),
diff --git a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py
index d3a615500..6acccea5b 100644
--- a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py
+++ b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py
@@ -88,7 +88,7 @@ def get_loss_and_metrics(self, batch, forward_only):
         set_sync_funcs(self, forward_only)
 
         fwd_bwd_function = get_forward_backward_func()
-        fwd_loss_fn = self.get_forward_output_and_loss_func(forward_only)
+        fwd_loss_fn = self.get_forward_output_and_loss_func(forward_only, tuning=True)
 
         losses_reduced = fwd_bwd_function(
             forward_step_func=fwd_loss_fn,