diff --git a/CHANGELOG.md b/CHANGELOG.md index 63cd9ba5c..d637bfe3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ## [Next Version] ### New Features and Optimizations +- Added context parallel support for SFT. CP can be enabled by setting `model.context_parallel_size` in your config. - Added support for Knowledge Distillation with SFT. See the [tutorial](docs/user-guide/knowledge-distillation.rst) for details. - Added support for Megatron Core’s distributed optimizer, which can be configured using `++model.optim.name=mcore_distributed_optim`. - Introduced `ScopedTimer` as a successor to `SyncedTimer`. `SyncedTimer` is marked for deprecation and will be removed in the next version. diff --git a/Dockerfile b/Dockerfile index 44a9f8651..7dcb4fb59 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,8 +13,8 @@ ARG MAX_JOBS=8 # Git refs for dependencies ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG PYTRITON_VERSION=0.5.10 -ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main -ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main +ARG NEMO_TAG=8c921dc19a905d8b5a0f90f6e2a34607c2e0660d # On: main +ARG MLM_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3 # On: main ARG ALIGNER_COMMIT=main ARG TRTLLM_VERSION=v0.13.0 ARG PROTOBUF_VERSION=4.24.4 @@ -119,28 +119,4 @@ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner RUN cd /opt/NeMo-Aligner && \ pip install --no-deps -e . -RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch - -# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs -RUN <<"EOF" bash -exu -cd NeMo -# Ensures we don't cherry-pick "future" origin/main commits -git fetch -a -# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651 -# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652 -# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863 -# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654 -for pr_and_commit in \ - "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \ - "10652 60e677423667c029dd05875da72bf0719774f844" \ - "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \ -; do - pr=$(cut -f1 -d' ' <<<"$pr_and_commit") - head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit") - git fetch origin $head_pr_commit:PR-${pr} - # cherry-picks all commits between main and the top of the PR - git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr} - # Tag cherry-picks to help - git tag cherry-pick-PR-${pr} -done -EOF +RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch \ No newline at end of file diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml index bdd757f31..1901b356a 100644 --- a/examples/nlp/gpt/conf/gpt_sft.yaml +++ b/examples/nlp/gpt/conf/gpt_sft.yaml @@ -57,12 +57,14 @@ model: seed: 1234 tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism + context_parallel_size: 1 # parallelism along sequence length restore_from_path: ??? # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training. sync_batch_comm: False megatron_amp_O2: False encoder_seq_length: 4096 # the sequence length of the encoder model, it will be overwriten by loaded GPT model + transformer_engine: True ## Sequence Parallelism # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py index 371c0f5aa..4761067b6 100644 --- a/examples/nlp/gpt/train_gpt_sft.py +++ b/examples/nlp/gpt/train_gpt_sft.py @@ -85,7 +85,13 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1) if cfg.model.get("pipeline_model_parallel_size", 1) > 0: gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1) + if cfg.model.get("context_parallel_size", 1) > 0: + gpt_cfg.context_parallel_size = cfg.model.get("context_parallel_size", 1) gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0) + if cfg.model.get("dist_ckpt_load_strictness", None) is not None: + gpt_cfg.dist_ckpt_load_strictness = cfg.model.get("dist_ckpt_load_strictness", None) + + gpt_cfg.transformer_engine = cfg.model.get("transformer_engine", False) if cfg.model.data.get("chat", False): # chat model, overwrite the prompt template @@ -176,6 +182,7 @@ def main(cfg) -> None: answer_only_loss=True, is_chat=cfg.model.data.chat, special_tokens=cfg.model.data.chat_prompt_tokens, + model_cfg=cfg.model, ) if cfg.model.data.get("sample", False): num_samples = cfg.trainer.sft.limit_val_batches * val_data_cfg.global_batch_size @@ -188,6 +195,7 @@ def main(cfg) -> None: answer_only_loss=True, is_chat=cfg.model.data.chat, special_tokens=cfg.model.data.chat_prompt_tokens, + model_cfg=cfg.model, ) train_dataloader = build_dataloader( diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index 97b68ffe4..711d72b99 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -379,10 +379,22 @@ def build_dataset(index, name): ) -def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None): +def build_sft_dataset( + data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None, model_cfg=None +): packed_sequence = data_cfg.get("packed_sequence", False) dataset_kwargs = {} + # TE requires that the first input dim is divisible by 8 and the second by 16 for fp8 + # When using sequence parallel, sequence will further be split by TP size + # When using context parallel, sequence is split by CP size as well + pad_seq_length_to_mult = 16 + if model_cfg is not None: + pad_seq_length_to_mult = ( + 8 * model_cfg.get("tensor_model_parallel_size", 1) if model_cfg.get("sequence_parallel", False) else 16 + ) + pad_seq_length_to_mult *= model_cfg.get("context_parallel_size", 1) + if is_chat: assert not packed_sequence, "Sequence packing is currently not supported with chat datasets." dataset_cls = GPTSFTChatDataset @@ -401,6 +413,7 @@ def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, i tokenizer=tokenizer, max_seq_length=data_cfg.max_seq_length, min_seq_length=data_cfg.min_seq_length, + pad_seq_length_to_mult=pad_seq_length_to_mult, add_bos=data_cfg.get("add_bos", False), add_eos=data_cfg.get("add_eos", True), add_sep=data_cfg.get("add_sep", False), diff --git a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py index d3a615500..6acccea5b 100644 --- a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py +++ b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py @@ -88,7 +88,7 @@ def get_loss_and_metrics(self, batch, forward_only): set_sync_funcs(self, forward_only) fwd_bwd_function = get_forward_backward_func() - fwd_loss_fn = self.get_forward_output_and_loss_func(forward_only) + fwd_loss_fn = self.get_forward_output_and_loss_func(forward_only, tuning=True) losses_reduced = fwd_bwd_function( forward_step_func=fwd_loss_fn,