From 4cd4a584f65098264fc1cde2e04307b2bd6b33f6 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Thu, 7 May 2026 00:00:12 +0000 Subject: [PATCH 1/9] fix: unskip dpo trainer integ tests --- .../tests/integ/train/test_dpo_trainer_integration.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py index 65cbd6c246..c976e0c875 100644 --- a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py @@ -22,7 +22,6 @@ import pytest -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_dpo_trainer_lora_complete_workflow(sagemaker_session): """Test complete DPO training workflow with LORA.""" # Create DPOTrainer instance with comprehensive configuration @@ -30,7 +29,7 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session): model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", accept_eula=True ) @@ -61,7 +60,6 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_dpo_trainer_with_validation_dataset(sagemaker_session): """Test DPO trainer with both training and validation datasets.""" @@ -69,8 +67,8 @@ def test_dpo_trainer_with_validation_dataset(sagemaker_session): model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", - 
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", - validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", + validation_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", accept_eula=True ) From adef22ff9a8a51d7b7ab955fcb9d6039fdbcdd68 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Thu, 7 May 2026 00:14:25 +0000 Subject: [PATCH 2/9] fix: unskip sft rlaif rlvr integ tests --- sagemaker-train/tests/integ/train/conftest.py | 10 +++++++++ .../train/test_rlaif_trainer_integration.py | 9 +++----- .../train/test_rlvr_trainer_integration.py | 18 +++++---------- .../train/test_sft_trainer_integration.py | 22 +++++++------------ 4 files changed, 27 insertions(+), 32 deletions(-) diff --git a/sagemaker-train/tests/integ/train/conftest.py b/sagemaker-train/tests/integ/train/conftest.py index 9d9e7f53e0..1857a6262d 100644 --- a/sagemaker-train/tests/integ/train/conftest.py +++ b/sagemaker-train/tests/integ/train/conftest.py @@ -38,3 +38,13 @@ def sagemaker_session(): if region_manual_set and "AWS_DEFAULT_REGION" in os.environ: del os.environ["AWS_DEFAULT_REGION"] + + +NOVA_REGION = "us-east-1" + + +@pytest.fixture(scope="module") +def sagemaker_session_us_east_1(): + """Create a SageMaker session in us-east-1 for Nova model tests.""" + boto_session = boto3.Session(region_name=NOVA_REGION) + return Session(boto_session=boto_session) diff --git a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py index 296d62bfd8..bb0558e01c 100644 --- a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py @@ -21,7 
+21,6 @@ import pytest -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" @@ -33,7 +32,7 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): reward_prompt='Builtin.Summarize', mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", accept_eula=True ) @@ -61,7 +60,6 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): """Test RLAIF trainer with different reward model and prompt.""" @@ -73,7 +71,7 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): reward_prompt="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlaif-test-prompt/0.0.1", mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", accept_eula=True ) @@ -100,7 +98,6 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_continued_finetuning(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" 
@@ -112,7 +109,7 @@ def test_rlaif_trainer_continued_finetuning(sagemaker_session): reward_prompt='Builtin.Summarize', mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", accept_eula=True ) diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 63d3ae3134..59dfb17fd8 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -13,7 +13,6 @@ """Integration tests for RLVR trainer""" from __future__ import absolute_import -import os import time import pytest import boto3 @@ -22,7 +21,6 @@ from sagemaker.train.common import TrainingType -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLVR training workflow with LORA.""" @@ -32,7 +30,7 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-rlvr-finetuned-models-exp", mlflow_run_name="test-rlvr-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", accept_eula=True ) @@ -60,7 +58,6 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def 
test_rlvr_trainer_with_custom_reward_function(sagemaker_session): """Test RLVR trainer with custom reward function.""" @@ -70,7 +67,7 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-rlvr-finetuned-models-exp", mlflow_run_name="test-rlvr-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", custom_reward_function="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlvr-test-rf/0.0.1", accept_eula=True @@ -98,14 +95,10 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): assert training_job.output_model_package_arn is not None -# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1") -@pytest.mark.skip(reason="Skipping GPU resource intensive test") -def test_rlvr_trainer_nova_workflow(sagemaker_session): +def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): """Test RLVR training workflow with Nova model.""" - import os - os.environ['SAGEMAKER_REGION'] = 'us-east-1' + # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) - # For fine-tuning rlvr_trainer = RLVRTrainer( model="nova-textgeneration-lite-v2", model_package_group="sdk-test-finetuned-models", @@ -115,7 +108,8 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session): validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1", - accept_eula=True + accept_eula=True, + sagemaker_session=sagemaker_session_us_east_1 
) rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket' diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index 98dd154c3f..951bb4fdfd 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -13,7 +13,6 @@ """Integration tests for SFT trainer""" from __future__ import absolute_import -import os import time import pytest import boto3 @@ -22,7 +21,6 @@ from sagemaker.train.common import TrainingType -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_lora_complete_workflow(sagemaker_session): """Test complete SFT training workflow with LORA.""" @@ -30,7 +28,7 @@ def test_sft_trainer_lora_complete_workflow(sagemaker_session): model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", accept_eula=True ) @@ -58,7 +56,6 @@ def test_sft_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_with_validation_dataset(sagemaker_session): """Test SFT trainer with both training and validation datasets.""" @@ -66,8 +63,8 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session): model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", - 
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", - validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", + validation_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", accept_eula=True ) @@ -92,22 +89,19 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session): assert hasattr(training_job, 'output_model_package_arn') -# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1") -@pytest.mark.skip(reason="Skipping GPU resource intensive test") -def test_sft_trainer_nova_workflow(sagemaker_session): +def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): """Test SFT trainer with Nova model.""" - import os - os.environ['SAGEMAKER_REGION'] = 'us-east-1' + # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) - # For fine-tuning sft_trainer_nova = SFTTrainer( model="nova-textgeneration-lite-v2", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-nova-finetuned-models-exp", mlflow_run_name="test-nova-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/DataSet/sft-nova-test-dataset/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/" + training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_8_samples.jsonl", + s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", + sagemaker_session=sagemaker_session_us_east_1 ) # Create training job From 8656d6af040d8ddf06bba046154f9f7440048605 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Thu, 7 May 2026 19:16:33 +0000 Subject: [PATCH 3/9] debug: remove unneeded fields for rlvr_nova test + logging in finetuning utils ---
.../sagemaker/train/common_utils/finetune_utils.py | 12 +++++++++++- .../integ/train/test_rlvr_trainer_integration.py | 4 ---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py index 4aa67e3d28..318b312ced 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py @@ -105,12 +105,22 @@ def _resolve_mlflow_resource_arn(sagemaker_session, mlflow_resource_arn: Optiona return mlflow_resource_arn try: + target_region = sagemaker_session.boto_session.region_name + logger.info("Resolving MLflow app for region: %s", target_region) + mlflow_apps = MlflowApp.get_all( session=sagemaker_session.boto_session, - region=sagemaker_session.boto_session.region_name + region=target_region ) mlflow_apps_list = list(mlflow_apps) + logger.info("Found %d MLflow apps from ListMlflowApps API (region=%s):", len(mlflow_apps_list), target_region) + for app in mlflow_apps_list: + logger.info(" App ARN: %s | status: %s | account_default_status: %s", + getattr(app, 'arn', 'N/A'), + getattr(app, 'status', 'N/A'), + getattr(app, 'account_default_status', 'N/A')) + current_domain_id = _get_current_domain_id(sagemaker_session) # Check for domain match diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 59dfb17fd8..c9727c13e9 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -111,10 +111,6 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): accept_eula=True, sagemaker_session=sagemaker_session_us_east_1 ) - rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket' - - rlvr_trainer.hyperparameters.reward_lambda_arn = 
'arn:aws:lambda:us-east-1:729646638167:function:rlvr-nova-reward-function' - training_job = rlvr_trainer.train(wait=False) # Manual wait loop From 2bbe72a5cbf4aae5cd8accf675bce3fc782a8458 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Thu, 7 May 2026 21:54:01 +0000 Subject: [PATCH 4/9] fix: change dataset for sft trainer nova test --- .../tests/integ/train/test_sft_trainer_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index 951bb4fdfd..11e49d0d4d 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -99,7 +99,7 @@ def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-nova-finetuned-models-exp", mlflow_run_name="test-nova-finetuned-models-run", - training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_8_samples.jsonl", + training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_200_samples.jsonl", s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", sagemaker_session=sagemaker_session_us_east_1 ) From f71d183e61592d18635cdfa741f4ea3d339e3c66 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Fri, 8 May 2026 05:39:42 +0000 Subject: [PATCH 5/9] debug: adding additonal logs to investigate tests --- sagemaker-train/src/sagemaker/train/rlvr_trainer.py | 13 ++++++++++++- sagemaker-train/src/sagemaker/train/sft_trainer.py | 12 +++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/rlvr_trainer.py b/sagemaker-train/src/sagemaker/train/rlvr_trainer.py index c496222bf4..5c37a1443e 100644 --- a/sagemaker-train/src/sagemaker/train/rlvr_trainer.py +++ b/sagemaker-train/src/sagemaker/train/rlvr_trainer.py @@ -276,10 +276,21 
@@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None, if self.stopping_condition is not None: create_args["stopping_condition"] = self.stopping_condition + # Log the IAM role being used + logger.info(f"IAM Role ARN: {role}") + + # Log the full training job arguments for debugging + logger.info(f"TrainingJob.create() arguments: {create_args}") + try: training_job = TrainingJob.create(**create_args) except Exception as e: - logger.error("Error: %s", e) + logger.error(f"Error creating training job: {e}") + logger.error(f"Training job name: {current_training_job_name}") + logger.error(f"Serverless config: {serverless_config}") + logger.error(f"Evaluator ARN: {evaluator_arn}") + logger.error(f"Role ARN: {role}") + logger.error(f"Full create_args: {create_args}") raise e if wait: diff --git a/sagemaker-train/src/sagemaker/train/sft_trainer.py b/sagemaker-train/src/sagemaker/train/sft_trainer.py index 136231bd6f..3a53ad0acd 100644 --- a/sagemaker-train/src/sagemaker/train/sft_trainer.py +++ b/sagemaker-train/src/sagemaker/train/sft_trainer.py @@ -270,10 +270,20 @@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None, validati if self.stopping_condition is not None: create_args["stopping_condition"] = self.stopping_condition + # Log the IAM role being used + logger.info(f"IAM Role ARN: {role}") + + # Log the full training job arguments for debugging + logger.info(f"TrainingJob.create() arguments: {create_args}") + try: training_job = TrainingJob.create(**create_args) except Exception as e: - logger.error("Error: %s", e) + logger.error(f"Error creating training job: {e}") + logger.error(f"Training job name: {current_training_job_name}") + logger.error(f"Serverless config: {serverless_config}") + logger.error(f"Role ARN: {role}") + logger.error(f"Full create_args: {create_args}") raise e if wait: From 02b25b10294a6ff8ae0dbeb1c532124b6d3bc501 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Fri, 8 May 2026 17:47:58 +0000 Subject: [PATCH 
6/9] Revert "debug: adding additonal logs to investigate tests" This reverts commit f71d183e61592d18635cdfa741f4ea3d339e3c66. --- sagemaker-train/src/sagemaker/train/rlvr_trainer.py | 13 +------------ sagemaker-train/src/sagemaker/train/sft_trainer.py | 12 +----------- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/rlvr_trainer.py b/sagemaker-train/src/sagemaker/train/rlvr_trainer.py index 5c37a1443e..c496222bf4 100644 --- a/sagemaker-train/src/sagemaker/train/rlvr_trainer.py +++ b/sagemaker-train/src/sagemaker/train/rlvr_trainer.py @@ -276,21 +276,10 @@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None, if self.stopping_condition is not None: create_args["stopping_condition"] = self.stopping_condition - # Log the IAM role being used - logger.info(f"IAM Role ARN: {role}") - - # Log the full training job arguments for debugging - logger.info(f"TrainingJob.create() arguments: {create_args}") - try: training_job = TrainingJob.create(**create_args) except Exception as e: - logger.error(f"Error creating training job: {e}") - logger.error(f"Training job name: {current_training_job_name}") - logger.error(f"Serverless config: {serverless_config}") - logger.error(f"Evaluator ARN: {evaluator_arn}") - logger.error(f"Role ARN: {role}") - logger.error(f"Full create_args: {create_args}") + logger.error("Error: %s", e) raise e if wait: diff --git a/sagemaker-train/src/sagemaker/train/sft_trainer.py b/sagemaker-train/src/sagemaker/train/sft_trainer.py index 3a53ad0acd..136231bd6f 100644 --- a/sagemaker-train/src/sagemaker/train/sft_trainer.py +++ b/sagemaker-train/src/sagemaker/train/sft_trainer.py @@ -270,20 +270,10 @@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None, validati if self.stopping_condition is not None: create_args["stopping_condition"] = self.stopping_condition - # Log the IAM role being used - logger.info(f"IAM Role ARN: {role}") - - # Log the full training job arguments 
for debugging - logger.info(f"TrainingJob.create() arguments: {create_args}") - try: training_job = TrainingJob.create(**create_args) except Exception as e: - logger.error(f"Error creating training job: {e}") - logger.error(f"Training job name: {current_training_job_name}") - logger.error(f"Serverless config: {serverless_config}") - logger.error(f"Role ARN: {role}") - logger.error(f"Full create_args: {create_args}") + logger.error("Error: %s", e) raise e if wait: From 1384d617f9779c4fd0c18aa8ae9baab48a6c3fea Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Fri, 8 May 2026 19:03:06 +0000 Subject: [PATCH 7/9] cleanup: remove extra debug logging from finetune_utils --- .../sagemaker/train/common_utils/finetune_utils.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py index 318b312ced..929cc56d7f 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py @@ -105,22 +105,13 @@ def _resolve_mlflow_resource_arn(sagemaker_session, mlflow_resource_arn: Optiona return mlflow_resource_arn try: - target_region = sagemaker_session.boto_session.region_name - logger.info("Resolving MLflow app for region: %s", target_region) mlflow_apps = MlflowApp.get_all( session=sagemaker_session.boto_session, - region=target_region + region=sagemaker_session.boto_session.region_name ) mlflow_apps_list = list(mlflow_apps) - logger.info("Found %d MLflow apps from ListMlflowApps API (region=%s):", len(mlflow_apps_list), target_region) - for app in mlflow_apps_list: - logger.info(" App ARN: %s | status: %s | account_default_status: %s", - getattr(app, 'arn', 'N/A'), - getattr(app, 'status', 'N/A'), - getattr(app, 'account_default_status', 'N/A')) - current_domain_id = _get_current_domain_id(sagemaker_session) # Check for domain match From 
877225c81c591b2b53b6cc0c3f03837e10af987c Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Fri, 8 May 2026 19:10:00 +0000 Subject: [PATCH 8/9] skip: sm-train nova tests to be enabled for us-east-1 --- .../tests/integ/train/test_rlvr_trainer_integration.py | 1 + .../tests/integ/train/test_sft_trainer_integration.py | 1 + 2 files changed, 2 insertions(+) diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index c9727c13e9..5214297f54 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -95,6 +95,7 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): assert training_job.output_model_package_arn is not None +@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1") def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): """Test RLVR training workflow with Nova model.""" # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index 11e49d0d4d..d2db227d30 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -89,6 +89,7 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session): assert hasattr(training_job, 'output_model_package_arn') +@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1") def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): """Test SFT trainer with Nova model.""" # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) From 95c9824c39440f6a2a54666f6050dfde798de48a Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Fri, 8 May 2026 23:55:01 +0000 Subject: [PATCH 9/9] fix: added unique names 
in sm-train tests to avoid Resource already exists error in CI --- .../integ/train/test_dpo_trainer_integration.py | 8 ++++++-- .../integ/train/test_rlaif_trainer_integration.py | 13 ++++++++++--- .../integ/train/test_rlvr_trainer_integration.py | 13 ++++++++++--- .../integ/train/test_sft_trainer_integration.py | 13 ++++++++++--- 4 files changed, 36 insertions(+), 11 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py index c976e0c875..96a1d7f4d4 100644 --- a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py @@ -24,6 +24,7 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session): """Test complete DPO training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" # Create DPOTrainer instance with comprehensive configuration trainer = DPOTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -31,7 +32,8 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session): model_package_group="sdk-test-finetuned-models", training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"dpo-lora-integ-{unique_id}", ) # Customize hyperparameters for quick training @@ -62,6 +64,7 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session): def test_dpo_trainer_with_validation_dataset(sagemaker_session): """Test DPO trainer with both training and validation datasets.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" dpo_trainer = DPOTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -70,7 +73,8 @@ def test_dpo_trainer_with_validation_dataset(sagemaker_session): training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", 
validation_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"dpo-val-integ-{unique_id}", ) # Customize hyperparameters for quick training diff --git a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py index bb0558e01c..103484c53f 100644 --- a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py @@ -14,6 +14,7 @@ from __future__ import absolute_import import time +import random import boto3 from sagemaker.core.helper.session_helper import Session from sagemaker.train.rlaif_trainer import RLAIFTrainer @@ -23,6 +24,7 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = RLAIFTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -34,7 +36,8 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): mlflow_run_name="test-rlaif-finetuned-models-run", training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-lora-integ-{unique_id}", ) # Create training job @@ -62,6 +65,7 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): """Test RLAIF trainer with different reward model and prompt.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = RLAIFTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -73,7 +77,8 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): 
mlflow_run_name="test-rlaif-finetuned-models-run", training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-rwd-integ-{unique_id}", ) training_job = rlaif_trainer.train(wait=False) @@ -100,6 +105,7 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): def test_rlaif_trainer_continued_finetuning(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = RLAIFTrainer( model="arn:aws:sagemaker:us-west-2:729646638167:model-package/sdk-test-finetuned-models/1", @@ -111,7 +117,8 @@ def test_rlaif_trainer_continued_finetuning(sagemaker_session): mlflow_run_name="test-rlaif-finetuned-models-run", training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-cont-integ-{unique_id}", ) # Create training job diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 5214297f54..951f9ab35d 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -14,6 +14,7 @@ from __future__ import absolute_import import time +import random import pytest import boto3 from sagemaker.core.helper.session_helper import Session @@ -23,6 +24,7 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLVR training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -32,7 +34,8 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): 
mlflow_run_name="test-rlvr-finetuned-models-run", training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlvr-lora-integ-{unique_id}", ) # Create training job @@ -60,6 +63,7 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): """Test RLVR trainer with custom reward function.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -70,7 +74,8 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", custom_reward_function="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlvr-test-rf/0.0.1", - accept_eula=True + accept_eula=True, + base_job_name=f"rlvr-rf-integ-{unique_id}", ) training_job = rlvr_trainer.train(wait=False) @@ -100,6 +105,7 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): """Test RLVR training workflow with Nova model.""" # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="nova-textgeneration-lite-v2", model_package_group="sdk-test-finetuned-models", @@ -110,7 +116,8 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1", accept_eula=True, - sagemaker_session=sagemaker_session_us_east_1 + sagemaker_session=sagemaker_session_us_east_1, + base_job_name=f"rlvr-nova-integ-{unique_id}", ) training_job = 
rlvr_trainer.train(wait=False) diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index d2db227d30..c4f94aba91 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -14,6 +14,7 @@ from __future__ import absolute_import import time +import random import pytest import boto3 from sagemaker.core.helper.session_helper import Session @@ -23,6 +24,7 @@ def test_sft_trainer_lora_complete_workflow(sagemaker_session): """Test complete SFT training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer = SFTTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -30,7 +32,8 @@ def test_sft_trainer_lora_complete_workflow(sagemaker_session): model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"sft-lora-integ-{unique_id}", ) # Create training job @@ -58,6 +61,7 @@ def test_sft_trainer_lora_complete_workflow(sagemaker_session): def test_sft_trainer_with_validation_dataset(sagemaker_session): """Test SFT trainer with both training and validation datasets.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer = SFTTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -65,7 +69,8 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session): model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", validation_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", - accept_eula=True + 
accept_eula=True, + base_job_name=f"sft-val-integ-{unique_id}", ) training_job = sft_trainer.train(wait=False) @@ -94,6 +99,7 @@ def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): """Test SFT trainer with Nova model.""" # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer_nova = SFTTrainer( model="nova-textgeneration-lite-v2", training_type=TrainingType.LORA, @@ -102,7 +108,8 @@ def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): mlflow_run_name="test-nova-finetuned-models-run", training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_200_samples.jsonl", s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", - sagemaker_session=sagemaker_session_us_east_1 + sagemaker_session=sagemaker_session_us_east_1, + base_job_name=f"sft-nova-integ-{unique_id}", ) # Create training job