diff --git a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py index 4aa67e3d28..929cc56d7f 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py @@ -105,6 +105,7 @@ def _resolve_mlflow_resource_arn(sagemaker_session, mlflow_resource_arn: Optiona return mlflow_resource_arn try: + mlflow_apps = MlflowApp.get_all( session=sagemaker_session.boto_session, region=sagemaker_session.boto_session.region_name diff --git a/sagemaker-train/tests/integ/train/conftest.py b/sagemaker-train/tests/integ/train/conftest.py index 9d9e7f53e0..1857a6262d 100644 --- a/sagemaker-train/tests/integ/train/conftest.py +++ b/sagemaker-train/tests/integ/train/conftest.py @@ -38,3 +38,13 @@ def sagemaker_session(): if region_manual_set and "AWS_DEFAULT_REGION" in os.environ: del os.environ["AWS_DEFAULT_REGION"] + + +NOVA_REGION = "us-east-1" + + +@pytest.fixture(scope="module") +def sagemaker_session_us_east_1(): + """Create a SageMaker session in us-east-1 for Nova model tests.""" + boto_session = boto3.Session(region_name=NOVA_REGION) + return Session(boto_session=boto_session) diff --git a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py index 65cbd6c246..96a1d7f4d4 100644 --- a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py @@ -22,17 +22,18 @@ import pytest -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_dpo_trainer_lora_complete_workflow(sagemaker_session): """Test complete DPO training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" # Create DPOTrainer instance with comprehensive configuration trainer = DPOTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"dpo-lora-integ-{unique_id}", ) # Customize hyperparameters for quick training @@ -61,18 +62,19 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_dpo_trainer_with_validation_dataset(sagemaker_session): """Test DPO trainer with both training and validation datasets.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" dpo_trainer = DPOTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", - validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", + validation_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"dpo-val-integ-{unique_id}", ) # Customize hyperparameters for quick training diff --git a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py index 296d62bfd8..103484c53f 100644 --- a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py @@ -14,6 +14,7 @@ from __future__ import absolute_import import time +import random import boto3 from sagemaker.core.helper.session_helper import Session from sagemaker.train.rlaif_trainer import RLAIFTrainer @@ -21,9 +22,9 @@ import pytest -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = RLAIFTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -33,9 +34,10 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): reward_prompt='Builtin.Summarize', mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-lora-integ-{unique_id}", ) # Create training job @@ -61,9 +63,9 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): """Test RLAIF trainer with different reward model and prompt.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = RLAIFTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -73,9 +75,10 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): reward_prompt="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlaif-test-prompt/0.0.1", mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-rwd-integ-{unique_id}", ) training_job = rlaif_trainer.train(wait=False) @@ -100,9 +103,9 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_continued_finetuning(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlaif_trainer = RLAIFTrainer( model="arn:aws:sagemaker:us-west-2:729646638167:model-package/sdk-test-finetuned-models/1", @@ -112,9 +115,10 @@ def test_rlaif_trainer_continued_finetuning(sagemaker_session): reward_prompt='Builtin.Summarize', mlflow_experiment_name="test-rlaif-finetuned-models-exp", mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlaif-cont-integ-{unique_id}", ) # Create training job diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 63d3ae3134..951f9ab35d 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -13,8 +13,8 @@ """Integration tests for RLVR trainer""" from __future__ import absolute_import -import os import time +import random import pytest import boto3 from sagemaker.core.helper.session_helper import Session @@ -22,9 +22,9 @@ from sagemaker.train.common import TrainingType -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLVR training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -32,9 +32,10 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-rlvr-finetuned-models-exp", mlflow_run_name="test-rlvr-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"rlvr-lora-integ-{unique_id}", ) # Create training job @@ -60,9 +61,9 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): """Test RLVR trainer with custom reward function.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", @@ -70,10 +71,11 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-rlvr-finetuned-models-exp", mlflow_run_name="test-rlvr-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", custom_reward_function="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlvr-test-rf/0.0.1", - accept_eula=True + accept_eula=True, + base_job_name=f"rlvr-rf-integ-{unique_id}", ) training_job = rlvr_trainer.train(wait=False) @@ -98,14 +100,12 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): assert training_job.output_model_package_arn is not None -# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1") -@pytest.mark.skip(reason="Skipping GPU resource intensive test") -def test_rlvr_trainer_nova_workflow(sagemaker_session): +@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1") +def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): """Test RLVR training workflow with Nova model.""" - import os - os.environ['SAGEMAKER_REGION'] = 'us-east-1' + # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) - # For fine-tuning + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" rlvr_trainer = RLVRTrainer( model="nova-textgeneration-lite-v2", model_package_group="sdk-test-finetuned-models", @@ -115,12 +115,10 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session): validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1", - accept_eula=True + accept_eula=True, + sagemaker_session=sagemaker_session_us_east_1, + base_job_name=f"rlvr-nova-integ-{unique_id}", ) - rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket' - - rlvr_trainer.hyperparameters.reward_lambda_arn = 'arn:aws:lambda:us-east-1:729646638167:function:rlvr-nova-reward-function' - training_job = rlvr_trainer.train(wait=False) # Manual wait loop diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index 98dd154c3f..c4f94aba91 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -13,8 +13,8 @@ """Integration tests for SFT trainer""" from __future__ import absolute_import -import os import time +import random import pytest import boto3 from sagemaker.core.helper.session_helper import Session @@ -22,17 +22,18 @@ from sagemaker.train.common import TrainingType -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_lora_complete_workflow(sagemaker_session): """Test complete SFT training workflow with LORA.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer = SFTTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", + training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True + accept_eula=True, + base_job_name=f"sft-lora-integ-{unique_id}", ) # Create training job @@ -58,17 +59,18 @@ def test_sft_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_with_validation_dataset(sagemaker_session): """Test SFT trainer with both training and validation datasets.""" + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer = SFTTrainer( model="meta-textgeneration-llama-3-2-1b-instruct", training_type=TrainingType.LORA, model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", - validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", - accept_eula=True + training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", + validation_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl", + accept_eula=True, + base_job_name=f"sft-val-integ-{unique_id}", ) training_job = sft_trainer.train(wait=False) @@ -92,22 +94,22 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session): assert hasattr(training_job, 'output_model_package_arn') -# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1") -@pytest.mark.skip(reason="Skipping GPU resource intensive test") -def test_sft_trainer_nova_workflow(sagemaker_session): +@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1") +def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): """Test SFT trainer with Nova model.""" - import os - os.environ['SAGEMAKER_REGION'] = 'us-east-1' + # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) - # For fine-tuning + unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}" sft_trainer_nova = SFTTrainer( model="nova-textgeneration-lite-v2", training_type=TrainingType.LORA, model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-nova-finetuned-models-exp", mlflow_run_name="test-nova-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/DataSet/sft-nova-test-dataset/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/" + training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_200_samples.jsonl", + s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", + sagemaker_session=sagemaker_session_us_east_1, + base_job_name=f"sft-nova-integ-{unique_id}", ) # Create training job