From 38445f6cac1b514f367688d20edbf3fef7dfda4c Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Tue, 5 May 2026 20:35:03 +0000 Subject: [PATCH 1/2] debug: unskip all integ tests --- .../tests/integ/image_retriever/test_image_retriever.py | 5 ----- .../tests/integ/remote_function/test_decorator.py | 4 ---- .../tests/integ/train/test_benchmark_evaluator.py | 9 --------- .../tests/integ/train/test_custom_scorer_evaluator.py | 6 ------ .../tests/integ/train/test_dpo_trainer_integration.py | 2 -- .../tests/integ/train/test_llm_as_judge_evaluator.py | 9 --------- sagemaker-train/tests/integ/train/test_model_trainer.py | 2 +- .../tests/integ/train/test_rlaif_trainer_integration.py | 3 --- .../tests/integ/train/test_rlvr_trainer_integration.py | 4 ---- 9 files changed, 1 insertion(+), 43 deletions(-) diff --git a/sagemaker-core/tests/integ/image_retriever/test_image_retriever.py b/sagemaker-core/tests/integ/image_retriever/test_image_retriever.py index 5ca5a35a28..0a7ff24cfe 100644 --- a/sagemaker-core/tests/integ/image_retriever/test_image_retriever.py +++ b/sagemaker-core/tests/integ/image_retriever/test_image_retriever.py @@ -12,7 +12,6 @@ from sagemaker.core.config.config_manager import SageMakerConfig -@pytest.mark.skip("Disabling this for now, Need to be fixed") @pytest.mark.integ def test_retrieve_image_uri(): image_uri = ImageRetriever.retrieve("clarify", "us-west-2") @@ -56,7 +55,6 @@ def test_retrieve_image_uri(): ) -@pytest.mark.skip("Disabling this for now, Need to be fixed") @pytest.mark.integ def test_retrieve_pytorch_uri(): image_uri = ImageRetriever.retrieve_pytorch_uri( @@ -72,7 +70,6 @@ def test_retrieve_pytorch_uri(): ) -@pytest.mark.skip("Disabling this for now, Need to be fixed") @pytest.mark.integ def test_retrieve_hugging_face_uri(): image_uri = ImageRetriever.retrieve_hugging_face_uri( @@ -88,14 +85,12 @@ def test_retrieve_hugging_face_uri(): ":2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" -@pytest.mark.skip("Disabling this for now, Need to be fixed") @pytest.mark.integ def test_retrieve_base_python_image_uri(): image_uri = ImageRetriever.retrieve_base_python_image_uri() assert image_uri == "236514542706.dkr.ecr.us-west-2.amazonaws.com/sagemaker-base-python-310:1.0" -@pytest.mark.skip("Disabling this for now, Need to be fixed") @pytest.mark.integ @patch.object(SageMakerConfig, "resolve_value_from_config") def test_retrieve_image_uri_intelligent_default(mock_load_config): diff --git a/sagemaker-core/tests/integ/remote_function/test_decorator.py b/sagemaker-core/tests/integ/remote_function/test_decorator.py index 8e1a8c061c..5c5db81e68 100644 --- a/sagemaker-core/tests/integ/remote_function/test_decorator.py +++ b/sagemaker-core/tests/integ/remote_function/test_decorator.py @@ -118,8 +118,6 @@ def divide(x, y): divide(10, 2) -# TODO: add VPC settings, update SageMakerRole with KMS permissions -@pytest.mark.skip def test_advanced_job_setting( sagemaker_session, dummy_container_without_error, cpu_instance_type, s3_kms_key ): @@ -552,7 +550,6 @@ def my_func(): assert client_error_message in str(error) -@pytest.mark.skip def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type): @remote( role=ROLE, @@ -578,7 +575,6 @@ def test_spark_transform(): test_spark_transform() -@pytest.mark.skip def test_decorator_auto_capture(sagemaker_session, auto_capture_test_container): """ This test runs a docker container. 
The Container invocation will execute a python script diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index 454fdd67c7..f84808dad4 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -72,7 +72,6 @@ } -@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/") class TestBenchmarkEvaluatorIntegration: """Integration tests for BenchmarkEvaluator with fine-tuned model package""" @@ -286,16 +285,12 @@ def test_benchmark_subtasks_validation(self): logger.info("Subtask validation tests passed") - @pytest.mark.skip(reason="Base model only evaluation - to be enabled when needed") def test_benchmark_evaluation_base_model_only(self): """ Test benchmark evaluation with base model only (no fine-tuned model). This test uses a JumpStart model ID directly instead of a model package ARN. Configuration from commented section in benchmark_demo.ipynb. - - Note: This test is currently skipped. Remove the @pytest.mark.skip decorator - when you want to enable it. """ # Get benchmarks Benchmark = get_benchmarks() @@ -339,16 +334,12 @@ def test_benchmark_evaluation_base_model_only(self): assert execution.status.overall_status == "Succeeded" logger.info("Base model only evaluation completed successfully") - @pytest.mark.skip(reason="Nova model evaluation - to be enabled when needed") def test_benchmark_evaluation_nova_model(self): """ Test benchmark evaluation with Nova model. This test uses a Nova fine-tuned model package in us-east-1 region. Configuration from commented section in benchmark_demo.ipynb. - - Note: This test is currently skipped. Remove the @pytest.mark.skip decorator - when you want to enable it. """ # Get benchmarks Benchmark = get_benchmarks() diff --git a/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py b/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py index 0af4ca1838..d2d4388cbf 100644 --- a/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_custom_scorer_evaluator.py @@ -55,7 +55,6 @@ } -@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/") class TestCustomScorerEvaluatorIntegration: """Integration tests for CustomScorerEvaluator with custom evaluator""" @@ -233,16 +232,12 @@ def test_custom_scorer_evaluator_validation(self): logger.info("Validation tests passed") - @pytest.mark.skip(reason="Built-in metric evaluation - to be enabled when needed") def test_custom_scorer_with_builtin_metric(self): """ Test custom scorer evaluation with built-in metric. This test uses a built-in metric (PRIME_MATH) instead of a custom evaluator ARN. Configuration adapted from commented section in custom_scorer_demo.ipynb. - - Note: This test is currently skipped. Remove the @pytest.mark.skip decorator - when you want to enable it. """ # Get built-in metrics BuiltInMetric = get_builtin_metrics() @@ -285,7 +280,6 @@ def test_custom_scorer_with_builtin_metric(self): assert execution.status.overall_status == "Succeeded" logger.info("Built-in metric evaluation completed successfully") - @pytest.mark.skip(reason="Base model only evaluation - not working yet per notebook") def test_custom_scorer_base_model_only(self): """ Test custom scorer evaluation with base model only (no fine-tuned model). 
diff --git a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py index 65cbd6c246..d73568d580 100644 --- a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py @@ -22,7 +22,6 @@ import pytest -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_dpo_trainer_lora_complete_workflow(sagemaker_session): """Test complete DPO training workflow with LORA.""" # Create DPOTrainer instance with comprehensive configuration @@ -61,7 +60,6 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_dpo_trainer_with_validation_dataset(sagemaker_session): """Test DPO trainer with both training and validation datasets.""" diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py index 49a68c22d9..6b10a75bf2 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_evaluator.py @@ -84,7 +84,6 @@ } -@pytest.mark.skip(reason="Temporarily skipped - moved from tests/integ/sagemaker/modules/evaluate/") class TestLLMAsJudgeEvaluatorIntegration: """Integration tests for LLMAsJudgeEvaluator""" @@ -254,15 +253,11 @@ def test_llm_as_judge_builtin_metrics_prefix_handling(self): logger.info("Built-in metrics prefix handling tests passed") - @pytest.mark.skip(reason="Built-in metrics only test - to be enabled when needed") def test_llm_as_judge_builtin_metrics_only(self): """ Test LLM-as-Judge evaluation with only built-in metrics (no custom metrics). This test uses only built-in metrics without custom metrics. - - Note: This test is currently skipped. Remove the @pytest.mark.skip decorator - when you want to enable it. """ logger.info("Creating LLMAsJudgeEvaluator with built-in metrics only") @@ -302,15 +297,11 @@ def test_llm_as_judge_builtin_metrics_only(self): assert execution.status.overall_status == "Succeeded" logger.info("Built-in metrics only evaluation completed successfully") - @pytest.mark.skip(reason="Custom metrics only test - to be enabled when needed") def test_llm_as_judge_custom_metrics_only(self): """ Test LLM-as-Judge evaluation with only custom metrics (no built-in metrics). This test uses only custom metrics without built-in metrics. - - Note: This test is currently skipped. Remove the @pytest.mark.skip decorator - when you want to enable it. 
""" logger.info("Creating LLMAsJudgeEvaluator with custom metrics only") diff --git a/sagemaker-train/tests/integ/train/test_model_trainer.py b/sagemaker-train/tests/integ/train/test_model_trainer.py index f37d009de8..8692103afe 100644 --- a/sagemaker-train/tests/integ/train/test_model_trainer.py +++ b/sagemaker-train/tests/integ/train/test_model_trainer.py @@ -96,7 +96,7 @@ def test_hp_contract_basic_sh_script(sagemaker_session): # skip this test for now as requirments.txt is not resolved -@pytest.mark.skip +# @pytest.mark.skip def test_hp_contract_mpi_script(sagemaker_session): compute = Compute(instance_type="ml.m5.xlarge", instance_count=2) model_trainer = ModelTrainer( diff --git a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py index 296d62bfd8..b36a92f530 100644 --- a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py @@ -21,7 +21,6 @@ import pytest -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" @@ -61,7 +60,6 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): """Test RLAIF trainer with different reward model and prompt.""" @@ -100,7 +98,6 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlaif_trainer_continued_finetuning(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 63d3ae3134..23da854ee6 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -22,7 +22,6 @@ from sagemaker.train.common import TrainingType -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLVR training workflow with LORA.""" @@ -60,7 +59,6 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): assert training_job.output_model_package_arn is not None -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): """Test RLVR trainer with custom reward function.""" @@ -98,8 +96,6 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): assert training_job.output_model_package_arn is not None -# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1") -@pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_rlvr_trainer_nova_workflow(sagemaker_session): """Test RLVR training workflow with Nova model.""" import os From 07d38db6d6af3c696d0147666c67cedd22792ab5 Mon Sep 17 00:00:00 2001 From: Syed Jafri Date: Tue, 5 May 2026 23:12:07 +0000 Subject: [PATCH 2/2] fix image retriever tests + add logging for sm-train --- .../core/image_retriever/image_retriever.py | 6 +- 
.../bootstrap_runtime_environment.py | 2 +- .../image_retriever/test_image_retriever.py | 4 +- .../train/test_dpo_trainer_integration.py | 182 +++++++----- .../tests/integ/train/test_model_trainer.py | 58 +++- .../train/test_rlaif_trainer_integration.py | 270 ++++++++++------- .../train/test_rlvr_trainer_integration.py | 275 +++++++++++------- .../train/test_sft_trainer_integration.py | 252 ++++++++++------ 8 files changed, 661 insertions(+), 388 deletions(-) diff --git a/sagemaker-core/src/sagemaker/core/image_retriever/image_retriever.py b/sagemaker-core/src/sagemaker/core/image_retriever/image_retriever.py index c4c2f5a45e..ca6295dd7f 100644 --- a/sagemaker-core/src/sagemaker/core/image_retriever/image_retriever.py +++ b/sagemaker-core/src/sagemaker/core/image_retriever/image_retriever.py @@ -52,6 +52,8 @@ class ImageRetriever: + _config = SageMakerConfig() + @staticmethod def retrieve_hugging_face_uri( region: str, @@ -110,7 +112,7 @@ def retrieve_hugging_face_uri( args = dict(locals()) for name, val in args.items(): if name in CONFIGURABLE_ATTRIBUTES and not val: - default_value = SageMakerConfig.resolve_value_from_config( + default_value = ImageRetriever._config.resolve_value_from_config( config_path=_simple_path( SAGEMAKER, MODULES, IMAGE_RETRIEVER, to_camel_case(name) ) @@ -499,7 +501,7 @@ def retrieve( args = dict(locals()) for name, val in args.items(): if name in CONFIGURABLE_ATTRIBUTES and not val: - default_value = SageMakerConfig.resolve_value_from_config( + default_value = ImageRetriever._config.resolve_value_from_config( config_path=_simple_path( SAGEMAKER, MODULES, IMAGE_RETRIEVER, to_camel_case(name) ) diff --git a/sagemaker-core/src/sagemaker/core/remote_function/runtime_environment/bootstrap_runtime_environment.py b/sagemaker-core/src/sagemaker/core/remote_function/runtime_environment/bootstrap_runtime_environment.py index 2c20151ed1..ed65a61d84 100644 --- a/sagemaker-core/src/sagemaker/core/remote_function/runtime_environment/bootstrap_runtime_environment.py +++ b/sagemaker-core/src/sagemaker/core/remote_function/runtime_environment/bootstrap_runtime_environment.py @@ -43,7 +43,7 @@ REMOTE_FUNCTION_WORKSPACE = "sm_rf_user_ws" BASE_CHANNEL_PATH = "/opt/ml/input/data" FAILURE_REASON_PATH = "/opt/ml/output/failure" -JOB_OUTPUT_DIRS = ["/opt/ml/input", "/opt/ml/output", "/opt/ml/model", "/tmp"] +JOB_OUTPUT_DIRS = ["/opt/ml/input", "/opt/ml/output", "/opt/ml/model", "/tmp", "/opt/ml/checkpoints"] PRE_EXECUTION_SCRIPT_NAME = "pre_exec.sh" JOB_REMOTE_FUNCTION_WORKSPACE = "sagemaker_remote_function_workspace" SCRIPT_AND_DEPENDENCIES_CHANNEL_NAME = "pre_exec_script_and_dependencies" diff --git a/sagemaker-core/tests/integ/image_retriever/test_image_retriever.py b/sagemaker-core/tests/integ/image_retriever/test_image_retriever.py index 0a7ff24cfe..1ce37e06fa 100644 --- a/sagemaker-core/tests/integ/image_retriever/test_image_retriever.py +++ b/sagemaker-core/tests/integ/image_retriever/test_image_retriever.py @@ -87,14 +87,14 @@ def test_retrieve_hugging_face_uri(): @pytest.mark.integ def test_retrieve_base_python_image_uri(): - image_uri = ImageRetriever.retrieve_base_python_image_uri() + image_uri = ImageRetriever.retrieve_base_python_image_uri(region="us-west-2") assert image_uri == "236514542706.dkr.ecr.us-west-2.amazonaws.com/sagemaker-base-python-310:1.0" @pytest.mark.integ @patch.object(SageMakerConfig, "resolve_value_from_config") def test_retrieve_image_uri_intelligent_default(mock_load_config): - def custom_return(config_path): + def custom_return(config_path=None, 
**kwargs): if config_path == _simple_path( SAGEMAKER, PYTHON_SDK, MODULES, IMAGE_RETRIEVER, "ImageScope" ): diff --git a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py index d73568d580..31480cb9f8 100644 --- a/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_dpo_trainer_integration.py @@ -14,6 +14,8 @@ from __future__ import absolute_import import time +import logging +import traceback import random import boto3 from sagemaker.core.helper.session_helper import Session @@ -21,78 +23,122 @@ from sagemaker.train.common import TrainingType import pytest +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.DEBUG) + def test_dpo_trainer_lora_complete_workflow(sagemaker_session): """Test complete DPO training workflow with LORA.""" - # Create DPOTrainer instance with comprehensive configuration - trainer = DPOTrainer( - model="meta-textgeneration-llama-3-2-1b-instruct", - training_type=TrainingType.LORA, - model_package_group="sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True - ) - - # Customize hyperparameters for quick training - trainer.hyperparameters.max_epochs = 1 - - # Create training job - training_job = trainer.train(wait=False) - - # Manual wait loop to avoid resource_config issue - max_wait_time = 3600 # 1 hour timeout - poll_interval = 30 # Check every 30 seconds - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_dpo_trainer_lora_complete_workflow ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + # Create DPOTrainer instance with comprehensive configuration + trainer = DPOTrainer( + model="meta-textgeneration-llama-3-2-1b-instruct", + training_type=TrainingType.LORA, + model_package_group="sdk-test-finetuned-models", + training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + s3_output_path="s3://mc-flows-sdk-testing/output/", + accept_eula=True + ) + logger.info(f"DPOTrainer created: model={trainer.model}, training_type={trainer.training_type}") + + # Customize hyperparameters for quick training + trainer.hyperparameters.max_epochs = 1 + logger.info(f"Set max_epochs=1") + + # Create training job + logger.info("Calling trainer.train(wait=False)...") + training_job = trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop to avoid resource_config issue + max_wait_time = 3600 # 1 hour timeout + poll_interval = 30 # Check every 30 seconds + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + 
logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_dpo_trainer_lora_complete_workflow FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_dpo_trainer_lora_complete_workflow - PASSED ===") def test_dpo_trainer_with_validation_dataset(sagemaker_session): """Test DPO trainer with both training and validation datasets.""" - - dpo_trainer = DPOTrainer( - model="meta-textgeneration-llama-3-2-1b-instruct", - training_type=TrainingType.LORA, - model_package_group="sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", - validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True - ) - - # Customize hyperparameters for quick training - dpo_trainer.hyperparameters.max_epochs = 1 - - training_job = dpo_trainer.train(wait=False) - - # Manual wait loop - max_wait_time = 3600 - poll_interval = 30 - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_dpo_trainer_with_validation_dataset ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + dpo_trainer = DPOTrainer( + model="meta-textgeneration-llama-3-2-1b-instruct", + training_type=TrainingType.LORA, + model_package_group="sdk-test-finetuned-models", + training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1", + s3_output_path="s3://mc-flows-sdk-testing/output/", + accept_eula=True + ) + logger.info(f"DPOTrainer created with validation dataset") + + # Customize hyperparameters for quick training + dpo_trainer.hyperparameters.max_epochs = 1 + logger.info(f"Set max_epochs=1") + + logger.info("Calling dpo_trainer.train(wait=False)...") + training_job = dpo_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop + max_wait_time = 3600 + poll_interval = 30 + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: 
{training_job.training_job_status}")
+        if training_job.training_job_status == "Failed":
+            failure_reason = getattr(training_job, 'failure_reason', 'N/A')
+            logger.error(f"Training job FAILED. Failure reason: {failure_reason}")
+
+        # Verify job completed successfully
+        assert training_job.training_job_status == "Completed"
+        assert hasattr(training_job, 'output_model_package_arn')
+        assert training_job.output_model_package_arn is not None
+        logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}")
+    except Exception as e:
+        logger.error(f"test_dpo_trainer_with_validation_dataset FAILED: {type(e).__name__}: {e}")
+        logger.error(traceback.format_exc())
+        raise
+    logger.info("=== END test_dpo_trainer_with_validation_dataset - PASSED ===")
diff --git a/sagemaker-train/tests/integ/train/test_model_trainer.py b/sagemaker-train/tests/integ/train/test_model_trainer.py
index 8692103afe..2e33e56064 100644
--- a/sagemaker-train/tests/integ/train/test_model_trainer.py
+++ b/sagemaker-train/tests/integ/train/test_model_trainer.py
@@ -14,12 +14,17 @@
 from __future__ import absolute_import
 
 import os
+import logging
+import traceback
 import pytest
 
 from sagemaker.train.model_trainer import ModelTrainer
 from sagemaker.train.configs import SourceCode, Compute
 from sagemaker.train.distributed import MPI, Torchrun, DistributedConfig
 
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.DEBUG)
+
 DATA_DIR = os.path.join(os.path.dirname(__file__), "../..", "data")
 EXPECTED_HYPERPARAMETERS = {
     "integer": 1,
@@ -98,18 +103,47 @@ def test_hp_contract_basic_sh_script(sagemaker_session):
 
 
 # skip this test for now as requirements.txt is not resolved
 # @pytest.mark.skip
 def test_hp_contract_mpi_script(sagemaker_session):
-    compute = Compute(instance_type="ml.m5.xlarge", instance_count=2)
-    model_trainer = ModelTrainer(
-        sagemaker_session=sagemaker_session,
-        training_image=DEFAULT_CPU_IMAGE,
-        compute=compute,
-        hyperparameters=EXPECTED_HYPERPARAMETERS,
-        source_code=PARAM_SCRIPT_SOURCE_CODE,
-        distributed=MPI(),
-        base_job_name="hp-contract-mpi-script",
-    )
-
-    model_trainer.train()
+    logger.info("=== START test_hp_contract_mpi_script ===")
+    logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}")
+    logger.info(f"PARAM_SCRIPT_SOURCE_DIR: {PARAM_SCRIPT_SOURCE_DIR}")
+    logger.info(f"Source dir exists: {os.path.exists(PARAM_SCRIPT_SOURCE_DIR)}")
+
+    requirements_path = os.path.join(PARAM_SCRIPT_SOURCE_DIR, "requirements.txt")
+    logger.info(f"requirements.txt path: {requirements_path}")
+    logger.info(f"requirements.txt exists: {os.path.exists(requirements_path)}")
+    if os.path.exists(requirements_path):
+        with open(requirements_path, "r") as f:
+            logger.info(f"requirements.txt contents:\n{f.read()}")
+
+    # Log all files in source dir
+    if os.path.exists(PARAM_SCRIPT_SOURCE_DIR):
+        logger.info(f"Files in source dir: {os.listdir(PARAM_SCRIPT_SOURCE_DIR)}")
+
+    try:
+        compute = Compute(instance_type="ml.m5.xlarge", instance_count=2)
+        logger.info(f"Compute config: instance_type=ml.m5.xlarge, instance_count=2")
+        logger.info(f"Source code: source_dir={PARAM_SCRIPT_SOURCE_CODE.source_dir}, requirements={PARAM_SCRIPT_SOURCE_CODE.requirements}, entry_script={PARAM_SCRIPT_SOURCE_CODE.entry_script}")
+
+        model_trainer = ModelTrainer(
+            sagemaker_session=sagemaker_session,
+            training_image=DEFAULT_CPU_IMAGE,
+            compute=compute,
+            hyperparameters=EXPECTED_HYPERPARAMETERS,
+            source_code=PARAM_SCRIPT_SOURCE_CODE,
+            distributed=MPI(),
+            
base_job_name="hp-contract-mpi-script", + ) + logger.info(f"ModelTrainer created successfully") + logger.info(f"ModelTrainer training_image: {model_trainer.training_image}") + + logger.info("Calling model_trainer.train()...") + model_trainer.train() + logger.info("model_trainer.train() completed successfully") + except Exception as e: + logger.error(f"test_hp_contract_mpi_script FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_hp_contract_mpi_script - PASSED ===") def test_hp_contract_torchrun_script(sagemaker_session): diff --git a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py index b36a92f530..aa924b381e 100644 --- a/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlaif_trainer_integration.py @@ -14,124 +14,186 @@ from __future__ import absolute_import import time +import logging +import traceback import boto3 from sagemaker.core.helper.session_helper import Session from sagemaker.train.rlaif_trainer import RLAIFTrainer from sagemaker.train.common import TrainingType import pytest +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.DEBUG) + def test_rlaif_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" - - rlaif_trainer = RLAIFTrainer( - model="meta-textgeneration-llama-3-2-1b-instruct", - training_type=TrainingType.LORA, - model_package_group="sdk-test-finetuned-models", - reward_model_id='openai.gpt-oss-120b-1:0', - reward_prompt='Builtin.Summarize', - mlflow_experiment_name="test-rlaif-finetuned-models-exp", - mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True - ) - - # Create training job - training_job = rlaif_trainer.train(wait=False) - - # Manual wait loop to avoid resource_config issue - max_wait_time = 3600 # 1 hour timeout - poll_interval = 30 # Check every 30 seconds - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_rlaif_trainer_lora_complete_workflow ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + rlaif_trainer = RLAIFTrainer( + model="meta-textgeneration-llama-3-2-1b-instruct", + training_type=TrainingType.LORA, + model_package_group="sdk-test-finetuned-models", + reward_model_id='openai.gpt-oss-120b-1:0', + reward_prompt='Builtin.Summarize', + mlflow_experiment_name="test-rlaif-finetuned-models-exp", + mlflow_run_name="test-rlaif-finetuned-models-run", + training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + s3_output_path="s3://mc-flows-sdk-testing/output/", + accept_eula=True + ) + logger.info(f"RLAIFTrainer created: model={rlaif_trainer.model}, training_type={rlaif_trainer.training_type}") + + # Create training job + logger.info("Calling 
rlaif_trainer.train(wait=False)...") + training_job = rlaif_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop to avoid resource_config issue + max_wait_time = 3600 # 1 hour timeout + poll_interval = 30 # Check every 30 seconds + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_rlaif_trainer_lora_complete_workflow FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_rlaif_trainer_lora_complete_workflow - PASSED ===") def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session): """Test RLAIF trainer with different reward model and prompt.""" - - rlaif_trainer = RLAIFTrainer( - model="meta-textgeneration-llama-3-2-1b-instruct", - training_type=TrainingType.LORA, - model_package_group="sdk-test-finetuned-models", - reward_model_id='openai.gpt-oss-120b-1:0', - reward_prompt="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlaif-test-prompt/0.0.1", - mlflow_experiment_name="test-rlaif-finetuned-models-exp", - mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True - ) - - training_job = rlaif_trainer.train(wait=False) - - # Manual wait loop - max_wait_time = 3600 - poll_interval = 30 - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_rlaif_trainer_with_custom_reward_settings ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + rlaif_trainer = RLAIFTrainer( + model="meta-textgeneration-llama-3-2-1b-instruct", + training_type=TrainingType.LORA, + model_package_group="sdk-test-finetuned-models", + reward_model_id='openai.gpt-oss-120b-1:0', + reward_prompt="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlaif-test-prompt/0.0.1", + mlflow_experiment_name="test-rlaif-finetuned-models-exp", + mlflow_run_name="test-rlaif-finetuned-models-run", + training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + 
s3_output_path="s3://mc-flows-sdk-testing/output/", + accept_eula=True + ) + logger.info(f"RLAIFTrainer created with custom reward prompt ARN") + + logger.info("Calling rlaif_trainer.train(wait=False)...") + training_job = rlaif_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop + max_wait_time = 3600 + poll_interval = 30 + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_rlaif_trainer_with_custom_reward_settings FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_rlaif_trainer_with_custom_reward_settings - PASSED ===") def test_rlaif_trainer_continued_finetuning(sagemaker_session): """Test complete RLAIF training workflow with LORA.""" - - rlaif_trainer = RLAIFTrainer( - model="arn:aws:sagemaker:us-west-2:729646638167:model-package/sdk-test-finetuned-models/1", - training_type=TrainingType.LORA, - model_package_group="sdk-test-finetuned-models", - reward_model_id='openai.gpt-oss-120b-1:0', - reward_prompt='Builtin.Summarize', - mlflow_experiment_name="test-rlaif-finetuned-models-exp", - mlflow_run_name="test-rlaif-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True - ) - - # Create training job - training_job = rlaif_trainer.train(wait=False) - - # Manual wait loop to avoid resource_config issue - max_wait_time = 3600 # 1 hour timeout - poll_interval = 30 # Check every 30 seconds - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_rlaif_trainer_continued_finetuning ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + rlaif_trainer = RLAIFTrainer( + model="arn:aws:sagemaker:us-west-2:729646638167:model-package/sdk-test-finetuned-models/1", + training_type=TrainingType.LORA, + model_package_group="sdk-test-finetuned-models", + reward_model_id='openai.gpt-oss-120b-1:0', + reward_prompt='Builtin.Summarize', + mlflow_experiment_name="test-rlaif-finetuned-models-exp", + mlflow_run_name="test-rlaif-finetuned-models-run", + 
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + s3_output_path="s3://mc-flows-sdk-testing/output/", + accept_eula=True + ) + logger.info(f"RLAIFTrainer created for continued finetuning from model-package ARN") + + # Create training job + logger.info("Calling rlaif_trainer.train(wait=False)...") + training_job = rlaif_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop to avoid resource_config issue + max_wait_time = 3600 # 1 hour timeout + poll_interval = 30 # Check every 30 seconds + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_rlaif_trainer_continued_finetuning FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_rlaif_trainer_continued_finetuning - PASSED ===") diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 23da854ee6..9e683f3a20 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -15,125 +15,190 @@ import os import time +import logging +import traceback import pytest import boto3 from sagemaker.core.helper.session_helper import Session from sagemaker.train.rlvr_trainer import RLVRTrainer from sagemaker.train.common import TrainingType +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.DEBUG) + def test_rlvr_trainer_lora_complete_workflow(sagemaker_session): """Test complete RLVR training workflow with LORA.""" - - rlvr_trainer = RLVRTrainer( - model="meta-textgeneration-llama-3-2-1b-instruct", - training_type=TrainingType.LORA, - model_package_group="sdk-test-finetuned-models", - mlflow_experiment_name="test-rlvr-finetuned-models-exp", - mlflow_run_name="test-rlvr-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True - ) - - # Create training job - training_job = rlvr_trainer.train(wait=False) - - # Manual wait loop to avoid resource_config issue - max_wait_time = 3600 # 1 hour timeout - poll_interval = 30 # Check every 30 seconds - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert 
training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_rlvr_trainer_lora_complete_workflow ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + rlvr_trainer = RLVRTrainer( + model="meta-textgeneration-llama-3-2-1b-instruct", + training_type=TrainingType.LORA, + model_package_group="sdk-test-finetuned-models", + mlflow_experiment_name="test-rlvr-finetuned-models-exp", + mlflow_run_name="test-rlvr-finetuned-models-run", + training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + s3_output_path="s3://mc-flows-sdk-testing/output/", + accept_eula=True + ) + logger.info(f"RLVRTrainer created: model={rlvr_trainer.model}, training_type={rlvr_trainer.training_type}") + + # Create training job + logger.info("Calling rlvr_trainer.train(wait=False)...") + training_job = rlvr_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop to avoid resource_config issue + max_wait_time = 3600 # 1 hour timeout + poll_interval = 30 # Check every 30 seconds + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. 
Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_rlvr_trainer_lora_complete_workflow FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_rlvr_trainer_lora_complete_workflow - PASSED ===") def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): """Test RLVR trainer with custom reward function.""" - - rlvr_trainer = RLVRTrainer( - model="meta-textgeneration-llama-3-2-1b-instruct", - training_type=TrainingType.LORA, - model_package_group="sdk-test-finetuned-models", - mlflow_experiment_name="test-rlvr-finetuned-models-exp", - mlflow_run_name="test-rlvr-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing/output/", - custom_reward_function="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlvr-test-rf/0.0.1", - accept_eula=True - ) - - training_job = rlvr_trainer.train(wait=False) - - # Manual wait loop - max_wait_time = 3600 - poll_interval = 30 - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_rlvr_trainer_with_custom_reward_function ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + rlvr_trainer = RLVRTrainer( + model="meta-textgeneration-llama-3-2-1b-instruct", + training_type=TrainingType.LORA, + model_package_group="sdk-test-finetuned-models", + mlflow_experiment_name="test-rlvr-finetuned-models-exp", + mlflow_run_name="test-rlvr-finetuned-models-run", + training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1", + s3_output_path="s3://mc-flows-sdk-testing/output/", + custom_reward_function="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlvr-test-rf/0.0.1", + accept_eula=True + ) + logger.info(f"RLVRTrainer created with custom_reward_function ARN") + + logger.info("Calling rlvr_trainer.train(wait=False)...") + training_job = rlvr_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop + max_wait_time = 3600 + poll_interval = 30 + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. 
Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_rlvr_trainer_with_custom_reward_function FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_rlvr_trainer_with_custom_reward_function - PASSED ===") def test_rlvr_trainer_nova_workflow(sagemaker_session): """Test RLVR training workflow with Nova model.""" - import os - os.environ['SAGEMAKER_REGION'] = 'us-east-1' - - # For fine-tuning - rlvr_trainer = RLVRTrainer( - model="nova-textgeneration-lite-v2", - model_package_group="sdk-test-finetuned-models", - mlflow_experiment_name="test-nova-rlvr-finetuned-models-exp", - mlflow_run_name="test-nova-rlvr-finetuned-models-run", - training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", - validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", - s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", - custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1", - accept_eula=True - ) - rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket' - - rlvr_trainer.hyperparameters.reward_lambda_arn = 'arn:aws:lambda:us-east-1:729646638167:function:rlvr-nova-reward-function' - - training_job = rlvr_trainer.train(wait=False) - - # Manual wait loop - max_wait_time = 3600 - poll_interval = 30 - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_rlvr_trainer_nova_workflow ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + import os + os.environ['SAGEMAKER_REGION'] = 'us-east-1' + logger.info("Set SAGEMAKER_REGION=us-east-1") + + # For fine-tuning + rlvr_trainer = RLVRTrainer( + model="nova-textgeneration-lite-v2", + model_package_group="sdk-test-finetuned-models", + mlflow_experiment_name="test-nova-rlvr-finetuned-models-exp", + mlflow_run_name="test-nova-rlvr-finetuned-models-run", + training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", + validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", + s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", + custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1", + accept_eula=True + ) + logger.info(f"RLVRTrainer (Nova) created: model={rlvr_trainer.model}") + + rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket' + rlvr_trainer.hyperparameters.reward_lambda_arn = 'arn:aws:lambda:us-east-1:729646638167:function:rlvr-nova-reward-function' + logger.info(f"Set hyperparameters: data_s3_path={rlvr_trainer.hyperparameters.data_s3_path}, 
reward_lambda_arn={rlvr_trainer.hyperparameters.reward_lambda_arn}") + + logger.info("Calling rlvr_trainer.train(wait=False)...") + training_job = rlvr_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop + max_wait_time = 3600 + poll_interval = 30 + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_rlvr_trainer_nova_workflow FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_rlvr_trainer_nova_workflow - PASSED ===") diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index 98dd154c3f..c42cdbad87 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -15,119 +15,183 @@ import os import time +import logging +import traceback import pytest import boto3 from sagemaker.core.helper.session_helper import Session from sagemaker.train.sft_trainer import SFTTrainer from sagemaker.train.common import TrainingType +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.DEBUG) + @pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_lora_complete_workflow(sagemaker_session): """Test complete SFT training workflow with LORA.""" - - sft_trainer = SFTTrainer( - model="meta-textgeneration-llama-3-2-1b-instruct", - training_type=TrainingType.LORA, - model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing/output/", - accept_eula=True - ) - - # Create training job - training_job = sft_trainer.train(wait=False) - - # Manual wait loop to avoid resource_config issue - max_wait_time = 3600 # 1 hour timeout - poll_interval = 30 # Check every 30 seconds - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_sft_trainer_lora_complete_workflow ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + sft_trainer 
= SFTTrainer( + model="meta-textgeneration-llama-3-2-1b-instruct", + training_type=TrainingType.LORA, + model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", + training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", + s3_output_path="s3://mc-flows-sdk-testing/output/", + accept_eula=True + ) + logger.info(f"SFTTrainer created: model={sft_trainer.model}, training_type={sft_trainer.training_type}") + + # Create training job + logger.info("Calling sft_trainer.train(wait=False)...") + training_job = sft_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop to avoid resource_config issue + max_wait_time = 3600 # 1 hour timeout + poll_interval = 30 # Check every 30 seconds + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_sft_trainer_lora_complete_workflow FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_sft_trainer_lora_complete_workflow - PASSED ===") @pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_with_validation_dataset(sagemaker_session): """Test SFT trainer with both training and validation datasets.""" + logger.info("=== START test_sft_trainer_with_validation_dataset ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + sft_trainer = SFTTrainer( + model="meta-textgeneration-llama-3-2-1b-instruct", + training_type=TrainingType.LORA, + model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", + training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", + validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", + accept_eula=True + ) + logger.info(f"SFTTrainer created with validation dataset") + + logger.info("Calling sft_trainer.train(wait=False)...") + training_job = sft_trainer.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop + max_wait_time = 3600 + poll_interval = 30 + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") - sft_trainer = SFTTrainer( - model="meta-textgeneration-llama-3-2-1b-instruct", - training_type=TrainingType.LORA, - 
model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models", - training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", - validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1", - accept_eula=True - ) - - training_job = sft_trainer.train(wait=False) - - # Manual wait loop - max_wait_time = 3600 - poll_interval = 30 - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + logger.info(f"output_model_package_arn: {getattr(training_job, 'output_model_package_arn', 'N/A')}") + except Exception as e: + logger.error(f"test_sft_trainer_with_validation_dataset FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_sft_trainer_with_validation_dataset - PASSED ===") # @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1") @pytest.mark.skip(reason="Skipping GPU resource intensive test") def test_sft_trainer_nova_workflow(sagemaker_session): """Test SFT trainer with Nova model.""" - import os - os.environ['SAGEMAKER_REGION'] = 'us-east-1' - - # For fine-tuning - sft_trainer_nova = SFTTrainer( - model="nova-textgeneration-lite-v2", - training_type=TrainingType.LORA, - model_package_group="sdk-test-finetuned-models", - mlflow_experiment_name="test-nova-finetuned-models-exp", - mlflow_run_name="test-nova-finetuned-models-run", - training_dataset="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/DataSet/sft-nova-test-dataset/0.0.1", - s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/" - ) - - # Create training job - training_job = sft_trainer_nova.train(wait=False) - - # Manual wait loop - max_wait_time = 3600 # 1 hour timeout - poll_interval = 30 # Check every 30 seconds - start_time = time.time() - - while time.time() - start_time < max_wait_time: - training_job.refresh() - status = training_job.training_job_status - - if status in ["Completed", "Failed", "Stopped"]: - break - - time.sleep(poll_interval) - - # Verify job completed successfully - assert training_job.training_job_status == "Completed" - assert hasattr(training_job, 'output_model_package_arn') - assert training_job.output_model_package_arn is not None + logger.info("=== START test_sft_trainer_nova_workflow ===") + logger.info(f"sagemaker_session region: {sagemaker_session.boto_region_name}") + + try: + import os + os.environ['SAGEMAKER_REGION'] = 'us-east-1' + logger.info("Set SAGEMAKER_REGION=us-east-1") + + # For fine-tuning + sft_trainer_nova = SFTTrainer( + model="nova-textgeneration-lite-v2", 
+ training_type=TrainingType.LORA, + model_package_group="sdk-test-finetuned-models", + mlflow_experiment_name="test-nova-finetuned-models-exp", + mlflow_run_name="test-nova-finetuned-models-run", + training_dataset="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/DataSet/sft-nova-test-dataset/0.0.1", + s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/" + ) + logger.info(f"SFTTrainer (Nova) created: model={sft_trainer_nova.model}") + + # Create training job + logger.info("Calling sft_trainer_nova.train(wait=False)...") + training_job = sft_trainer_nova.train(wait=False) + logger.info(f"Training job created: {training_job}") + + # Manual wait loop + max_wait_time = 3600 # 1 hour timeout + poll_interval = 30 # Check every 30 seconds + start_time = time.time() + + while time.time() - start_time < max_wait_time: + training_job.refresh() + status = training_job.training_job_status + elapsed = int(time.time() - start_time) + logger.info(f"[{elapsed}s] Training job status: {status}") + + if status in ["Completed", "Failed", "Stopped"]: + break + + time.sleep(poll_interval) + + logger.info(f"Final training job status: {training_job.training_job_status}") + if training_job.training_job_status == "Failed": + failure_reason = getattr(training_job, 'failure_reason', 'N/A') + logger.error(f"Training job FAILED. Failure reason: {failure_reason}") + + # Verify job completed successfully + assert training_job.training_job_status == "Completed" + assert hasattr(training_job, 'output_model_package_arn') + assert training_job.output_model_package_arn is not None + logger.info(f"output_model_package_arn: {training_job.output_model_package_arn}") + except Exception as e: + logger.error(f"test_sft_trainer_nova_workflow FAILED: {type(e).__name__}: {e}") + logger.error(traceback.format_exc()) + raise + logger.info("=== END test_sft_trainer_nova_workflow - PASSED ===")
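
A note on the trainer tests touched above (a sketch, not part of either patch): both commits copy the same status-polling loop into every DPO, RLAIF, RLVR, and SFT test. Assuming only what the tests themselves rely on (the job object's refresh() method, its training_job_status attribute, and an optional failure_reason attribute), the loop could live in one shared helper; the name wait_for_training_job and its placement in a shared test module are hypothetical:

    import time
    import logging

    logger = logging.getLogger(__name__)

    TERMINAL_STATUSES = ("Completed", "Failed", "Stopped")

    def wait_for_training_job(training_job, max_wait_time=3600, poll_interval=30):
        """Poll a training job until it reaches a terminal status or times out."""
        start_time = time.time()
        status = None
        while time.time() - start_time < max_wait_time:
            # refresh() re-reads the job state, as the tests above do
            training_job.refresh()
            status = training_job.training_job_status
            logger.info("[%ds] Training job status: %s", int(time.time() - start_time), status)
            if status in TERMINAL_STATUSES:
                break
            time.sleep(poll_interval)
        if status == "Failed":
            # failure_reason is not guaranteed to be set on every job object
            logger.error("Training job FAILED. Failure reason: %s",
                         getattr(training_job, "failure_reason", "N/A"))
        return status

Each test body would then reduce to creating its trainer, calling train(wait=False), and asserting wait_for_training_job(training_job) == "Completed".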