Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def _resolve_mlflow_resource_arn(sagemaker_session, mlflow_resource_arn: Optiona
return mlflow_resource_arn

try:

mlflow_apps = MlflowApp.get_all(
session=sagemaker_session.boto_session,
region=sagemaker_session.boto_session.region_name
Expand Down
10 changes: 10 additions & 0 deletions sagemaker-train/tests/integ/train/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,13 @@ def sagemaker_session():

if region_manual_set and "AWS_DEFAULT_REGION" in os.environ:
del os.environ["AWS_DEFAULT_REGION"]


NOVA_REGION = "us-east-1"


@pytest.fixture(scope="module")
def sagemaker_session_us_east_1():
    """Provide a module-scoped SageMaker ``Session`` bound to ``NOVA_REGION``.

    Intended for the Nova model tests, which pass this session to their
    trainers explicitly instead of relying on the default-region session.
    """
    # The region is pinned on the boto3 session; the SageMaker Session wraps it.
    return Session(boto_session=boto3.Session(region_name=NOVA_REGION))
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,18 @@
import pytest


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_dpo_trainer_lora_complete_workflow(sagemaker_session):
"""Test complete DPO training workflow with LORA."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
# Create DPOTrainer instance with comprehensive configuration
trainer = DPOTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"dpo-lora-integ-{unique_id}",
)

# Customize hyperparameters for quick training
Expand Down Expand Up @@ -61,18 +62,19 @@ def test_dpo_trainer_lora_complete_workflow(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_dpo_trainer_with_validation_dataset(sagemaker_session):
"""Test DPO trainer with both training and validation datasets."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

dpo_trainer = DPOTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/dpo-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
validation_dataset="s3://mc-flows-sdk-testing/input_data/dpo/preference_dataset_train_256.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"dpo-val-integ-{unique_id}",
)

# Customize hyperparameters for quick training
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,17 @@
from __future__ import absolute_import

import time
import random
import boto3
from sagemaker.core.helper.session_helper import Session
from sagemaker.train.rlaif_trainer import RLAIFTrainer
from sagemaker.train.common import TrainingType
import pytest


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
"""Test complete RLAIF training workflow with LORA."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlaif_trainer = RLAIFTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
Expand All @@ -33,9 +34,10 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
reward_prompt='Builtin.Summarize',
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
mlflow_run_name="test-rlaif-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"rlaif-lora-integ-{unique_id}",
)

# Create training job
Expand All @@ -61,9 +63,9 @@ def test_rlaif_trainer_lora_complete_workflow(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
"""Test RLAIF trainer with different reward model and prompt."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlaif_trainer = RLAIFTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
Expand All @@ -73,9 +75,10 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
reward_prompt="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlaif-test-prompt/0.0.1",
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
mlflow_run_name="test-rlaif-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"rlaif-rwd-integ-{unique_id}",
)

training_job = rlaif_trainer.train(wait=False)
Expand All @@ -100,9 +103,9 @@ def test_rlaif_trainer_with_custom_reward_settings(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlaif_trainer_continued_finetuning(sagemaker_session):
"""Test RLAIF continued fine-tuning starting from an existing model package."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlaif_trainer = RLAIFTrainer(
model="arn:aws:sagemaker:us-west-2:729646638167:model-package/sdk-test-finetuned-models/1",
Expand All @@ -112,9 +115,10 @@ def test_rlaif_trainer_continued_finetuning(sagemaker_session):
reward_prompt='Builtin.Summarize',
mlflow_experiment_name="test-rlaif-finetuned-models-exp",
mlflow_run_name="test-rlaif-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"rlaif-cont-integ-{unique_id}",
)

# Create training job
Expand Down
34 changes: 16 additions & 18 deletions sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,29 @@
"""Integration tests for RLVR trainer"""
from __future__ import absolute_import

import os
import time
import random
import pytest
import boto3
from sagemaker.core.helper.session_helper import Session
from sagemaker.train.rlvr_trainer import RLVRTrainer
from sagemaker.train.common import TrainingType


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlvr_trainer_lora_complete_workflow(sagemaker_session):
"""Test complete RLVR training workflow with LORA."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlvr_trainer = RLVRTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
mlflow_experiment_name="test-rlvr-finetuned-models-exp",
mlflow_run_name="test-rlvr-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"rlvr-lora-integ-{unique_id}",
)

# Create training job
Expand All @@ -60,20 +61,21 @@ def test_rlvr_trainer_lora_complete_workflow(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlvr_trainer_with_custom_reward_function(sagemaker_session):
"""Test RLVR trainer with custom reward function."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

rlvr_trainer = RLVRTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
mlflow_experiment_name="test-rlvr-finetuned-models-exp",
mlflow_run_name="test-rlvr-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/rlvr-rlaif-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/rlvr-rlaif-test-data/train_285.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
custom_reward_function="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/JsonDoc/rlvr-test-rf/0.0.1",
accept_eula=True
accept_eula=True,
base_job_name=f"rlvr-rf-integ-{unique_id}",
)

training_job = rlvr_trainer.train(wait=False)
Expand All @@ -98,14 +100,12 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session):
assert training_job.output_model_package_arn is not None


# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1")
@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_rlvr_trainer_nova_workflow(sagemaker_session):
@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1")
def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1):
"""Test RLVR training workflow with Nova model."""
import os
os.environ['SAGEMAKER_REGION'] = 'us-east-1'
# sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region)

# For fine-tuning
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
rlvr_trainer = RLVRTrainer(
model="nova-textgeneration-lite-v2",
model_package_group="sdk-test-finetuned-models",
Expand All @@ -115,12 +115,10 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session):
validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl",
s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/",
custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1",
accept_eula=True
accept_eula=True,
sagemaker_session=sagemaker_session_us_east_1,
base_job_name=f"rlvr-nova-integ-{unique_id}",
)
rlvr_trainer.hyperparameters.data_s3_path = 's3://example-bucket'

rlvr_trainer.hyperparameters.reward_lambda_arn = 'arn:aws:lambda:us-east-1:729646638167:function:rlvr-nova-reward-function'

training_job = rlvr_trainer.train(wait=False)

# Manual wait loop
Expand Down
34 changes: 18 additions & 16 deletions sagemaker-train/tests/integ/train/test_sft_trainer_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,27 @@
"""Integration tests for SFT trainer"""
from __future__ import absolute_import

import os
import time
import random
import pytest
import boto3
from sagemaker.core.helper.session_helper import Session
from sagemaker.train.sft_trainer import SFTTrainer
from sagemaker.train.common import TrainingType


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_sft_trainer_lora_complete_workflow(sagemaker_session):
"""Test complete SFT training workflow with LORA."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

sft_trainer = SFTTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1",
training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl",
s3_output_path="s3://mc-flows-sdk-testing/output/",
accept_eula=True
accept_eula=True,
base_job_name=f"sft-lora-integ-{unique_id}",
)

# Create training job
Expand All @@ -58,17 +59,18 @@ def test_sft_trainer_lora_complete_workflow(sagemaker_session):
assert training_job.output_model_package_arn is not None


@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_sft_trainer_with_validation_dataset(sagemaker_session):
"""Test SFT trainer with both training and validation datasets."""
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"

sft_trainer = SFTTrainer(
model="meta-textgeneration-llama-3-2-1b-instruct",
training_type=TrainingType.LORA,
model_package_group="arn:aws:sagemaker:us-west-2:729646638167:model-package-group/sdk-test-finetuned-models",
training_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1",
validation_dataset="arn:aws:sagemaker:us-west-2:729646638167:hub-content/sdktest/DataSet/sft-oss-test-data/0.0.1",
accept_eula=True
training_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl",
validation_dataset="s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl",
accept_eula=True,
base_job_name=f"sft-val-integ-{unique_id}",
)

training_job = sft_trainer.train(wait=False)
Expand All @@ -92,22 +94,22 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session):
assert hasattr(training_job, 'output_model_package_arn')


# @pytest.mark.skipif(os.environ.get('AWS_DEFAULT_REGION') != 'us-east-1', reason="Nova models only available in us-east-1")
@pytest.mark.skip(reason="Skipping GPU resource intensive test")
def test_sft_trainer_nova_workflow(sagemaker_session):
@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1")
def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1):
"""Test SFT trainer with Nova model."""
import os
os.environ['SAGEMAKER_REGION'] = 'us-east-1'
# sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region)

# For fine-tuning
unique_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
sft_trainer_nova = SFTTrainer(
model="nova-textgeneration-lite-v2",
training_type=TrainingType.LORA,
model_package_group="sdk-test-finetuned-models",
mlflow_experiment_name="test-nova-finetuned-models-exp",
mlflow_run_name="test-nova-finetuned-models-run",
training_dataset="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/DataSet/sft-nova-test-dataset/0.0.1",
s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/"
training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_200_samples.jsonl",
s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/",
sagemaker_session=sagemaker_session_us_east_1,
base_job_name=f"sft-nova-integ-{unique_id}",
)

# Create training job
Expand Down
Loading