From 2486fecf687c5ee7958887039b860b0fa08a7991 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 3 Jun 2026 15:27:04 -0700 Subject: [PATCH 1/3] Add supervised save_model/load_model API tests. Signed-off-by: Josh Romero --- tests/supervised/CMakeLists.txt | 1 + tests/supervised/scripts/setup_tests.py | 10 ++ tests/supervised/test_checkpoint.cpp | 171 ++++++++++++++++++++++++ 3 files changed, 182 insertions(+) diff --git a/tests/supervised/CMakeLists.txt b/tests/supervised/CMakeLists.txt index abfffd27..9c17855d 100644 --- a/tests/supervised/CMakeLists.txt +++ b/tests/supervised/CMakeLists.txt @@ -72,6 +72,7 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/missing_opt.yaml DESTINATION $ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/missing_loss.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_graphs.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_trainable.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_multiarg.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_multiarg_extra.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_multiarg_graphs.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs) diff --git a/tests/supervised/scripts/setup_tests.py b/tests/supervised/scripts/setup_tests.py index 693bca17..351edc41 100644 --- a/tests/supervised/scripts/setup_tests.py +++ b/tests/supervised/scripts/setup_tests.py @@ -29,6 +29,14 @@ def forward(self, input1, input2): x = self.layer(input1) return 
input1 + 0.0 * x, input2 + 0.0 * x +class TrainableNet(torch.nn.Module): + def __init__(self): + super(TrainableNet, self).__init__() + self.layer = torch.nn.Linear(10, 10) + + def forward(self, input1): + return self.layer(input1) + # Create loss functions with various argument combinations class Loss1(torch.nn.Module): @@ -56,12 +64,14 @@ def forward(self, prediction1, prediction2, label1, label2, extra_args1, extra_a def main(): model1 = Net1() model2 = Net2() + model_trainable = TrainableNet() loss1 = Loss1() loss2 = Loss2() loss2_extra = Loss2Extra() save_jit_module(model1, "model.pt") save_jit_module(model2, "model_multiarg.pt") + save_jit_module(model_trainable, "model_trainable.pt") save_jit_module(loss1, "loss.pt") save_jit_module(loss2, "loss_multiarg.pt") save_jit_module(loss2_extra, "loss_multiarg_extra.pt") diff --git a/tests/supervised/test_checkpoint.cpp b/tests/supervised/test_checkpoint.cpp index 4683a63d..fe98ed50 100644 --- a/tests/supervised/test_checkpoint.cpp +++ b/tests/supervised/test_checkpoint.cpp @@ -18,6 +18,11 @@ #include #endif +#include +#include +#include + +#include "internal/defines.h" #include "internal/utils.h" #include "torchfort.h" #include @@ -139,6 +144,138 @@ void checkpoint_save_restore(int first_device, int second_device) { TEST(TorchFort, CheckpointSaveRestoreCPUtoCPU) { checkpoint_save_restore(TORCHFORT_DEVICE_CPU, TORCHFORT_DEVICE_CPU); } +/* Builds a short device tag used in the temporary model file names: "cpu" when device equals TORCHFORT_DEVICE_CPU, otherwise "gpu" followed by the decimal device index. */std::string device_suffix(int device) { + return device == TORCHFORT_DEVICE_CPU ? 
"cpu" : "gpu" + std::to_string(device); +} + +/* Round-trip test for torchfort_save_model / torchfort_load_model using the configs/mlp.yaml model. Trains "mlp_src" on first_device for 20 steps, saves it to a file under /tmp, loads that file into a second model created on second_device, then checks that (a) the loaded model reproduces the source model's inference output (mean absolute difference within 1e-6) and (b) one further training step against a different label changes the output. */void load_model_mlp_save_restore(int first_device, int second_device) { + torch::manual_seed(666); + + const std::string model_file = + "/tmp/torchfort_load_model_mlp_" + device_suffix(first_device) + "_to_" + device_suffix(second_device) + ".pt"; + /* NOTE(review): presumably clears a leftover file from an earlier run so a stale model cannot be loaded by mistake - confirm */std::filesystem::remove(model_file); + + torch::Device cpu_dev = torchfort::get_device(TORCHFORT_DEVICE_CPU); + torch::Device first_dev = torchfort::get_device(first_device); + torch::Device second_dev = torchfort::get_device(second_device); + std::vector input_shape{4, 32}; + std::vector label_shape{4, 1}; + auto source_opts = torch::TensorOptions().device(first_dev).dtype(torch::kFloat32); + auto restore_opts = torch::TensorOptions().device(second_dev).dtype(torch::kFloat32); + auto input = torch::ones(input_shape, source_opts); + auto label_source = torch::zeros(label_shape, source_opts); + auto output = torch::empty(label_shape, source_opts); + float loss; + + CHECK_TORCHFORT(torchfort_create_model("mlp_src", "configs/mlp.yaml", first_device)); + + for (int i = 0; i < 20; ++i) { + CHECK_TORCHFORT(torchfort_train("mlp_src", input.data_ptr(), input_shape.size(), input_shape.data(), + label_source.data_ptr(), label_shape.size(), label_shape.data(), &loss, + TORCHFORT_FLOAT, 0)); + } + + CHECK_TORCHFORT(torchfort_inference("mlp_src", input.data_ptr(), input_shape.size(), input_shape.data(), + output.data_ptr(), label_shape.size(), label_shape.data(), TORCHFORT_FLOAT, + 0)); + /* reference output of the trained source model, copied to the CPU for comparison */auto output_source = output.clone().to(cpu_dev); + + CHECK_TORCHFORT(torchfort_save_model("mlp_src", model_file.c_str())); + + CHECK_TORCHFORT(torchfort_create_model("mlp_restore_load_model", "configs/mlp.yaml", second_device)); + CHECK_TORCHFORT(torchfort_load_model("mlp_restore_load_model", model_file.c_str())); + + input = input.to(second_dev); + /* label value 2.0 differs from the zeros used for the source model; the training step further below is expected to change the output (checked by EXPECT_GT) */auto label_restore = torch::full(label_shape, 2.0f, restore_opts); + output = torch::empty(label_shape, restore_opts); + + 
CHECK_TORCHFORT(torchfort_inference("mlp_restore_load_model", input.data_ptr(), input_shape.size(), + input_shape.data(), output.data_ptr(), label_shape.size(), + label_shape.data(), TORCHFORT_FLOAT, 0)); + auto output_loaded = output.clone().to(cpu_dev); + + /* the loaded model must reproduce the source model's inference output */float mean_diff = torch::mean(torch::abs(output_loaded - output_source)).item(); + EXPECT_NEAR(mean_diff, 0.0f, 1e-6f); + + CHECK_TORCHFORT(torchfort_train("mlp_restore_load_model", input.data_ptr(), input_shape.size(), + input_shape.data(), label_restore.data_ptr(), label_shape.size(), + label_shape.data(), &loss, TORCHFORT_FLOAT, 0)); + + CHECK_TORCHFORT(torchfort_inference("mlp_restore_load_model", input.data_ptr(), input_shape.size(), + input_shape.data(), output.data_ptr(), label_shape.size(), + label_shape.data(), TORCHFORT_FLOAT, 0)); + + /* after one more training step the output must differ from the freshly loaded output */mean_diff = torch::mean(torch::abs(output.clone().to(cpu_dev) - output_loaded)).item(); + EXPECT_GT(mean_diff, 1e-6f); + + std::filesystem::remove(model_file); +} + +TEST(TorchFort, LoadModelMLPCPUtoCPU) { load_model_mlp_save_restore(TORCHFORT_DEVICE_CPU, TORCHFORT_DEVICE_CPU); } + +/* Same save_model / load_model round-trip as load_model_mlp_save_restore, but for the model described by configs/torchscript_trainable.yaml and with a single shape {4, 2, 10} used for input, label and output. Trains "torchscript_src" on first_device for 20 steps, saves it, loads it into "torchscript_restore" on second_device, and checks that the loaded output matches within 1e-6 and that one more training step changes it. */void load_model_torchscript_save_restore(int first_device, int second_device) { + torch::manual_seed(666); + + const std::string model_file = "/tmp/torchfort_load_model_torchscript_" + device_suffix(first_device) + "_to_" + + device_suffix(second_device) + ".pt"; + /* NOTE(review): presumably clears a leftover file from an earlier run - confirm */std::filesystem::remove(model_file); + + torch::Device cpu_dev = torchfort::get_device(TORCHFORT_DEVICE_CPU); + torch::Device first_dev = torchfort::get_device(first_device); + torch::Device second_dev = torchfort::get_device(second_device); + std::vector shape{4, 2, 10}; + auto source_opts = torch::TensorOptions().device(first_dev).dtype(torch::kFloat32); + auto restore_opts = torch::TensorOptions().device(second_dev).dtype(torch::kFloat32); + auto input = torch::ones(shape, source_opts); + auto label_source = torch::zeros(shape, source_opts); + auto output = torch::empty(shape, source_opts); + float loss; + + 
CHECK_TORCHFORT(torchfort_create_model("torchscript_src", "configs/torchscript_trainable.yaml", first_device)); + + for (int i = 0; i < 20; ++i) { + CHECK_TORCHFORT(torchfort_train("torchscript_src", input.data_ptr(), shape.size(), shape.data(), + label_source.data_ptr(), shape.size(), shape.data(), &loss, TORCHFORT_FLOAT, + 0)); + } + + CHECK_TORCHFORT(torchfort_inference("torchscript_src", input.data_ptr(), shape.size(), shape.data(), + output.data_ptr(), shape.size(), shape.data(), TORCHFORT_FLOAT, 0)); + /* reference output of the trained source model, copied to the CPU for comparison */auto output_source = output.clone().to(cpu_dev); + + CHECK_TORCHFORT(torchfort_save_model("torchscript_src", model_file.c_str())); + + CHECK_TORCHFORT(torchfort_create_model("torchscript_restore", "configs/torchscript_trainable.yaml", second_device)); + CHECK_TORCHFORT(torchfort_load_model("torchscript_restore", model_file.c_str())); + + input = input.to(second_dev); + /* label value 2.0 differs from the zeros used for the source model; the training step further below is expected to change the output (checked by EXPECT_GT) */auto label_restore = torch::full(shape, 2.0f, restore_opts); + output = torch::empty(shape, restore_opts); + + CHECK_TORCHFORT(torchfort_inference("torchscript_restore", input.data_ptr(), shape.size(), shape.data(), + output.data_ptr(), shape.size(), shape.data(), TORCHFORT_FLOAT, 0)); + auto output_loaded = output.clone().to(cpu_dev); + + /* the loaded model must reproduce the source model's inference output */float mean_diff = torch::mean(torch::abs(output_loaded - output_source)).item(); + EXPECT_NEAR(mean_diff, 0.0f, 1e-6f); + + CHECK_TORCHFORT(torchfort_train("torchscript_restore", input.data_ptr(), shape.size(), shape.data(), + label_restore.data_ptr(), shape.size(), shape.data(), &loss, TORCHFORT_FLOAT, + 0)); + + CHECK_TORCHFORT(torchfort_inference("torchscript_restore", input.data_ptr(), shape.size(), shape.data(), + output.data_ptr(), shape.size(), shape.data(), TORCHFORT_FLOAT, 0)); + + /* after one more training step the output must differ from the freshly loaded output */mean_diff = torch::mean(torch::abs(output.clone().to(cpu_dev) - output_loaded)).item(); + EXPECT_GT(mean_diff, 1e-6f); + + std::filesystem::remove(model_file); +} + +TEST(TorchFort, LoadModelTorchScriptCPUtoCPU) { + load_model_torchscript_save_restore(TORCHFORT_DEVICE_CPU, 
TORCHFORT_DEVICE_CPU); +} + #ifdef ENABLE_GPU TEST(TorchFort, CheckpointSaveRestoreGPUtoGPU) { checkpoint_save_restore(0, 0); } @@ -154,6 +291,40 @@ TEST(TorchFort, CheckpointSaveRestoreGPU0toGPU1) { } checkpoint_save_restore(0, 1); } + +TEST(TorchFort, LoadModelMLPGPUtoGPU) { load_model_mlp_save_restore(0, 0); } + +TEST(TorchFort, LoadModelMLPCPUtoGPU) { load_model_mlp_save_restore(TORCHFORT_DEVICE_CPU, 0); } + +TEST(TorchFort, LoadModelMLPGPUtoCPU) { load_model_mlp_save_restore(0, TORCHFORT_DEVICE_CPU); } + +TEST(TorchFort, LoadModelMLPGPU0toGPU1) { + int ngpu; + /* NOTE(review): the return status is ignored and ngpu is uninitialized before this call, so the skip check below relies on cudaGetDeviceCount writing ngpu */cudaGetDeviceCount(&ngpu); + if (ngpu < 2) { + GTEST_SKIP() << "This test requires at least 2 GPUs. Skipping."; + } + load_model_mlp_save_restore(0, 1); +} + +TEST(TorchFort, LoadModelTorchScriptGPUtoGPU) { load_model_torchscript_save_restore(0, 0); } + +TEST(TorchFort, LoadModelTorchScriptCPUtoGPU) { + load_model_torchscript_save_restore(TORCHFORT_DEVICE_CPU, 0); +} + +TEST(TorchFort, LoadModelTorchScriptGPUtoCPU) { + load_model_torchscript_save_restore(0, TORCHFORT_DEVICE_CPU); +} + +TEST(TorchFort, LoadModelTorchScriptGPU0toGPU1) { + int ngpu; + /* NOTE(review): the return status is ignored and ngpu is uninitialized before this call, so the skip check below relies on cudaGetDeviceCount writing ngpu */cudaGetDeviceCount(&ngpu); + if (ngpu < 2) { + GTEST_SKIP() << "This test requires at least 2 GPUs. Skipping."; + } + load_model_torchscript_save_restore(0, 1); +} #endif int main(int argc, char* argv[]) { From 0a765b2bc808ada114f974409b55a9ab60489b1c Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 3 Jun 2026 15:35:26 -0700 Subject: [PATCH 2/3] Formatting. 
Signed-off-by: Josh Romero --- tests/supervised/test_checkpoint.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/supervised/test_checkpoint.cpp b/tests/supervised/test_checkpoint.cpp index fe98ed50..ac03aba6 100644 --- a/tests/supervised/test_checkpoint.cpp +++ b/tests/supervised/test_checkpoint.cpp @@ -309,13 +309,9 @@ TEST(TorchFort, LoadModelMLPGPU0toGPU1) { TEST(TorchFort, LoadModelTorchScriptGPUtoGPU) { load_model_torchscript_save_restore(0, 0); } -TEST(TorchFort, LoadModelTorchScriptCPUtoGPU) { - load_model_torchscript_save_restore(TORCHFORT_DEVICE_CPU, 0); -} +TEST(TorchFort, LoadModelTorchScriptCPUtoGPU) { load_model_torchscript_save_restore(TORCHFORT_DEVICE_CPU, 0); } -TEST(TorchFort, LoadModelTorchScriptGPUtoCPU) { - load_model_torchscript_save_restore(0, TORCHFORT_DEVICE_CPU); -} +TEST(TorchFort, LoadModelTorchScriptGPUtoCPU) { load_model_torchscript_save_restore(0, TORCHFORT_DEVICE_CPU); } TEST(TorchFort, LoadModelTorchScriptGPU0toGPU1) { int ngpu; From 022b719280b899f42f570c53e09bf261805702c3 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 3 Jun 2026 15:50:33 -0700 Subject: [PATCH 3/3] Add missing file. Signed-off-by: Josh Romero --- tests/supervised/configs/torchscript_trainable.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 tests/supervised/configs/torchscript_trainable.yaml diff --git a/tests/supervised/configs/torchscript_trainable.yaml b/tests/supervised/configs/torchscript_trainable.yaml new file mode 100644 index 00000000..14f21e70 --- /dev/null +++ b/tests/supervised/configs/torchscript_trainable.yaml @@ -0,0 +1,12 @@ +model: + type: torchscript + parameters: + filename: "model_trainable.pt" + +loss: + type: MSE + +optimizer: + type: adam + parameters: + learning_rate: 1e-2