Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/supervised/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/missing_opt.yaml DESTINATION $
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/missing_loss.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_graphs.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_trainable.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_multiarg.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_multiarg_extra.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs)
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/configs/torchscript_multiarg_graphs.yaml DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/tests/supervised/configs)
Expand Down
12 changes: 12 additions & 0 deletions tests/supervised/configs/torchscript_trainable.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
model:
type: torchscript
parameters:
filename: "model_trainable.pt"

loss:
type: MSE

optimizer:
type: adam
parameters:
learning_rate: 1e-2
10 changes: 10 additions & 0 deletions tests/supervised/scripts/setup_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ def forward(self, input1, input2):
x = self.layer(input1)
return input1 + 0.0 * x, input2 + 0.0 * x

class TrainableNet(torch.nn.Module):
def __init__(self):
super(TrainableNet, self).__init__()
self.layer = torch.nn.Linear(10, 10)

def forward(self, input1):
return self.layer(input1)


# Create loss functions with various argument combinations
class Loss1(torch.nn.Module):
Expand Down Expand Up @@ -56,12 +64,14 @@ def forward(self, prediction1, prediction2, label1, label2, extra_args1, extra_a
def main():
model1 = Net1()
model2 = Net2()
model_trainable = TrainableNet()
loss1 = Loss1()
loss2 = Loss2()
loss2_extra = Loss2Extra()

save_jit_module(model1, "model.pt")
save_jit_module(model2, "model_multiarg.pt")
save_jit_module(model_trainable, "model_trainable.pt")
save_jit_module(loss1, "loss.pt")
save_jit_module(loss2, "loss_multiarg.pt")
save_jit_module(loss2_extra, "loss_multiarg_extra.pt")
Expand Down
167 changes: 167 additions & 0 deletions tests/supervised/test_checkpoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
#include <cuda_runtime.h>
#endif

#include <filesystem>
#include <string>
#include <vector>

#include "internal/defines.h"
#include "internal/utils.h"
#include "torchfort.h"
#include <gtest/gtest.h>
Expand Down Expand Up @@ -139,6 +144,138 @@ void checkpoint_save_restore(int first_device, int second_device) {

TEST(TorchFort, CheckpointSaveRestoreCPUtoCPU) { checkpoint_save_restore(TORCHFORT_DEVICE_CPU, TORCHFORT_DEVICE_CPU); }

std::string device_suffix(int device) {
return device == TORCHFORT_DEVICE_CPU ? "cpu" : "gpu" + std::to_string(device);
}

void load_model_mlp_save_restore(int first_device, int second_device) {
torch::manual_seed(666);

const std::string model_file =
"/tmp/torchfort_load_model_mlp_" + device_suffix(first_device) + "_to_" + device_suffix(second_device) + ".pt";
std::filesystem::remove(model_file);

torch::Device cpu_dev = torchfort::get_device(TORCHFORT_DEVICE_CPU);
torch::Device first_dev = torchfort::get_device(first_device);
torch::Device second_dev = torchfort::get_device(second_device);
std::vector<int64_t> input_shape{4, 32};
std::vector<int64_t> label_shape{4, 1};
auto source_opts = torch::TensorOptions().device(first_dev).dtype(torch::kFloat32);
auto restore_opts = torch::TensorOptions().device(second_dev).dtype(torch::kFloat32);
auto input = torch::ones(input_shape, source_opts);
auto label_source = torch::zeros(label_shape, source_opts);
auto output = torch::empty(label_shape, source_opts);
float loss;

CHECK_TORCHFORT(torchfort_create_model("mlp_src", "configs/mlp.yaml", first_device));

for (int i = 0; i < 20; ++i) {
CHECK_TORCHFORT(torchfort_train("mlp_src", input.data_ptr<float>(), input_shape.size(), input_shape.data(),
label_source.data_ptr<float>(), label_shape.size(), label_shape.data(), &loss,
TORCHFORT_FLOAT, 0));
}

CHECK_TORCHFORT(torchfort_inference("mlp_src", input.data_ptr<float>(), input_shape.size(), input_shape.data(),
output.data_ptr<float>(), label_shape.size(), label_shape.data(), TORCHFORT_FLOAT,
0));
auto output_source = output.clone().to(cpu_dev);

CHECK_TORCHFORT(torchfort_save_model("mlp_src", model_file.c_str()));

CHECK_TORCHFORT(torchfort_create_model("mlp_restore_load_model", "configs/mlp.yaml", second_device));
CHECK_TORCHFORT(torchfort_load_model("mlp_restore_load_model", model_file.c_str()));

input = input.to(second_dev);
auto label_restore = torch::full(label_shape, 2.0f, restore_opts);
output = torch::empty(label_shape, restore_opts);

CHECK_TORCHFORT(torchfort_inference("mlp_restore_load_model", input.data_ptr<float>(), input_shape.size(),
input_shape.data(), output.data_ptr<float>(), label_shape.size(),
label_shape.data(), TORCHFORT_FLOAT, 0));
auto output_loaded = output.clone().to(cpu_dev);

float mean_diff = torch::mean(torch::abs(output_loaded - output_source)).item<float>();
EXPECT_NEAR(mean_diff, 0.0f, 1e-6f);

CHECK_TORCHFORT(torchfort_train("mlp_restore_load_model", input.data_ptr<float>(), input_shape.size(),
input_shape.data(), label_restore.data_ptr<float>(), label_shape.size(),
label_shape.data(), &loss, TORCHFORT_FLOAT, 0));

CHECK_TORCHFORT(torchfort_inference("mlp_restore_load_model", input.data_ptr<float>(), input_shape.size(),
input_shape.data(), output.data_ptr<float>(), label_shape.size(),
label_shape.data(), TORCHFORT_FLOAT, 0));

mean_diff = torch::mean(torch::abs(output.clone().to(cpu_dev) - output_loaded)).item<float>();
EXPECT_GT(mean_diff, 1e-6f);

std::filesystem::remove(model_file);
}

TEST(TorchFort, LoadModelMLPCPUtoCPU) { load_model_mlp_save_restore(TORCHFORT_DEVICE_CPU, TORCHFORT_DEVICE_CPU); }

void load_model_torchscript_save_restore(int first_device, int second_device) {
torch::manual_seed(666);

const std::string model_file = "/tmp/torchfort_load_model_torchscript_" + device_suffix(first_device) + "_to_" +
device_suffix(second_device) + ".pt";
std::filesystem::remove(model_file);

torch::Device cpu_dev = torchfort::get_device(TORCHFORT_DEVICE_CPU);
torch::Device first_dev = torchfort::get_device(first_device);
torch::Device second_dev = torchfort::get_device(second_device);
std::vector<int64_t> shape{4, 2, 10};
auto source_opts = torch::TensorOptions().device(first_dev).dtype(torch::kFloat32);
auto restore_opts = torch::TensorOptions().device(second_dev).dtype(torch::kFloat32);
auto input = torch::ones(shape, source_opts);
auto label_source = torch::zeros(shape, source_opts);
auto output = torch::empty(shape, source_opts);
float loss;

CHECK_TORCHFORT(torchfort_create_model("torchscript_src", "configs/torchscript_trainable.yaml", first_device));

for (int i = 0; i < 20; ++i) {
CHECK_TORCHFORT(torchfort_train("torchscript_src", input.data_ptr<float>(), shape.size(), shape.data(),
label_source.data_ptr<float>(), shape.size(), shape.data(), &loss, TORCHFORT_FLOAT,
0));
}

CHECK_TORCHFORT(torchfort_inference("torchscript_src", input.data_ptr<float>(), shape.size(), shape.data(),
output.data_ptr<float>(), shape.size(), shape.data(), TORCHFORT_FLOAT, 0));
auto output_source = output.clone().to(cpu_dev);

CHECK_TORCHFORT(torchfort_save_model("torchscript_src", model_file.c_str()));

CHECK_TORCHFORT(torchfort_create_model("torchscript_restore", "configs/torchscript_trainable.yaml", second_device));
CHECK_TORCHFORT(torchfort_load_model("torchscript_restore", model_file.c_str()));

input = input.to(second_dev);
auto label_restore = torch::full(shape, 2.0f, restore_opts);
output = torch::empty(shape, restore_opts);

CHECK_TORCHFORT(torchfort_inference("torchscript_restore", input.data_ptr<float>(), shape.size(), shape.data(),
output.data_ptr<float>(), shape.size(), shape.data(), TORCHFORT_FLOAT, 0));
auto output_loaded = output.clone().to(cpu_dev);

float mean_diff = torch::mean(torch::abs(output_loaded - output_source)).item<float>();
EXPECT_NEAR(mean_diff, 0.0f, 1e-6f);

CHECK_TORCHFORT(torchfort_train("torchscript_restore", input.data_ptr<float>(), shape.size(), shape.data(),
label_restore.data_ptr<float>(), shape.size(), shape.data(), &loss, TORCHFORT_FLOAT,
0));

CHECK_TORCHFORT(torchfort_inference("torchscript_restore", input.data_ptr<float>(), shape.size(), shape.data(),
output.data_ptr<float>(), shape.size(), shape.data(), TORCHFORT_FLOAT, 0));

mean_diff = torch::mean(torch::abs(output.clone().to(cpu_dev) - output_loaded)).item<float>();
EXPECT_GT(mean_diff, 1e-6f);

std::filesystem::remove(model_file);
}

TEST(TorchFort, LoadModelTorchScriptCPUtoCPU) {
load_model_torchscript_save_restore(TORCHFORT_DEVICE_CPU, TORCHFORT_DEVICE_CPU);
}

#ifdef ENABLE_GPU
TEST(TorchFort, CheckpointSaveRestoreGPUtoGPU) { checkpoint_save_restore(0, 0); }

Expand All @@ -154,6 +291,36 @@ TEST(TorchFort, CheckpointSaveRestoreGPU0toGPU1) {
}
checkpoint_save_restore(0, 1);
}

TEST(TorchFort, LoadModelMLPGPUtoGPU) { load_model_mlp_save_restore(0, 0); }

TEST(TorchFort, LoadModelMLPCPUtoGPU) { load_model_mlp_save_restore(TORCHFORT_DEVICE_CPU, 0); }

TEST(TorchFort, LoadModelMLPGPUtoCPU) { load_model_mlp_save_restore(0, TORCHFORT_DEVICE_CPU); }

TEST(TorchFort, LoadModelMLPGPU0toGPU1) {
int ngpu;
cudaGetDeviceCount(&ngpu);
if (ngpu < 2) {
GTEST_SKIP() << "This test requires at least 2 GPUs. Skipping.";
}
load_model_mlp_save_restore(0, 1);
}

TEST(TorchFort, LoadModelTorchScriptGPUtoGPU) { load_model_torchscript_save_restore(0, 0); }

TEST(TorchFort, LoadModelTorchScriptCPUtoGPU) { load_model_torchscript_save_restore(TORCHFORT_DEVICE_CPU, 0); }

TEST(TorchFort, LoadModelTorchScriptGPUtoCPU) { load_model_torchscript_save_restore(0, TORCHFORT_DEVICE_CPU); }

TEST(TorchFort, LoadModelTorchScriptGPU0toGPU1) {
int ngpu;
cudaGetDeviceCount(&ngpu);
if (ngpu < 2) {
GTEST_SKIP() << "This test requires at least 2 GPUs. Skipping.";
}
load_model_torchscript_save_restore(0, 1);
}
#endif

int main(int argc, char* argv[]) {
Expand Down
Loading