diff --git a/src/backend_model.cc b/src/backend_model.cc index cefce6b59..c3b0fc2dc 100644 --- a/src/backend_model.cc +++ b/src/backend_model.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -825,10 +825,14 @@ TritonModel::SetConfiguredScheduler( for (const auto& input : config_.input()) { if (input.is_shape_tensor()) { enforce_equal_shape_tensors.insert({input.name(), true}); - } else if ( - !input.allow_ragged_batch() && - (triton::common::GetElementCount(input) == -1)) { - enforce_equal_shape_tensors.insert({input.name(), false}); + } else { + int64_t element_count = 0; + RETURN_IF_ERROR( + GetElementCount(input.dims(), input.name(), &element_count)); + if (!input.allow_ragged_batch() && + (element_count == triton::common::WILDCARD_SIZE)) { + enforce_equal_shape_tensors.insert({input.name(), false}); + } } } diff --git a/src/backend_model_instance.cc b/src/backend_model_instance.cc index b5f595c87..fd8d5eb52 100644 --- a/src/backend_model_instance.cc +++ b/src/backend_model_instance.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -372,9 +372,11 @@ TritonModelInstance::GenerateWarmupData() int64_t max_zero_byte_size = 0; int64_t max_random_byte_size = 0; for (const auto& input_meta : warmup_setting.inputs()) { - auto element_count = - triton::common::GetElementCount(input_meta.second.dims()); - if (element_count == -1) { + int64_t batch_byte_size = 0; + RETURN_IF_ERROR(GetByteSize( + input_meta.second.data_type(), input_meta.second.dims(), + input_meta.first, &batch_byte_size)); + if (batch_byte_size == triton::common::WILDCARD_SIZE) { return Status( Status::Code::INVALID_ARG, "warmup setting expects all variable-size dimensions are specified " @@ -382,13 +384,6 @@ TritonModelInstance::GenerateWarmupData() input_meta.first + "'"); } - int64_t batch_byte_size = - element_count * - triton::common::GetDataTypeByteSize(input_meta.second.data_type()); - if (batch_byte_size == 0) { - batch_byte_size = element_count * sizeof(int32_t); - } - switch (input_meta.second.input_data_type_case()) { case inference::ModelWarmup_Input::InputDataTypeCase::kZeroData: max_zero_byte_size = std::max(batch_byte_size, max_zero_byte_size); @@ -443,14 +438,11 @@ TritonModelInstance::GenerateWarmupData() // Second pass to prepare original inputs. std::vector> input_sps; for (const auto& input_meta : warmup_setting.inputs()) { - auto batch1_element_count = - triton::common::GetElementCount(input_meta.second.dims()); - auto batch_byte_size = - batch1_element_count * - triton::common::GetDataTypeByteSize(input_meta.second.data_type()); - if (batch_byte_size == 0) { - batch_byte_size = batch1_element_count * sizeof(int32_t); - } + int64_t batch_byte_size_signed = 0; + RETURN_IF_ERROR(GetByteSize( + input_meta.second.data_type(), input_meta.second.dims(), + input_meta.first, &batch_byte_size_signed)); + size_t batch_byte_size = static_cast(batch_byte_size_signed); const char* allocated_ptr; switch (input_meta.second.input_data_type_case()) { @@ -476,10 +468,11 @@ TritonModelInstance::GenerateWarmupData() {model_->LocalizedModelPath(), kWarmupDataFolder, input_meta.second.input_data_file()}), input_data)); + if (input_meta.second.data_type() == inference::DataType::TYPE_STRING) { batch_byte_size = input_data->size(); - } else if (((size_t)batch_byte_size) > input_data->size()) { + } else if (batch_byte_size > input_data->size()) { return Status( Status::Code::INVALID_ARG, lrequest->LogRequest() + "warmup setting expects " + diff --git a/src/infer_request.cc b/src/infer_request.cc index 41074effc..bdcc8e031 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -515,9 +515,15 @@ InferenceRequest::Release( return Status::Success; } -InferenceRequest* -InferenceRequest::CopyAsNull(const InferenceRequest& from) +Status +InferenceRequest::CopyAsNull( + const InferenceRequest& from, std::unique_ptr* to) { + if (to == nullptr) { + return Status( + Status::Code::INVALID_ARG, "InferenceRequest 'to' must not be null"); + } + // Create a copy of 'from' request with artificial inputs and no requested // outputs. Maybe more efficient to share inputs and other metadata, // but that binds the Null request with 'from' request's lifecycle. @@ -587,10 +593,11 @@ InferenceRequest::CopyAsNull(const InferenceRequest& from) } if (input.second.DType() == inference::DataType::TYPE_STRING) { - int64_t element_count = - triton::common::GetElementCount(input.second.Shape()); - - size_t str_byte_size = static_cast(4 * element_count); + int64_t str_byte_size_signed = 0; + RETURN_IF_ERROR(GetByteSize( + inference::DataType::TYPE_STRING, input.second.Shape(), input.first, + &str_byte_size_signed)); + size_t str_byte_size = static_cast(str_byte_size_signed); max_str_byte_size = std::max(str_byte_size, max_str_byte_size); if (str_byte_size > max_byte_size) { max_byte_size = str_byte_size; @@ -638,11 +645,12 @@ InferenceRequest::CopyAsNull(const InferenceRequest& from) if (input.first == *max_input_name) { new_input->SetData(data); } else { - if (inference::DataType::TYPE_STRING == input.second.DType()) { - new_input->AppendData( - data_base, - triton::common::GetElementCount(input.second.Shape()) * 4, mem_type, - mem_id); + if (input.second.DType() == inference::DataType::TYPE_STRING) { + int64_t str_byte_size = 0; + RETURN_IF_ERROR(GetByteSize( + inference::DataType::TYPE_STRING, input.second.Shape(), input.first, + &str_byte_size)); + new_input->AppendData(data_base, str_byte_size, mem_type, mem_id); } else { new_input->AppendData( data_base, input.second.Data()->TotalByteSize(), mem_type, mem_id); @@ -662,7 +670,8 @@ InferenceRequest::CopyAsNull(const InferenceRequest& from) std::make_pair(pr.second.Name(), std::addressof(pr.second))); } - return lrequest.release(); + *to = std::move(lrequest); + return Status::Success; } Status @@ -844,8 +853,8 @@ InferenceRequest::LoadInputStates() // Add the input states to the inference request. if (sequence_states_ != nullptr) { if (sequence_states_->IsNullRequest()) { - sequence_states_ = - SequenceStates::CopyAsNull(sequence_states_->NullSequenceStates()); + RETURN_IF_ERROR(SequenceStates::CopyAsNull( + sequence_states_->NullSequenceStates(), &sequence_states_)); } for (auto& input_state_pair : sequence_states_->InputStates()) { auto& input_state = input_state_pair.second; @@ -1173,14 +1182,14 @@ InferenceRequest::Normalize() if (input_config->has_reshape()) { std::deque variable_size_values; for (int64_t idx = 0; idx < input_config->dims_size(); idx++) { - if (input_config->dims(idx) == -1) { + if (input_config->dims(idx) == triton::common::WILDCARD_DIM) { variable_size_values.push_back((*shape)[idx]); } } shape->clear(); for (const auto& dim : input_config->reshape().shape()) { - if (dim == -1) { + if (dim == triton::common::WILDCARD_DIM) { shape->push_back(variable_size_values.front()); variable_size_values.pop_front(); } else { @@ -1219,8 +1228,9 @@ InferenceRequest::Normalize() const std::vector& input_dims = input.IsShapeTensor() ? input.OriginalShape() : input.ShapeWithBatchDim(); - int64_t expected_byte_size = - triton::common::GetByteSize(data_type, input_dims); + int64_t expected_byte_size = 0; + RETURN_IF_ERROR(GetByteSize( + data_type, input_dims, input_name, &expected_byte_size)); const size_t& byte_size = input.Data()->TotalByteSize(); if ((byte_size > LLONG_MAX) || (static_cast(byte_size) != expected_byte_size)) { @@ -1311,7 +1321,7 @@ InferenceRequest::ValidateBytesInputs( { const auto& input_dims = input.ShapeWithBatchDim(); - int64_t element_count = triton::common::GetElementCount(input_dims); + int64_t element_count = 0; int64_t element_checked = 0; size_t remaining_element_size = 0; @@ -1322,6 +1332,8 @@ InferenceRequest::ValidateBytesInputs( size_t remaining_buffer_size = 0; int64_t buffer_memory_id; + RETURN_IF_ERROR(GetElementCount(input_dims, input_name, &element_count)); + // Validate elements until all buffers have been fully processed. while (remaining_buffer_size || buffer_next_idx < buffer_count) { // Get the next buffer if not currently processing one. diff --git a/src/infer_request.h b/src/infer_request.h index 1c7e83d6d..02ab5a4f0 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -1,4 +1,4 @@ -// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -632,7 +632,8 @@ class InferenceRequest { // required for the direct sequence batcher. The returned copy will // contain only the minimum content required for a null request. // The statistics of the copy will not be collected. - static InferenceRequest* CopyAsNull(const InferenceRequest& from); + static Status CopyAsNull( + const InferenceRequest& from, std::unique_ptr* to); uint64_t QueueStartNs() const { return queue_start_ns_; } uint64_t CaptureQueueStartNs() diff --git a/src/model_config_utils.cc b/src/model_config_utils.cc index 79f0c53aa..8920c589e 100644 --- a/src/model_config_utils.cc +++ b/src/model_config_utils.cc @@ -353,9 +353,12 @@ ValidateIOShape( } } - const int64_t dims_size = triton::common::GetElementCount(io.dims()); - const int64_t reshape_size = - triton::common::GetElementCount(io.reshape().shape()); + int64_t dims_size = 0; + int64_t reshape_size = 0; + RETURN_IF_ERROR( + GetElementCount(io.dims(), io.name() + " dims", &dims_size)); + RETURN_IF_ERROR(GetElementCount( + io.reshape().shape(), io.name() + " reshape", &reshape_size)); // dims and reshape must both have same element count // or both have variable-size dimension. @@ -372,12 +375,12 @@ ValidateIOShape( // each pair of the trunks separated by variable-size dimension has // the same element count. For instance, from [2, 4, -1, 6] to [8, -1, 1, 6] // is valid reshape as 2 * 4 = 8 and 6 = 1 * 6. - if (dims_size == -1) { + if (dims_size == triton::common::WILDCARD_SIZE) { std::vector dim_element_cnts; std::vector reshape_element_cnts; int64_t current_cnt = 1; for (const auto& dim : io.dims()) { - if (dim != -1) { + if (dim != triton::common::WILDCARD_DIM) { current_cnt *= dim; } else { dim_element_cnts.push_back(current_cnt); @@ -388,7 +391,7 @@ ValidateIOShape( current_cnt = 1; for (const auto& dim : io.reshape().shape()) { - if (dim != -1) { + if (dim != triton::common::WILDCARD_DIM) { current_cnt *= dim; } else { reshape_element_cnts.push_back(current_cnt); diff --git a/src/model_config_utils.h b/src/model_config_utils.h index 44bccabe7..61547077b 100644 --- a/src/model_config_utils.h +++ b/src/model_config_utils.h @@ -25,12 +25,13 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once +#include + #include "filesystem/api.h" #include "model_config.pb.h" #include "status.h" #include "triton/common/model_config.h" #include "tritonserver_apis.h" - namespace triton { namespace core { /// Enumeration for the different backend types. @@ -319,4 +320,77 @@ bool EquivalentInInstanceConfig( std::string InstanceConfigSignature( const inference::ModelInstanceGroup& instance_config); +template +Status +GetElementCount(const T& dims, const std::string& name, int64_t* cnt) +{ + if (cnt == nullptr) { + return Status(Status::Code::INTERNAL, "argument `cnt` cannot be nullptr"); + } + + int64_t element_count = 0; + element_count = triton::common::GetElementCount(dims); + if (element_count == triton::common::INVALID_SIZE) { + return Status( + Status::Code::INVALID_ARG, + "tensor '" + name + "' contains an invalid dimension in shape " + + triton::common::DimsListToString(dims)); + } else if (element_count == triton::common::OVERFLOW_SIZE) { + return Status( + Status::Code::INVALID_ARG, "element count for tensor '" + name + + "' exceeds maximum size of " + + std::to_string(INT64_MAX)); + } + + *cnt = element_count; + return Status::Success; +} + +template +Status +GetByteSize( + const inference::DataType& dtype, const T& dims, const std::string& name, + int64_t* size) +{ + if (size == nullptr) { + return Status(Status::Code::INTERNAL, "argument `size` cannot be nullptr"); + } + + int64_t byte_size = 0; + if (dtype == inference::DataType::TYPE_STRING) { + int64_t element_count = 0; + RETURN_IF_ERROR(GetElementCount(dims, name, &element_count)); + + if (element_count == triton::common::WILDCARD_SIZE) { + *size = triton::common::WILDCARD_SIZE; + return Status::Success; + } + + // Total number of bytes required is equal to the element count + // multiplied by 4. + if (element_count > static_cast(INT64_MAX / sizeof(int32_t))) { + return Status( + Status::Code::INVALID_ARG, "byte size for tensor '" + name + + "' exceeds maximum size of " + + std::to_string(INT64_MAX)); + } + byte_size = sizeof(int32_t) * element_count; + } else { + byte_size = triton::common::GetByteSize(dtype, dims); + if (byte_size == triton::common::INVALID_SIZE) { + return Status( + Status::Code::INVALID_ARG, + "tensor '" + name + "' contains an invalid dimension " + + triton::common::DimsListToString(dims)); + } else if (byte_size == triton::common::OVERFLOW_SIZE) { + return Status( + Status::Code::INVALID_ARG, "byte size for tensor '" + name + + "' exceeds maximum size of " + + std::to_string(INT64_MAX)); + } + } + *size = byte_size; + return Status::Success; +} + }} // namespace triton::core diff --git a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc index 45e9c037c..f51877301 100644 --- a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc +++ b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc @@ -1,4 +1,4 @@ -// Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -384,13 +384,14 @@ SequenceBatchScheduler::GenerateInitialStateData( auto state_dim = state.dims().begin(); for (; initial_state_dim != initial_state.dims().end(); initial_state_dim++, state_dim++) { - if (*initial_state_dim == -1) { + if (*initial_state_dim == triton::common::WILDCARD_DIM) { return Status( Status::Code::INVALID_ARG, std::string("'initial_state' field for state input name '") + state.input_name() + "' contains variable dimensions."); } else { - if (*state_dim != -1 && *initial_state_dim != *state_dim) { + if (*state_dim != triton::common::WILDCARD_DIM && + *initial_state_dim != *state_dim) { return Status( Status::Code::INVALID_ARG, std::string("'initial_state' dim for input name '") + @@ -404,15 +405,11 @@ SequenceBatchScheduler::GenerateInitialStateData( } // Calculate total memory byte size - auto element_count = triton::common::GetElementCount(initial_state.dims()); - size_t dtype_byte_size = - triton::common::GetDataTypeByteSize(initial_state.data_type()); - size_t total_byte_size = element_count * dtype_byte_size; - - // Custom handling for TYPE_BYTES - if (dtype_byte_size == 0) { - total_byte_size = sizeof(int32_t) * element_count; - } + int64_t total_byte_size_signed = 0; + RETURN_IF_ERROR(GetByteSize( + initial_state.data_type(), initial_state.dims(), state.input_name(), + &total_byte_size_signed)); + size_t total_byte_size = static_cast(total_byte_size_signed); switch (initial_state.state_data_case()) { case inference::ModelSequenceBatching_InitialState::StateDataCase:: @@ -1757,8 +1754,12 @@ DirectSequenceBatch::BatcherThread(const int nice) // Use null-request if necessary otherwise use the next // request in the queue... if (use_null_request) { - std::unique_ptr ni( - InferenceRequest::CopyAsNull(*null_irequest)); + std::unique_ptr ni = nullptr; + Status status = InferenceRequest::CopyAsNull(*null_irequest, &ni); + if (!status.IsOk()) { + LOG_ERROR << "internal: unexpected failure copying null request: " + << status.Message(); + } // Note that when the not-ready control input of the // request is "true" the model can't assume that any // other inputs are meaningful, including CORRID. So we diff --git a/src/sequence_batch_scheduler/sequence_utils.cc b/src/sequence_batch_scheduler/sequence_utils.cc index c916ccca7..96abf62bf 100644 --- a/src/sequence_batch_scheduler/sequence_utils.cc +++ b/src/sequence_batch_scheduler/sequence_utils.cc @@ -1,4 +1,4 @@ -// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -43,8 +43,8 @@ IterativeSequencer::RescheduleRequest( else if (!request->IsCancelled()) { // Use a null request to trigger sequence batcher cancellation so // additional request manipulation won't affect the actual request. - std::unique_ptr ni( - InferenceRequest::CopyAsNull(*request)); + std::unique_ptr ni = nullptr; + RETURN_IF_ERROR(InferenceRequest::CopyAsNull(*request, &ni)); ni->SetCorrelationId(request->CorrelationId()); ni->SetFlags(TRITONSERVER_REQUEST_FLAG_SEQUENCE_END); ni->Cancel(); diff --git a/src/sequence_state.cc b/src/sequence_state.cc index e1c4dc13d..66e03b9f8 100644 --- a/src/sequence_state.cc +++ b/src/sequence_state.cc @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -26,8 +26,11 @@ #include "sequence_state.h" +#include + #include "cuda_utils.h" #include "memory.h" +#include "model_config_utils.h" #include "triton/common/logging.h" namespace triton { namespace core { @@ -159,7 +162,7 @@ SequenceStates::Initialize( // Convert the variable dimensions to 1 for the first request. for (auto& dim : state_config.dims()) { - if (dim == -1) { + if (dim == triton::common::WILDCARD_DIM) { dims.push_back(1); } else { dims.push_back(dim); @@ -209,16 +212,10 @@ SequenceStates::Initialize( initial_state_it->second.data_->TotalByteSize()); } } else { - size_t state_size; - if (state.second.data_type() == inference::DataType::TYPE_STRING) { - auto element_count = triton::common::GetElementCount(dims); - // Total number of bytes required is equal to the element count - // multiplied by 4. - state_size = 4 * element_count; - } else { - state_size = - triton::common::GetByteSize(state.second.data_type(), dims); - } + int64_t state_size = 0; + RETURN_IF_ERROR(GetByteSize( + state.second.data_type(), dims, state_config.input_name(), + &state_size)); if (use_growable_memory) { std::unique_ptr growable_memory; RETURN_IF_ERROR(GrowableMemory::Create( @@ -374,9 +371,16 @@ SequenceStates::OutputState( return OutputState(name, datatype, shape.data(), shape.size(), output_state); } -std::shared_ptr -SequenceStates::CopyAsNull(const std::shared_ptr& from) +Status +SequenceStates::CopyAsNull( + const std::shared_ptr& from, + std::shared_ptr* to) { + if (to == nullptr) { + return Status( + Status::Code::INVALID_ARG, "SequenceStates 'to' must not be null"); + } + std::shared_ptr lsequence_states; if (from != nullptr) { lsequence_states.reset(new SequenceStates); @@ -394,10 +398,10 @@ SequenceStates::CopyAsNull(const std::shared_ptr& from) std::shared_ptr data; if (from_input_state_tensor->DType() == inference::DataType::TYPE_STRING) { - // Use all-zero input states for null requests. - auto element_count = - triton::common::GetElementCount(from_input_state_tensor->Shape()); - auto state_size = 4 * element_count; + int64_t state_size = 0; + RETURN_IF_ERROR(GetByteSize( + inference::DataType::TYPE_STRING, from_input_state_tensor->Shape(), + from_input_state_tensor->Name(), &state_size)); data = std::make_shared( state_size, TRITONSERVER_MEMORY_CPU, 0); } else { @@ -424,6 +428,7 @@ SequenceStates::CopyAsNull(const std::shared_ptr& from) false /* use_growable_memory */))); } } - return lsequence_states; + *to = std::move(lsequence_states); + return Status::Success; } }} // namespace triton::core diff --git a/src/sequence_state.h b/src/sequence_state.h index 7faba3429..c2e9fe909 100644 --- a/src/sequence_state.h +++ b/src/sequence_state.h @@ -1,4 +1,4 @@ -// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -160,8 +160,10 @@ class SequenceStates { const std::vector& shape, SequenceState** output_state); // Create a copy of the 'from' sequence states for NULL requests. - static std::shared_ptr CopyAsNull( - const std::shared_ptr& from); + // On success, sets *to and returns Status::Success; on failure returns error. + static Status CopyAsNull( + const std::shared_ptr& from, + std::shared_ptr* to); const std::map>& InputStates() {