From 28782afac742e458249da7770ee826f0e5cd0b9e Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Sat, 21 Mar 2026 22:43:45 +0530 Subject: [PATCH 1/6] Clarify scope of max_inflight_requests in ModelEnsembling docs --- protobuf/model_config.proto | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/protobuf/model_config.proto b/protobuf/model_config.proto index 2ce5cc0..8ab1771 100644 --- a/protobuf/model_config.proto +++ b/protobuf/model_config.proto @@ -1662,11 +1662,12 @@ message ModelEnsembling //@@ .. cpp:var:: uint32 max_inflight_requests //@@ - //@@ The maximum number of concurrent inflight requests allowed at each - //@@ ensemble step per inference request. This limit prevents unbounded - //@@ memory growth when ensemble steps produce responses faster than - //@@ downstream steps can consume, e.g. decoupled models. - //@@ Default value is 0, which indicates that no limit is enforced. + //@@ The maximum number of concurrent in-flight requests allowed at each + //@@ ensemble step across all ongoing ensemble requests for this model. + //@@ This global, per-step limit prevents unbounded memory growth when + //@@ ensemble steps produce responses faster than downstream steps can + //@@ consume them (for example, in decoupled models). + //@@ The default value is 0, which indicates that no limit is enforced. //@@ //@@ Note: Applying this limit may block upstream steps while they wait //@@ for downstream capacity. 
This blocking does not cancel or internally From 82892981df5c643560aab5bcc496530f312c6abb Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 31 Mar 2026 11:26:38 +0530 Subject: [PATCH 2/6] Drop 'global' from max_inflight_requests limit description --- protobuf/model_config.proto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protobuf/model_config.proto b/protobuf/model_config.proto index 8ab1771..81881d6 100644 --- a/protobuf/model_config.proto +++ b/protobuf/model_config.proto @@ -1664,7 +1664,7 @@ message ModelEnsembling //@@ //@@ The maximum number of concurrent in-flight requests allowed at each //@@ ensemble step across all ongoing ensemble requests for this model. - //@@ This global, per-step limit prevents unbounded memory growth when + //@@ This per-step limit prevents unbounded memory growth when //@@ ensemble steps produce responses faster than downstream steps can //@@ consume them (for example, in decoupled models). //@@ The default value is 0, which indicates that no limit is enforced. From 7ad39d2eaf3f78cd1cb09a662a08e325ebfe1def Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 31 Mar 2026 16:20:15 +0530 Subject: [PATCH 3/6] Update protobuf/model_config.proto Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- protobuf/model_config.proto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protobuf/model_config.proto b/protobuf/model_config.proto index 81881d6..e8f3659 100644 --- a/protobuf/model_config.proto +++ b/protobuf/model_config.proto @@ -1663,7 +1663,7 @@ message ModelEnsembling //@@ .. cpp:var:: uint32 max_inflight_requests //@@ //@@ The maximum number of concurrent in-flight requests allowed at each - //@@ ensemble step across all ongoing ensemble requests for this model. + //@@ ensemble step across all ongoing ensemble requests for this model instance. 
//@@ This per-step limit prevents unbounded memory growth when //@@ ensemble steps produce responses faster than downstream steps can //@@ consume them (for example, in decoupled models). From a4e6a458c14e0ad69b6c2851d087ba5fe3eab198 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 31 Mar 2026 16:20:33 +0530 Subject: [PATCH 4/6] Update protobuf/model_config.proto Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- protobuf/model_config.proto | 1 + 1 file changed, 1 insertion(+) diff --git a/protobuf/model_config.proto b/protobuf/model_config.proto index e8f3659..fe24cb7 100644 --- a/protobuf/model_config.proto +++ b/protobuf/model_config.proto @@ -1662,6 +1662,7 @@ message ModelEnsembling //@@ .. cpp:var:: uint32 max_inflight_requests //@@ + //@@ BETA (Subject to change) //@@ The maximum number of concurrent in-flight requests allowed at each //@@ ensemble step across all ongoing ensemble requests for this model instance. //@@ This per-step limit prevents unbounded memory growth when From 7945a0b0791e283f6dfbe3cfaeb74e7976a19690 Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 31 Mar 2026 16:20:53 +0530 Subject: [PATCH 5/6] Update protobuf/model_config.proto Co-authored-by: Yingge He <157551214+yinggeh@users.noreply.github.com> --- protobuf/model_config.proto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protobuf/model_config.proto b/protobuf/model_config.proto index fe24cb7..c5ccfda 100644 --- a/protobuf/model_config.proto +++ b/protobuf/model_config.proto @@ -1,4 +1,4 @@ -// Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions From b84b5064ee90a82610d75c935822fbc1fc4ca57a Mon Sep 17 00:00:00 2001 From: Sai Kiran Polisetty Date: Tue, 31 Mar 2026 17:59:04 +0530 Subject: [PATCH 6/6] Rewrap max_inflight_requests comment to fit 80 columns --- protobuf/model_config.proto | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protobuf/model_config.proto b/protobuf/model_config.proto index c5ccfda..5875416 100644 --- a/protobuf/model_config.proto +++ b/protobuf/model_config.proto @@ -1664,8 +1664,8 @@ message ModelEnsembling //@@ //@@ BETA (Subject to change) //@@ The maximum number of concurrent in-flight requests allowed at each - //@@ ensemble step across all ongoing ensemble requests for this model instance. - //@@ This per-step limit prevents unbounded memory growth when + //@@ ensemble step across all ongoing ensemble requests for this model + //@@ instance. This per-step limit prevents unbounded memory growth when //@@ ensemble steps produce responses faster than downstream steps can //@@ consume them (for example, in decoupled models). //@@ The default value is 0, which indicates that no limit is enforced.