diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py index 8412c9c914..8246b1f8e7 100644 --- a/py/torch_tensorrt/_compile.py +++ b/py/torch_tensorrt/_compile.py @@ -537,7 +537,7 @@ def convert_method_to_trt_engine( module, torchtrt_arg_inputs, kwarg_inputs=torchtrt_kwarg_inputs, **kwargs ) - return dynamo_convert_exported_program_to_serialized_trt_engine( # type: ignore[no-any-return] + return dynamo_convert_exported_program_to_serialized_trt_engine( exp_program, arg_inputs=tuple(normalized_arg_inputs), kwarg_inputs=torchtrt_kwarg_inputs, @@ -1272,41 +1272,36 @@ def _save_as_executorch(exp_program: Any, file_path: str, **kwargs: Any) -> None def _normalize_engine_constants_to_python(exp_program: "ExportedProgram") -> None: - pass + """Replace TensorRT engine constants with picklable ``EngineSerializer`` wrappers. + Constants may be C++ ``torch.classes.tensorrt.Engine`` objects (a + ``torch._C.ScriptObject``) or Python ``TRTEngine`` instances. Each one's state is + unpacked, its engine field base64-decoded, and the result stored as an + ``EngineSerializer`` so the saved artifact is portable across both runtimes. + """ + import base64 -# TODO: Uncomment this when cross serialization is enabled -# """Convert C++ ``torch.classes.tensorrt.Engine`` constants to Python ``TRTEngine``. - -# The C++ runtime stores engine constants as ``torch._C.ScriptObject`` -# (``torch.classes.tensorrt.Engine``). Python ``TRTEngine`` is registered as -# an opaque type so ``torch.export`` can serialise it with ``pickle``. By -# converting before save the artifact is portable across both runtimes. 
-# """ -# import base64 - -# from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ENGINE_IDX -# from torch_tensorrt.dynamo.runtime._TRTEngine import ( -# EngineSerializer, -# TRTEngine, -# ) - -# for fqn, constant in list(exp_program.constants.items()): -# if isinstance(constant, (torch._C.ScriptObject, TRTEngine)): + from torch_tensorrt.dynamo.runtime._serialized_engine_layout import ENGINE_IDX + from torch_tensorrt.dynamo.runtime._TRTEngine import ( + EngineSerializer, + TRTEngine, + ) -# state = constant.__getstate__() -# if len(state) == 2 and ( -# state[1] == "TRTEngine" -# or state[1] == "__torch__.torch.classes.tensorrt.Engine" -# ): -# serialized_info = list(state[0]) -# serialized_info[ENGINE_IDX] = base64.b64decode( -# serialized_info[ENGINE_IDX] -# ) -# exp_program.constants[fqn] = EngineSerializer(serialized_info) + for fqn, constant in list(exp_program.constants.items()): + if isinstance(constant, (torch._C.ScriptObject, TRTEngine)): + + state = constant.__getstate__() + if len(state) == 2 and ( + state[1] == "TRTEngine" + or state[1] == "__torch__.torch.classes.tensorrt.Engine" + ): + serialized_info = list(state[0]) + serialized_info[ENGINE_IDX] = base64.b64decode( + serialized_info[ENGINE_IDX] + ) + exp_program.constants[fqn] = EngineSerializer(serialized_info) -# def function_overload_with_kwargs( fn: Callable[..., Any], *args: Any, **kwargs: Any ) -> Any: diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 29c2ed076a..ea7e6f95bc 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -83,7 +83,7 @@ def cross_compile_for_windows( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = False, + use_python_runtime: bool = False, # Deprecated; setting True emits DeprecationWarning. 
Kept for backward compatibility. use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -163,7 +163,7 @@ def cross_compile_for_windows( max_aux_stream (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. + use_python_runtime: (bool): **Deprecated**. Kept for backward compatibility; emits a ``DeprecationWarning`` when set to ``True``. The Python and C++ runtimes are now merged and the runtime is selected automatically based on whether the C++ Torch-TensorRT runtime is available. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. 
These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -220,6 +220,16 @@ def cross_compile_for_windows( stacklevel=2, ) + if use_python_runtime: + warnings.warn( + "`use_python_runtime` is deprecated and has no effect. The Python and C++ " + "runtimes have been merged; the runtime is now selected automatically based " + "on whether the C++ Torch-TensorRT runtime is available. This argument will " + "be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + if "refit" in kwargs.keys(): warnings.warn( "`refit` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refitted.", @@ -334,7 +344,6 @@ def cross_compile_for_windows( "dynamically_allocate_resources": dynamically_allocate_resources, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, - "use_python_runtime": use_python_runtime, } # disable the following settings is not supported for cross compilation for windows feature @@ -424,7 +433,7 @@ def compile( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = False, + use_python_runtime: bool = False, # Deprecated; setting True emits DeprecationWarning. Kept for backward compatibility. 
use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -519,7 +528,7 @@ def compile( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. + use_python_runtime: (bool): **Deprecated**. Kept for backward compatibility; emits a ``DeprecationWarning`` when set to ``True``. The Python and C++ runtimes are now merged and the runtime is selected automatically based on whether the C++ Torch-TensorRT runtime is available. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. 
These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -579,6 +588,16 @@ def compile( stacklevel=2, ) + if use_python_runtime: + warnings.warn( + "`use_python_runtime` is deprecated and has no effect. The Python and C++ " + "runtimes have been merged; the runtime is now selected automatically based " + "on whether the C++ Torch-TensorRT runtime is available. This argument will " + "be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + if "refit" in kwargs.keys(): warnings.warn( "`refit` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refitted", @@ -731,7 +750,6 @@ def compile( "dynamically_allocate_resources": dynamically_allocate_resources, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, - "use_python_runtime": use_python_runtime, } logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB") settings = CompilationSettings(**compilation_options) @@ -1218,7 +1236,7 @@ def convert_exported_program_to_serialized_trt_engine( max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS, version_compatible: bool = _defaults.VERSION_COMPATIBLE, optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL, - use_python_runtime: bool = False, + use_python_runtime: bool = False, # Deprecated; setting True emits DeprecationWarning. Kept for backward compatibility. 
use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER, enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS, dryrun: bool = _defaults.DRYRUN, @@ -1294,7 +1312,7 @@ def convert_exported_program_to_serialized_trt_engine( max_aux_streams (Optional[int]): Maximum streams in the engine version_compatible (bool): Build the TensorRT engines compatible with future versions of TensorRT (Restrict to lean runtime operators to provide version forward compatibility for the engines) optimization_level: (Optional[int]): Setting a higher optimization level allows TensorRT to spend longer engine building time searching for more optimization options. The resulting engine may have better performance compared to an engine built with a lower optimization level. The default optimization level is 3. Valid values include integers from 0 to the maximum optimization level, which is currently 5. Setting it to be greater than the maximum level results in identical behavior to the maximum level. - use_python_runtime: (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). The default is ``False``, which uses the C++ runtime when available and falls back to the Python runtime automatically when the C++ runtime is unavailable. + use_python_runtime: (bool): **Deprecated**. Kept for backward compatibility; emits a ``DeprecationWarning`` when set to ``True``. The Python and C++ runtimes are now merged and the runtime is selected automatically based on whether the C++ Torch-TensorRT runtime is available. use_fast_partitioner: (bool): Use the adjacency based partitioning scheme instead of the global partitioner. Adjacency partitioning is faster but may not be optimal. Use the global paritioner (``False``) if looking for best performance enable_experimental_decompositions (bool): Use the full set of operator decompositions. 
These decompositions may not be tested but serve to make the graph easier to convert to TensorRT, potentially increasing the amount of graphs run in TensorRT. dryrun (bool): Toggle for "Dryrun" mode, running everything except conversion to TRT and logging outputs @@ -1344,6 +1362,16 @@ def convert_exported_program_to_serialized_trt_engine( stacklevel=2, ) + if use_python_runtime: + warnings.warn( + "`use_python_runtime` is deprecated and has no effect. The Python and C++ " + "runtimes have been merged; the runtime is now selected automatically based " + "on whether the C++ Torch-TensorRT runtime is available. This argument will " + "be removed in a future release.", + DeprecationWarning, + stacklevel=2, + ) + if "refit" in kwargs.keys(): warnings.warn( "`refit` is deprecated. Please set `immutable_weights=False` to build a refittable engine whose weights can be refitted", @@ -1473,7 +1501,6 @@ def convert_exported_program_to_serialized_trt_engine( "use_distributed_mode_trace": use_distributed_mode_trace, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, - "use_python_runtime": use_python_runtime, } if "runtime_cache_path" in compilation_options: compilation_options.pop("runtime_cache_path") diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py index 784066cc75..8b2e1f24a4 100644 --- a/py/torch_tensorrt/dynamo/_defaults.py +++ b/py/torch_tensorrt/dynamo/_defaults.py @@ -70,7 +70,6 @@ DECOMPOSE_ATTENTION = False ATTN_BIAS_IS_CAUSAL = True DYNAMIC_SHAPES_KERNEL_SPECIALIZATION_STRATEGY = "lazy" -USE_PYTHON_RUNTIME = False if platform.system() == "Linux": import pwd diff --git a/py/torch_tensorrt/dynamo/_exporter.py b/py/torch_tensorrt/dynamo/_exporter.py index c92fc77341..d844b8d92c 100644 --- a/py/torch_tensorrt/dynamo/_exporter.py +++ b/py/torch_tensorrt/dynamo/_exporter.py @@ -472,8 +472,6 @@ def inline_trt_modules( continue # Get the TRT submodule trt_module = getattr(gm, name) - if 
trt_module._use_python_runtime: - raise ValueError("Python runtime is not supported for serialization") # Ensure the trt module node in the main graph (gm) has inputs trt_module_node = [node for node in gm.graph.nodes if node.name == name] diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py index 3fe18e0a0d..2c3434b706 100644 --- a/py/torch_tensorrt/dynamo/_settings.py +++ b/py/torch_tensorrt/dynamo/_settings.py @@ -53,7 +53,6 @@ USE_DISTRIBUTED_MODE_TRACE, USE_FAST_PARTITIONER, USE_FP32_ACC, - USE_PYTHON_RUNTIME, VERSION_COMPATIBLE, WORKSPACE_SIZE, default_device, @@ -118,7 +117,6 @@ class CompilationSettings: dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True. attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False. - use_python_runtime (bool): Force the pure-Python TensorRT runtime (``TRTEngine`` + ``tensorrt::execute_engine_python``). When ``False`` (default) the C++ runtime is used if available and the Python runtime is used as a fallback otherwise. 
""" workspace_size: int = WORKSPACE_SIZE @@ -181,7 +179,6 @@ class CompilationSettings: dynamically_allocate_resources: bool = DYNAMICALLY_ALLOCATE_RESOURCES decompose_attention: bool = DECOMPOSE_ATTENTION attn_bias_is_causal: bool = ATTN_BIAS_IS_CAUSAL - use_python_runtime: bool = USE_PYTHON_RUNTIME def __getstate__(self) -> dict[str, Any]: from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( @@ -196,6 +193,7 @@ def __getstate__(self) -> dict[str, Any]: return state def __setstate__(self, state: dict[str, Any]) -> None: + state.pop("use_python_runtime", None) self.__dict__.update(state) diff --git a/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py b/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py index 74d363752f..391914654a 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py +++ b/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py @@ -137,43 +137,42 @@ def set_runtime_states( # Pickle reconstruction — returns the right engine type for the current runtime # --------------------------------------------------------------------------- -# TODO: Uncomment this when cross serialization is enabled -# def _reconstruct_trt_engine(serialized_info: List[Any]) -> Any: -# """Reconstruct a TRT engine from its serialized info list. +def _reconstruct_trt_engine(serialized_info: List[Any]) -> Any: + """Reconstruct a TRT engine from its serialized info list. -# Called by pickle when deserializing a ``TRTEngine``. Checks which runtime -# is available and returns either a C++ ``torch.classes.tensorrt.Engine`` or -# a Python ``TRTEngine``, so a single ``.pt2`` artifact is portable across -# runtimes. 
-# """ -# serialized_info = list(serialized_info) -# engine_field = serialized_info[ENGINE_IDX] -# if isinstance(engine_field, str): -# serialized_info[ENGINE_IDX] = base64.b64decode(engine_field.encode("utf-8")) -# elif isinstance(engine_field, bytes) and not engine_field.startswith(b"ftrt"): -# serialized_info[ENGINE_IDX] = base64.b64decode(engine_field) + Called by pickle when deserializing a ``TRTEngine``. Checks which runtime + is available and returns either a C++ ``torch.classes.tensorrt.Engine`` or + a Python ``TRTEngine``, so a single ``.pt2`` artifact is portable across + runtimes. + """ + serialized_info = list(serialized_info) + engine_field = serialized_info[ENGINE_IDX] + if isinstance(engine_field, str): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field.encode("utf-8")) + elif isinstance(engine_field, bytes) and not engine_field.startswith(b"ftrt"): + serialized_info[ENGINE_IDX] = base64.b64decode(engine_field) -# if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: -# return torch.classes.tensorrt.Engine(tuple(serialized_info)) + if torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: + return torch.classes.tensorrt.Engine(tuple(serialized_info)) -# return TRTEngine(serialized_info) + return TRTEngine(serialized_info) -# class EngineSerializer(OpaqueBase): # type: ignore[misc] -# def __init__(self, serialized_info: SerializedTensorRTEngineFmt) -> None: -# self.serialized_info = serialized_info +class EngineSerializer(OpaqueBase): # type: ignore[misc] + def __init__(self, serialized_info: SerializedTensorRTEngineFmt) -> None: + self.serialized_info = serialized_info -# def __reduce__(self) -> Tuple[Any, Tuple[List[Any]]]: -# """Pickle protocol: delegates to :func:`_reconstruct_trt_engine`. + def __reduce__(self) -> Tuple[Any, Tuple[List[Any]]]: + """Pickle protocol: delegates to :func:`_reconstruct_trt_engine`. 
-# The reconstruction function checks which runtime is available at -# load time and returns either a C++ ``torch.classes.tensorrt.Engine`` -# or a Python ``TRTEngine``, so a single saved artifact works on both. -# """ -# state = list(self.serialized_info) -# state[ENGINE_IDX] = base64.b64encode(state[ENGINE_IDX]).decode("utf-8") -# return (_reconstruct_trt_engine, (state,)) + The reconstruction function checks which runtime is available at + load time and returns either a C++ ``torch.classes.tensorrt.Engine`` + or a Python ``TRTEngine``, so a single saved artifact works on both. + """ + state = list(self.serialized_info) + state[ENGINE_IDX] = base64.b64encode(state[ENGINE_IDX]).decode("utf-8") + return (_reconstruct_trt_engine, (state,)) # --------------------------------------------------------------------------- @@ -889,45 +888,44 @@ def execute( return self._execute_standard(contiguous_inputs) -# register_opaque_type(EngineSerializer, typ="reference") +register_opaque_type(EngineSerializer, typ="reference") +if not torch_tensorrt.ENABLED_FEATURES.torch_tensorrt_runtime: -register_opaque_type(TRTEngine, typ="reference") + register_opaque_type(TRTEngine, typ="reference") + @torch.library.custom_op( # type: ignore[misc] + "tensorrt::execute_engine", mutates_args=() + ) + def execute_engine( + input_tensors: List[torch.Tensor], engine: TRTEngine + ) -> List[torch.Tensor]: + outputs = engine.execute(input_tensors) + return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs) -@torch.library.custom_op( # type: ignore[misc] - "tensorrt::execute_engine_python", mutates_args=() -) -def execute_engine_python( - input_tensors: List[torch.Tensor], engine: TRTEngine -) -> List[torch.Tensor]: - outputs = engine.execute(input_tensors) - return [outputs] if isinstance(outputs, torch.Tensor) else list(outputs) - - -@execute_engine_python.register_fake # type: ignore[misc] -def execute_engine_python_fake( - input_tensors: List[torch.Tensor], engine: TRTEngine -) -> 
List[torch.Tensor]: - """Abstract/fake kernel for ``tensorrt::execute_engine``. - - Called by FakeTensor propagation and ``torch.export`` to infer output - shapes and dtypes without executing the real TRT engine. Output shapes - are obtained by asking the engine's execution context to propagate the - concrete input shapes symbolically; dtypes come from the engine's - pre-parsed output dtype list. - """ - input_shapes = [list(t.shape) for t in input_tensors] - try: - output_shapes = engine.infer_outputs(input_shapes) - except Exception: - # Fall back to the statically-stored shapes when shape inference is - # unavailable (e.g. engine context not yet initialised in meta mode). - output_shapes = [list(s) for s in engine.output_shapes] - - return [ - torch.empty( - shape, dtype=engine.output_dtypes[i], device=input_tensors[0].device - ) - for i, shape in enumerate(output_shapes) - ] + @execute_engine.register_fake # type: ignore[misc] + def execute_engine_fake( + input_tensors: List[torch.Tensor], engine: TRTEngine + ) -> List[torch.Tensor]: + """Abstract/fake kernel for ``tensorrt::execute_engine``. + + Called by FakeTensor propagation and ``torch.export`` to infer output + shapes and dtypes without executing the real TRT engine. Output shapes + are obtained by asking the engine's execution context to propagate the + concrete input shapes symbolically; dtypes come from the engine's + pre-parsed output dtype list. + """ + input_shapes = [list(t.shape) for t in input_tensors] + try: + output_shapes = engine.infer_outputs(input_shapes) + except Exception: + # Fall back to the statically-stored shapes when shape inference is + # unavailable (e.g. engine context not yet initialised in meta mode). 
+ output_shapes = [list(s) for s in engine.output_shapes] + + return [ + torch.empty( + shape, dtype=engine.output_dtypes[i], device=input_tensors[0].device + ) + for i, shape in enumerate(output_shapes) + ] diff --git a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py index 0386c97ea3..807a54b0db 100644 --- a/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py @@ -126,10 +126,7 @@ def __init__( self.settings = copy.deepcopy(settings) self.weight_name_map = weight_name_map self.serialized_engine = serialized_engine - self.engine: Any = None - self._use_python_runtime = settings.use_python_runtime - - self.execute_engine_op: Any = None + self.engine = None self.requires_output_allocator = requires_output_allocator self.dynamically_allocate_resources = settings.dynamically_allocate_resources self.symbolic_shape_expressions = symbolic_shape_expressions @@ -275,17 +272,15 @@ def setup_engine(self) -> None: if self.engine is not None: return - if self._use_python_runtime: + if ENABLED_FEATURES.torch_tensorrt_runtime: + self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) + else: from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine - self.engine = TRTEngine( + self.engine = TRTEngine( # type: ignore[assignment] self._pack_engine_info(), profile_execution=self.profiling_enabled, ) - self.execute_engine_op = torch.ops.tensorrt.execute_engine_python - else: - self.engine = torch.classes.tensorrt.Engine(self._pack_engine_info()) - self.execute_engine_op = torch.ops.tensorrt.execute_engine # requires_native_multidevice is set by the C++ constructor from the serialized REQUIRES_NATIVE_MULTIDEVICE_IDX field. 
if self.engine.requires_native_multidevice: @@ -380,28 +375,18 @@ def set_extra_state(self, state: SerializedTorchTensorRTModuleFmt) -> None: self.weight_name_map = metadata["weight_name_map"] self.symbolic_shape_expressions = metadata["inout_symexprs"] - # Re-resolve the runtime now that we have the loaded settings: the - # original __init__ kwarg may have been False, but a saved engine - # can still pin use_python_runtime=True via the settings blob. - self._use_python_runtime = ( - getattr(self.settings, "use_python_runtime", False) - or not ENABLED_FEATURES.torch_tensorrt_runtime - ) - if self._use_python_runtime: + if ENABLED_FEATURES.torch_tensorrt_runtime: + self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) + else: from torch_tensorrt.dynamo.runtime._TRTEngine import TRTEngine - self.engine = TRTEngine(serialized_engine_info) - self.execute_engine_op = torch.ops.tensorrt.execute_engine_python - else: - self.engine = torch.classes.tensorrt.Engine(serialized_engine_info) - self.execute_engine_op = torch.ops.tensorrt.execute_engine + self.engine = TRTEngine(serialized_engine_info) # type: ignore[assignment] self.engine.set_output_tensors_as_unowned( metadata["output_tensors_are_unowned"] ) else: self.engine = None - self.execute_engine_op = None self.settings = CompilationSettings() self.hardware_compatible = False @@ -454,12 +439,7 @@ def forward(self, *inputs: Any) -> torch.Tensor | Tuple[torch.Tensor, ...]: else: input_tensors.append(torch.tensor(i).cuda()) - if self.execute_engine_op is None: - raise RuntimeError( - "execute_engine op has not been bound. Call setup_engine() first." 
- ) - - outputs = self.execute_engine_op(input_tensors, self.engine) + outputs = torch.ops.tensorrt.execute_engine(input_tensors, self.engine) if len(outputs) == 1: return outputs[0] diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index db3e1cea45..33595f4709 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -592,15 +592,14 @@ def parse_dynamo_kwargs( if "options" in kwargs and len(kwargs) == 1: kwargs = kwargs["options"] - # TODO: Uncomment this when cross serialization is enabled - # if "use_python_runtime" in kwargs: - # warnings.warn( - # 'torch.compile option "use_python_runtime" was removed; use ' - # "the Python runtime is now selected automatically when the C++ extension is unavailable.", - # DeprecationWarning, - # stacklevel=2, - # ) - # kwargs = {k: v for k, v in kwargs.items() if k != "use_python_runtime"} + if "use_python_runtime" in kwargs: + warnings.warn( + 'torch.compile option "use_python_runtime" was removed; ' + "the Python runtime is now selected automatically when the C++ extension is unavailable.", + DeprecationWarning, + stacklevel=2, + ) + kwargs = {k: v for k, v in kwargs.items() if k != "use_python_runtime"} if "truncate_long_and_double" in kwargs: if (