diff --git a/cudaq/include/cudaq/Target/CompileTarget.h b/cudaq/include/cudaq/Target/CompileTarget.h
index ffd1371f394..cb741f469ed 100644
--- a/cudaq/include/cudaq/Target/CompileTarget.h
+++ b/cudaq/include/cudaq/Target/CompileTarget.h
@@ -21,6 +21,17 @@ class CompileTarget {
   /// Hook to update the pass pipeline before compilation.
   virtual void updatePassPipeline(std::string &passPipeline) const {}
 
+  /// Whether to recompile the kernel in the presence of an AOT-compiled module.
+  ///
+  /// If this is `false` and an AOT-compiled kernel (in the form of a function
+  /// pointer) is provided, then compilation will be skipped and all other
+  /// options in this class will be ignored.
+  ///
+  /// If this is `true`, the AOT-compiled module (if it exists) will be
+  /// discarded and compilation will start from scratch, according to the
+  /// options in this class.
+  bool overrideAOTCompilation = false;
+
   /// Resolved MLIR pass-pipeline and `codegen` settings.
   struct PipelineConfig {
     /// If set, override compilation pipeline with this string.
diff --git a/cudaq/tools/nvqpp/nvq++.in b/cudaq/tools/nvqpp/nvq++.in
index c7d7bd05d77..7bbbfac9fd7 100644
--- a/cudaq/tools/nvqpp/nvq++.in
+++ b/cudaq/tools/nvqpp/nvq++.in
@@ -724,6 +724,8 @@ NVQIR_LIBS="${NVQIR_LIBS}${NVQIR_SIMULATION_BACKEND}"
 # Add the MLIR-related libraries if requested. 
 if ${ENABLE_MLIR_LIB_LINKING}; then 
     LINKLIBS="${LINKLIBS} -lcudaq-mlir-runtime -lcudaq-builder"
+else
+	PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_DISABLE_JIT_COMPILER"
 fi
 
 if ${ENABLE_REALTIME_LOWERING}; then
diff --git a/python/runtime/interop/CMakeLists.txt b/python/runtime/interop/CMakeLists.txt
index 02135cd4980..6e1bb28a35b 100644
--- a/python/runtime/interop/CMakeLists.txt
+++ b/python/runtime/interop/CMakeLists.txt
@@ -15,8 +15,7 @@ target_include_directories(cudaq-python-interop PRIVATE
     ${Python3_INCLUDE_DIRS}
 )
 target_link_libraries(cudaq-python-interop
-    PRIVATE nanobind-static Python3::Module cudaq
-    PUBLIC cudaq-mlir-runtime-headers)
+    PRIVATE nanobind-static Python3::Module cudaq)
 install (FILES PythonCppInterop.h PythonCppInteropDecls.h DESTINATION include/cudaq/python/)
 
 install(TARGETS cudaq-python-interop EXPORT cudaq-python-interop-targets DESTINATION lib)
diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h
index c00cc025802..f66c01e069a 100644
--- a/runtime/common/BaseRemoteRESTQPU.h
+++ b/runtime/common/BaseRemoteRESTQPU.h
@@ -14,7 +14,6 @@
 #include "common/KernelExecution.h"
 #include "common/Resources.h"
 #include "common/ServerHelper.h"
-#include "cudaq_internal/compiler/Compiler.h"
 #include "nvqir/AnalysisScope.h"
 #include "nvqir/resourcecounter/ResourceCounterScope.h"
 #include "cudaq/Target/TargetConfig.h"
@@ -56,8 +55,6 @@ inline observe_result observeResultFromCounts(const observe_policy &policy,
 
 class BaseRemoteRESTQPU : public QPU {
 protected:
-  using Compiler = cudaq_internal::compiler::Compiler;
-
   /// The number of shots
   std::optional<int> nShots;
 
@@ -280,47 +277,6 @@ class BaseRemoteRESTQPU : public QPU {
     return target;
   }
 
-  /// @brief Build the list of kernel executions for the given module under
-  /// a specific sampling policy. Source modules are lowered through the
-  /// configured pass pipeline; pre-compiled modules are emitted directly.
-  /// The resolved kernel name is returned via @p kernelName.
-  template <typename Policy>
-  std::pair<std::string, std::vector<cudaq::KernelExecution>>
-  compileKernelExecutions(Policy &policy, const AnyModule &module,
-                          KernelArgs args) {
-    Compiler compiler(getCompileTarget(policy));
-    std::string kernelName;
-    std::optional<CompiledModule> compiled;
-    if (std::holds_alternative<SourceModule>(module)) {
-      const auto &src = std::get<SourceModule>(module);
-      kernelName = src.getName();
-      CUDAQ_INFO("launching remote rest kernel ({})", kernelName);
-
-      auto [moduleOp, context] = Compiler::loadQuakeCodeByName(kernelName);
-
-      compiled = compiler.runPassPipeline(kernelName, moduleOp, args, true,
-                                          std::move(context));
-      if constexpr (std::is_same_v<Policy, sample_policy>) {
-        if (compiler.hasWarnedNamedMeasurements())
-          policy.warnedNamedMeasurements = true;
-      }
-    } else {
-      compiled = std::get<CompiledModule>(module);
-      kernelName = compiled->getName();
-      CUDAQ_INFO("launching remote rest kernel via module ({})", kernelName);
-    }
-
-    auto codes = compiler.emitKernelExecutions(*compiled);
-
-    // Propagate metadata from the compiled artifact to the execution context.
-    if (auto ctx = getExecutionContext()) {
-      ctx->hasConditionalsOnMeasureResults =
-          compiled->getMetadata().hasConditionalsOnMeasureResults;
-    }
-
-    return {kernelName, codes};
-  }
-
   void completeLaunchKernel(const std::string &kernelName,
                             std::vector<cudaq::KernelExecution> &&codes) {
     auto executionContext = cudaq::getExecutionContext();
@@ -514,7 +470,7 @@ class BaseRemoteRESTQPU : public QPU {
   }
 
   async_observe_result
-  completeLaunchKernel(async_observe_policy &policy,
+  completeLaunchKernel(const async_observe_policy &policy,
                        const std::string &kernelName,
                        std::vector<cudaq::KernelExecution> &&codes) {
     std::size_t localShots = 1000;
diff --git a/runtime/common/CompiledModule.h b/runtime/common/CompiledModule.h
index 9b4b9d11fb0..d5ec2966700 100644
--- a/runtime/common/CompiledModule.h
+++ b/runtime/common/CompiledModule.h
@@ -263,34 +263,13 @@ class FatQuakeModule {
   ArtifactsStore artifacts;
 };
 
-/// @brief A compiled MLIR module, ready for execution or code generation.
-///
-/// Contains any number of named compilation artifacts (we currently support
-/// JIT binaries, optimized MLIR modules, and pre-computed resource metrics)
-/// that result from the compilation of a Quake MLIR module.
-///
-/// This type does not depend on MLIR/LLVM — it only keeps type-erased / opaque
-/// pointers. Build instances with
-/// `cudaq_internal::compiler::CompiledModuleHelper`.
-class CompiledModule : public FatQuakeModule {
-public:
-  using CompilationMetadata = FatQuakeModule::CompilationMetadata;
-
-private:
-  friend class cudaq_internal::compiler::CompiledModuleHelper;
-
-  CompiledModule(std::string kernelName)
-      : FatQuakeModule(std::move(kernelName)) {}
-
-public:
-  CompiledModule() : FatQuakeModule(std::string{}) {}
-};
-
 /// Bundle of artifacts that define a CUDA-Q kernel to be compiled and executed.
 ///
 /// Contains either a `nvq++`-compiled function pointer or an MLIR module,
 /// depending on the provenance of the kernel.
 class SourceModule : public FatQuakeModule {
+  friend class cudaq_internal::compiler::CompiledModuleHelper;
+
 public:
   SourceModule(std::string kernelName)
       : FatQuakeModule(std::move(kernelName)) {}
@@ -309,10 +288,35 @@ class SourceModule : public FatQuakeModule {
   const void *getMlirOpaqueModulePtr() const;
 };
 
-// TODO: remove once C++ launch can be cleanly split into compilation + launch.
-// Used by unifiedLaunchModule to compile kernels if they have not been compiled
-// before. In the future, unifiedLaunchModule should only accept compiled
-// modules.
+/// @brief A compiled MLIR module, ready for execution or code generation.
+///
+/// Contains any number of named compilation artifacts (we currently support
+/// JIT binaries, optimized MLIR modules, and pre-computed resource metrics)
+/// that result from the compilation of a Quake MLIR module.
+///
+/// This type does not depend on MLIR/LLVM — it only keeps type-erased / opaque
+/// pointers. Build instances with
+/// `cudaq_internal::compiler::CompiledModuleHelper`.
+class CompiledModule : public FatQuakeModule {
+public:
+  using CompilationMetadata = FatQuakeModule::CompilationMetadata;
+
+private:
+  friend class cudaq_internal::compiler::CompiledModuleHelper;
+
+  CompiledModule(std::string kernelName)
+      : FatQuakeModule(std::move(kernelName)) {}
+
+public:
+  // The choice of constructors is intentionally limited to:
+  //  - empty compiled modules for default construction
+  //  - compiled module from a source module to explicitly bypass the compiler
+  // For any other use case, you should go through the factory methods in
+  // `CompiledModuleHelper`.
+  CompiledModule() : FatQuakeModule(std::string{}) {}
+  explicit CompiledModule(SourceModule src) : FatQuakeModule(std::move(src)) {}
+};
+
 using AnyModule = std::variant<SourceModule, CompiledModule>;
 
 } // namespace cudaq
diff --git a/runtime/common/ExecutionContext.h b/runtime/common/ExecutionContext.h
index d1f41164d01..9535ff0cc14 100644
--- a/runtime/common/ExecutionContext.h
+++ b/runtime/common/ExecutionContext.h
@@ -205,8 +205,8 @@ void resetExecutionContext();
 /// @brief Execute the given function within the given policy and execution
 /// context.
 template <typename Policy, typename Callable, typename... Args>
-auto with_policy_and_ctx(Policy &policy, ExecutionContext &ctx, Callable &&f,
-                         Args &&...args)
+auto with_policy_and_ctx(const Policy &policy, ExecutionContext &ctx,
+                         Callable &&f, Args &&...args)
     -> std::invoke_result_t<Callable, Args...> {
 
   // Save the outer execution context (if any) so we can restore it after.
diff --git a/runtime/cudaq/CMakeLists.txt b/runtime/cudaq/CMakeLists.txt
index 67b442e4f4c..082c4df6d81 100644
--- a/runtime/cudaq/CMakeLists.txt
+++ b/runtime/cudaq/CMakeLists.txt
@@ -66,7 +66,6 @@ if (CUDA_FOUND)
       cudaq-common
       cudaq-nlopt
       cudaq-ensmallen
-      cudaq-mlir-runtime-headers # Public until compilation is moved out of the QPUs
       cudaq-logger
       CUDAQTargetConfig
     PRIVATE
@@ -94,7 +93,6 @@ else()
       cudaq-common
       cudaq-nlopt
       cudaq-ensmallen
-      cudaq-mlir-runtime-headers # Public until compilation is moved out of the QPUs
       cudaq-logger
       CUDAQTargetConfig
     PRIVATE
diff --git a/runtime/cudaq/algorithms/launch.h b/runtime/cudaq/algorithms/launch.h
index ada31657dc2..118adc5bc3c 100644
--- a/runtime/cudaq/algorithms/launch.h
+++ b/runtime/cudaq/algorithms/launch.h
@@ -9,6 +9,7 @@
 
 #pragma once
 
+#include "common/CompiledModule.h"
 #include "common/ExecutionContext.h"
 #include "common/KernelArgs.h"
 #include "cudaq/platform.h"
@@ -17,13 +18,34 @@
 #include "cudaq/runtime/logger/logger.h"
 #include "cudaq/utils/cudaq_utils.h"
 #include <stdexcept>
+#include <variant>
+
+namespace cudaq_internal::compiler {
+template <typename Policy>
+cudaq::CompiledModule
+compileModule(const Policy &policy,
+              std::unique_ptr<cudaq::CompileTarget> target,
+              const cudaq::SourceModule &src, cudaq::KernelArgs args,
+              bool isEntryPoint = true);
+} // namespace cudaq_internal::compiler
+
+// If JIT compilation is disabled, make compilation a no-op. QPUs may throw an
+// error if they expect a JIT-compiled module.
+#ifdef CUDAQ_DISABLE_JIT_COMPILER
+template <typename Policy>
+cudaq::CompiledModule cudaq_internal::compiler::compileModule(
+    const Policy &policy, std::unique_ptr<cudaq::CompileTarget> target,
+    const cudaq::SourceModule &src, cudaq::KernelArgs args, bool isEntryPoint) {
+  return cudaq::CompiledModule{src};
+}
+#endif
 
 namespace cudaq {
 namespace detail {
 
 /// @brief Execute the given function within the given execution context.
 template <typename Policy, typename Callable, typename... Args>
-auto launch(Policy &policy, std::size_t qpu_id, ExecutionContext &ctx,
+auto launch(const Policy &policy, std::size_t qpu_id, ExecutionContext &ctx,
             quantum_platform &platform, Callable &&f, Args &&...args)
     -> Policy::result_type {
 
@@ -51,7 +73,20 @@ auto launch(Policy &policy, std::size_t qpu_id, ExecutionContext &ctx,
   auto &qpu = platform.getQPU(qpu_id);
   ctx.executeKernelApi = [&qpu, &result, &policy](const AnyModule &module,
                                                   const KernelArgs &args) {
-    result = qpu.launchKernel(policy, module, args);
+    CompiledModule compiled;
+    if (const auto *source = std::get_if<SourceModule>(&module)) {
+      std::unique_ptr<cudaq::CompileTarget> target;
+      if constexpr (requires { policy.inner; }) {
+        target = cudaq::get_compile_target(policy.inner);
+      } else {
+        target = cudaq::get_compile_target(policy);
+      }
+      compiled = cudaq_internal::compiler::compileModule(
+          policy, std::move(target), *source, args);
+    } else {
+      compiled = std::get<CompiledModule>(module);
+    }
+    result = qpu.launchKernel(policy, compiled, args);
   };
 
   if constexpr (requires { policy.inner; })
diff --git a/runtime/cudaq/algorithms/sample/policy.h b/runtime/cudaq/algorithms/sample/policy.h
index 2f28e0086e3..b791e5c3163 100644
--- a/runtime/cudaq/algorithms/sample/policy.h
+++ b/runtime/cudaq/algorithms/sample/policy.h
@@ -37,7 +37,7 @@ struct sample_policy {
 
   /// @brief Flag to indicate that a warning about named measurement registers
   /// in sampling context has already been emitted.
-  bool warnedNamedMeasurements = false;
+  mutable bool warnedNamedMeasurements = false;
 
   /// @brief A vector containing information about how to reorder the global
   /// register after execution. Empty means no reordering.
diff --git a/runtime/cudaq/platform.h b/runtime/cudaq/platform.h
index 70c69a3065a..cd0f903060a 100644
--- a/runtime/cudaq/platform.h
+++ b/runtime/cudaq/platform.h
@@ -47,14 +47,18 @@ std::unique_ptr<cudaq::CompileTarget> get_compile_target(const Policy &policy) {
   return getQuantumPlatformInternal()->getCompileTarget(policy);
 }
 
-/// Get the default compile target configuration used when JITing for Python.
+/// Get the default compile target configuration.
+///
+/// This is suitable for local simulators: it uses AOT-compiled modules
+/// as-is if they exist, and otherwise JIT-compiles the module as appropriate
+/// for a Python kernel.
 std::unique_ptr<cudaq::CompileTarget>
-getDefaultPythonCompileTarget(const sample_policy &policy);
+getDefaultCompileTarget(const sample_policy &policy);
 std::unique_ptr<cudaq::CompileTarget>
-getDefaultPythonCompileTarget(const observe_policy &policy);
+getDefaultCompileTarget(const observe_policy &policy);
 std::unique_ptr<cudaq::CompileTarget>
-getDefaultPythonCompileTarget(const other_policies &policy,
-                              ExecutionContext *context);
+getDefaultCompileTarget(const other_policies &policy,
+                        ExecutionContext *context);
 
 // Declare this function, implemented elsewhere
 std::string getQIR(const std::string &);
diff --git a/runtime/cudaq/platform/default/DefaultQPU.cpp b/runtime/cudaq/platform/default/DefaultQPU.cpp
index e14ff483b1b..db732f37979 100644
--- a/runtime/cudaq/platform/default/DefaultQPU.cpp
+++ b/runtime/cudaq/platform/default/DefaultQPU.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "DefaultQPU.h"
+#include "common/CompiledModule.h"
 #include "common/ExecutionContext.h"
 #include "common/Timing.h"
 #include "cudaq/algorithms/policies.h"
@@ -22,17 +23,19 @@ void cudaq::DefaultQPU::enqueue(QuantumTask &task) {
 cudaq::KernelThunkResultType
 cudaq::DefaultQPU::unifiedLaunchModule(const cudaq::AnyModule &module,
                                        cudaq::KernelArgs args) {
-  if (!std::holds_alternative<cudaq::SourceModule>(module))
-    return runJITCompiledModule(std::get<cudaq::CompiledModule>(module), args);
-
-  const auto &src = std::get<cudaq::SourceModule>(module);
   ScopedTraceWithContext(cudaq::TIMING_LAUNCH, "QPU::unifiedLaunchModule");
-  auto rawFn = src.getFunctionPtr();
-  if (!rawFn)
-    throw std::runtime_error(
-        "DefaultQPU::unifiedLaunchModule requires a raw kernel function "
-        "pointer for kernel '" +
-        src.getName() + "'.");
+  std::optional<FatQuakeModule::FunctionPtrArtifact> rawFn;
+  if (const auto *source = std::get_if<SourceModule>(&module)) {
+    // Must stay a hard error: assert() is compiled out under NDEBUG.
+    if (!(rawFn = source->getFunctionPtr()))
+      throw std::runtime_error("DefaultQPU::unifiedLaunchModule requires a "
+                               "raw kernel function pointer for kernel '" +
+                               source->getName() + "'.");
+  } else {
+    const auto &compiled = std::get<CompiledModule>(module);
+    if (!(rawFn = compiled.getFunctionPtr()))
+      return runJITCompiledModule(compiled, args);
+  }
   auto packed = args.getPacked();
   void *argData = packed ? packed->data.data() : nullptr;
   return rawFn->getFn()(argData, /*isRemote=*/false);
@@ -40,7 +43,7 @@ cudaq::DefaultQPU::unifiedLaunchModule(const cudaq::AnyModule &module,
 
 cudaq::sample_result
 cudaq::DefaultQPU::launchKernel(const cudaq::sample_policy &policy,
-                                const cudaq::AnyModule &module,
+                                const cudaq::CompiledModule &module,
                                 cudaq::KernelArgs args) {
   CUDAQ_INFO("DefaultQPU::launchKernel {}", policy.name);
   return cudaq::ExecutionManager::with_default_em(
@@ -50,7 +53,7 @@ cudaq::DefaultQPU::launchKernel(const cudaq::sample_policy &policy,
 
 cudaq::async_sample_result
 cudaq::DefaultQPU::launchKernel(const async_sample_policy &policy,
-                                const cudaq::AnyModule &module,
+                                const cudaq::CompiledModule &module,
                                 cudaq::KernelArgs args) {
   throw std::runtime_error(
       "DefaultQPU does not support launching the async_sample_policy.");
@@ -58,7 +61,7 @@ cudaq::DefaultQPU::launchKernel(const async_sample_policy &policy,
 
 cudaq::observe_result
 cudaq::DefaultQPU::launchKernel(const cudaq::observe_policy &policy,
-                                const cudaq::AnyModule &module,
+                                const cudaq::CompiledModule &module,
                                 cudaq::KernelArgs args) {
   CUDAQ_INFO("DefaultQPU::launchKernel {}", policy.name);
   return cudaq::ExecutionManager::with_default_em(
@@ -67,8 +70,8 @@ cudaq::DefaultQPU::launchKernel(const cudaq::observe_policy &policy,
 }
 
 cudaq::async_observe_result
-cudaq::DefaultQPU::launchKernel(async_observe_policy &policy,
-                                const cudaq::AnyModule &module,
+cudaq::DefaultQPU::launchKernel(const async_observe_policy &policy,
+                                const cudaq::CompiledModule &module,
                                 cudaq::KernelArgs args) {
   throw std::runtime_error(
       "DefaultQPU does not support launching the async_observe_policy.");
@@ -76,24 +79,18 @@ cudaq::DefaultQPU::launchKernel(async_observe_policy &policy,
 
 std::unique_ptr<cudaq::CompileTarget>
 cudaq::DefaultQPU::getCompileTarget(const sample_policy &policy) {
-  // Currently this is only used for Python kernels, as C++ kernels skip JIT
-  // compilation and call the AOT-generated function directly.
-  return getDefaultPythonCompileTarget(policy);
+  return getDefaultCompileTarget(policy);
 }
 
 std::unique_ptr<cudaq::CompileTarget>
 cudaq::DefaultQPU::getCompileTarget(const observe_policy &policy) {
-  // Currently this is only used for Python kernels, as C++ kernels skip JIT
-  // compilation and call the AOT-generated function directly.
-  return getDefaultPythonCompileTarget(policy);
+  return getDefaultCompileTarget(policy);
 }
 
 std::unique_ptr<cudaq::CompileTarget>
 cudaq::DefaultQPU::getCompileTarget(const other_policies &policy,
                                     ExecutionContext *context) {
-  // Currently this is only used for Python kernels, as C++ kernels skip JIT
-  // compilation and call the AOT-generated function directly.
-  return getDefaultPythonCompileTarget(policy, context);
+  return getDefaultCompileTarget(policy, context);
 }
 
 void cudaq::DefaultQPU::configureExecutionContext(
diff --git a/runtime/cudaq/platform/default/DefaultQPU.h b/runtime/cudaq/platform/default/DefaultQPU.h
index 4290cbfb320..8dac5e23c1e 100644
--- a/runtime/cudaq/platform/default/DefaultQPU.h
+++ b/runtime/cudaq/platform/default/DefaultQPU.h
@@ -25,18 +25,19 @@ class DefaultQPU : public QPU {
                                             cudaq::KernelArgs args) override;
 
   sample_result launchKernel(const sample_policy &policy,
-                             const AnyModule &module, KernelArgs args) override;
+                             const CompiledModule &module,
+                             KernelArgs args) override;
 
   async_sample_result launchKernel(const async_sample_policy &policy,
-                                   const AnyModule &module,
+                                   const CompiledModule &module,
                                    KernelArgs args) override;
 
   observe_result launchKernel(const observe_policy &policy,
-                              const AnyModule &module,
+                              const CompiledModule &module,
                               KernelArgs args) override;
 
-  async_observe_result launchKernel(async_observe_policy &policy,
-                                    const AnyModule &module,
+  async_observe_result launchKernel(const async_observe_policy &policy,
+                                    const CompiledModule &module,
                                     KernelArgs args) override;
 
   std::unique_ptr<CompileTarget>
diff --git a/runtime/cudaq/platform/default/python/QPU.cpp b/runtime/cudaq/platform/default/python/QPU.cpp
index 32c8444771c..673f1666182 100644
--- a/runtime/cudaq/platform/default/python/QPU.cpp
+++ b/runtime/cudaq/platform/default/python/QPU.cpp
@@ -43,8 +43,8 @@ std::string cudaq::detail::lower_to_qir_llvm(const std::string &name,
                                              const std::string &format) {
   ScopedTraceWithContext(cudaq::TIMING_JIT, "getQIR", name);
 
-  auto target = getDefaultPythonCompileTarget(other_policies{},
-                                              cudaq::getExecutionContext());
+  auto target =
+      getDefaultCompileTarget(other_policies{}, cudaq::getExecutionContext());
   target->fullySpecialize = true;
   cudaq_internal::compiler::Compiler compiler(std::move(target));
 
@@ -87,8 +87,8 @@ std::string cudaq::detail::lower_to_openqasm(const std::string &name,
                                              OpaqueArguments &args) {
   ScopedTraceWithContext(cudaq::TIMING_JIT, "getASM", name);
 
-  auto target = getDefaultPythonCompileTarget(other_policies{},
-                                              cudaq::getExecutionContext());
+  auto target =
+      getDefaultCompileTarget(other_policies{}, cudaq::getExecutionContext());
   target->fullySpecialize = true;
   cudaq_internal::compiler::Compiler compiler(std::move(target));
 
diff --git a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
index c19cae79cea..7f14610359f 100644
--- a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
+++ b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
@@ -7,76 +7,95 @@
  ******************************************************************************/
 
 #include "RemoteRESTQPU.h"
+#include "common/CompiledModule.h"
+#include "cudaq_internal/compiler/Compiler.h"
 
 using namespace cudaq;
 cudaq::RemoteRESTQPU::~RemoteRESTQPU() = default;
 
 sample_result RemoteRESTQPU::launchKernel(const sample_policy &policy,
-                                          const AnyModule &module,
+                                          const CompiledModule &module,
                                           KernelArgs args) {
   CUDAQ_INFO("RemoteRESTQPU::launchKernel {}", policy.name);
-  auto [kernelName, codes] = compileKernelExecutions(policy, module, args);
-  return completeLaunchKernel(policy, kernelName, std::move(codes));
+
+  if (module.getMlirArtifacts().empty())
+    throw std::runtime_error("QPU does not support launching a "
+                             "CompiledModule without MLIR artifacts.");
+
+  cudaq_internal::compiler::Compiler compiler(getCompileTarget(policy));
+  auto codes = compiler.emitKernelExecutions(module);
+
+  if (compiler.hasWarnedNamedMeasurements())
+    policy.warnedNamedMeasurements = true;
+  return completeLaunchKernel(policy, module.getName(), std::move(codes));
 }
 
 async_sample_result
 RemoteRESTQPU::launchKernel(const async_sample_policy &policy,
-                            const AnyModule &module, KernelArgs args) {
+                            const CompiledModule &module, KernelArgs args) {
   CUDAQ_INFO("RemoteRESTQPU::launchKernel async {}", policy.inner.name);
-  auto [kernelName, codes] =
-      compileKernelExecutions(policy.inner, module, args);
-  return completeLaunchKernel(policy, kernelName, std::move(codes));
+
+  if (module.getMlirArtifacts().empty())
+    throw std::runtime_error("QPU does not support launching a "
+                             "CompiledModule without MLIR artifacts.");
+
+  cudaq_internal::compiler::Compiler compiler(getCompileTarget(policy.inner));
+  auto codes = compiler.emitKernelExecutions(module);
+
+  if (compiler.hasWarnedNamedMeasurements())
+    policy.inner.warnedNamedMeasurements = true;
+  return completeLaunchKernel(policy, module.getName(), std::move(codes));
 }
 
 observe_result RemoteRESTQPU::launchKernel(const observe_policy &policy,
-                                           const AnyModule &module,
+                                           const CompiledModule &module,
                                            KernelArgs args) {
   CUDAQ_INFO("RemoteRESTQPU::launchKernel {}", policy.name);
-  auto [kernelName, codes] = compileKernelExecutions(policy, module, args);
-  return completeLaunchKernel(policy, kernelName, std::move(codes));
+
+  if (module.getMlirArtifacts().empty())
+    throw std::runtime_error("QPU does not support launching a "
+                             "CompiledModule without MLIR artifacts.");
+
+  cudaq_internal::compiler::Compiler compiler(getCompileTarget(policy));
+  auto codes = compiler.emitKernelExecutions(module);
+  return completeLaunchKernel(policy, module.getName(), std::move(codes));
 }
 
-async_observe_result RemoteRESTQPU::launchKernel(async_observe_policy &policy,
-                                                 const AnyModule &module,
-                                                 KernelArgs args) {
+async_observe_result
+RemoteRESTQPU::launchKernel(const async_observe_policy &policy,
+                            const CompiledModule &module, KernelArgs args) {
   CUDAQ_INFO("RemoteRESTQPU::launchKernel async {}", policy.inner.name);
-  auto [kernelName, codes] =
-      compileKernelExecutions(policy.inner, module, args);
-  return completeLaunchKernel(policy, kernelName, std::move(codes));
+
+  if (module.getMlirArtifacts().empty())
+    throw std::runtime_error("QPU does not support launching a "
+                             "CompiledModule without MLIR artifacts.");
+
+  cudaq_internal::compiler::Compiler compiler(getCompileTarget(policy.inner));
+  auto codes = compiler.emitKernelExecutions(module);
+  return completeLaunchKernel(policy, module.getName(), std::move(codes));
 }
 
 KernelThunkResultType
 RemoteRESTQPU::unifiedLaunchModule(const AnyModule &module, KernelArgs args) {
-  Compiler compiler(getCompileTarget(other_policies{}, getExecutionContext()));
-
-  std::string kernelName;
-  std::optional<CompiledModule> compiled;
+  CompiledModule compiled;
+  auto target = getCompileTarget(other_policies{}, getExecutionContext());
+  cudaq_internal::compiler::Compiler compiler(std::move(target));
 
   if (std::holds_alternative<SourceModule>(module)) {
-    const auto &src = std::get<SourceModule>(module);
-    kernelName = src.getName();
-    CUDAQ_INFO("launching remote rest kernel ({})", kernelName);
-
-    auto [moduleOp, context] = Compiler::loadQuakeCodeByName(kernelName);
-
-    // Get the Quake code, lowered according to config file.
-    compiled = compiler.runPassPipeline(kernelName, moduleOp, args, true,
-                                        std::move(context));
+    auto source = std::get<SourceModule>(module);
+    CUDAQ_INFO("no compiled kernel found for {}, compiling now",
+               source.getName());
+    cudaq_internal::compiler::CompiledModuleHelper::loadMlirArtifacts(source);
+    compiled = compiler.runPassPipeline(
+        source.getName(), source.getMlirOpaqueModulePtr(), args, true);
   } else {
     compiled = std::get<CompiledModule>(module);
-    kernelName = compiled->getName();
-    CUDAQ_INFO("launching remote rest kernel via module ({})", kernelName);
   }
+  CUDAQ_INFO("launching remote rest kernel ({})", compiled.getName());
 
-  auto codes = compiler.emitKernelExecutions(*compiled);
-
-  // Propagate metadata from the compiled artifact to the execution context.
-  if (auto ctx = getExecutionContext()) {
-    ctx->hasConditionalsOnMeasureResults =
-        compiled->getMetadata().hasConditionalsOnMeasureResults;
-  }
+  auto codes = compiler.emitKernelExecutions(compiled);
 
-  completeLaunchKernel(kernelName, std::move(codes));
+  completeLaunchKernel(compiled.getName(), std::move(codes));
   return {};
 }
 
diff --git a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.h b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.h
index bb700299030..fc319893266 100644
--- a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.h
+++ b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "common/BaseRemoteRESTQPU.h"
+#include "common/CompiledModule.h"
 
 namespace cudaq {
 
@@ -29,18 +30,19 @@ class RemoteRESTQPU : public BaseRemoteRESTQPU {
                                             KernelArgs args) override;
 
   sample_result launchKernel(const sample_policy &policy,
-                             const AnyModule &module, KernelArgs args) override;
+                             const CompiledModule &module,
+                             KernelArgs args) override;
 
   async_sample_result launchKernel(const async_sample_policy &policy,
-                                   const AnyModule &module,
+                                   const CompiledModule &module,
                                    KernelArgs args) override;
 
   observe_result launchKernel(const observe_policy &policy,
-                              const AnyModule &module,
+                              const CompiledModule &module,
                               KernelArgs args) override;
 
-  async_observe_result launchKernel(async_observe_policy &policy,
-                                    const AnyModule &module,
+  async_observe_result launchKernel(const async_observe_policy &policy,
+                                    const CompiledModule &module,
                                     KernelArgs args) override;
 };
 
diff --git a/runtime/cudaq/platform/fermioniq/FermioniqQPU.cpp b/runtime/cudaq/platform/fermioniq/FermioniqQPU.cpp
index cde2e956e84..8d50ff25a75 100644
--- a/runtime/cudaq/platform/fermioniq/FermioniqQPU.cpp
+++ b/runtime/cudaq/platform/fermioniq/FermioniqQPU.cpp
@@ -7,10 +7,9 @@
  ******************************************************************************/
 
 #include "FermioniqQPU.h"
+#include "cudaq_internal/compiler/Compiler.h"
 #include "nlohmann/json.hpp"
 #include "cudaq/runtime/logger/cudaq_fmt.h"
-#include <memory>
-#include <optional>
 
 namespace {
 void attachFermioniqObservable(cudaq::KernelExecution &code,
@@ -34,59 +33,48 @@ void attachFermioniqObservable(cudaq::KernelExecution &code,
 
 cudaq::FermioniqQPU::~FermioniqQPU() = default;
 
-cudaq::sample_result
-cudaq::FermioniqQPU::launchKernel(const cudaq::sample_policy &policy,
-                                  const AnyModule &module, KernelArgs args) {
-  auto [kernelName, codes] = compileKernelExecutions(policy, module, args);
-  CUDAQ_INFO("FermioniqBaseQPU launching kernel ({}) with policy {}",
-             kernelName, policy.name);
-  if (codes.size() != 1)
-    throw std::runtime_error("Provider only allows 1 circuit at a time.");
-
-  return completeLaunchKernel(policy, kernelName, std::move(codes));
-}
-
-cudaq::async_sample_result
-cudaq::FermioniqQPU::launchKernel(const cudaq::async_sample_policy &policy,
-                                  const AnyModule &module, KernelArgs args) {
-  auto [kernelName, codes] =
-      compileKernelExecutions(policy.inner, module, args);
-  CUDAQ_INFO("FermioniqBaseQPU launching kernel ({}) with policy {}",
-             kernelName, policy.inner.name);
-  if (codes.size() != 1)
-    throw std::runtime_error("Provider only allows 1 circuit at a time.");
-
-  return completeLaunchKernel(policy, kernelName, std::move(codes));
-}
-
 cudaq::observe_result
 cudaq::FermioniqQPU::launchKernel(const cudaq::observe_policy &policy,
-                                  const AnyModule &module, KernelArgs args) {
-  auto [kernelName, codes] = compileKernelExecutions(policy, module, args);
+                                  const CompiledModule &module,
+                                  KernelArgs args) {
   CUDAQ_INFO("FermioniqBaseQPU launching kernel ({}) with policy {}",
-             kernelName, policy.name);
+             module.getName(), policy.name);
+
+  if (module.getMlirArtifacts().empty())
+    throw std::runtime_error("QPU does not support launching a "
+                             "CompiledModule without MLIR artifacts.");
+
+  cudaq_internal::compiler::Compiler compiler(getCompileTarget(policy));
+  auto codes = compiler.emitKernelExecutions(module);
   if (codes.size() != 1)
     throw std::runtime_error("Provider only allows 1 circuit at a time.");
 
   attachFermioniqObservable(codes[0], policy.spin);
-  auto result = completeLaunchKernel(policy, kernelName, std::move(codes));
+  auto result =
+      completeLaunchKernel(policy, module.getName(), std::move(codes));
   auto expectation = result.raw_data().expectation(GlobalRegisterName);
   return cudaq::observe_result(expectation, result.get_spin(),
                                result.raw_data());
 }
 
 cudaq::async_observe_result
-cudaq::FermioniqQPU::launchKernel(cudaq::async_observe_policy &policy,
-                                  const AnyModule &module, KernelArgs args) {
-  auto [kernelName, codes] =
-      compileKernelExecutions(policy.inner, module, args);
+cudaq::FermioniqQPU::launchKernel(const cudaq::async_observe_policy &policy,
+                                  const CompiledModule &module,
+                                  KernelArgs args) {
   CUDAQ_INFO("FermioniqBaseQPU launching kernel ({}) with policy {}",
-             kernelName, policy.inner.name);
+             module.getName(), policy.inner.name);
+
+  if (module.getMlirArtifacts().empty())
+    throw std::runtime_error("QPU does not support launching a "
+                             "CompiledModule without MLIR artifacts.");
+
+  cudaq_internal::compiler::Compiler compiler(getCompileTarget(policy.inner));
+  auto codes = compiler.emitKernelExecutions(module);
   if (codes.size() != 1)
     throw std::runtime_error("Provider only allows 1 circuit at a time.");
 
   attachFermioniqObservable(codes[0], policy.inner.spin);
-  return completeLaunchKernel(policy, kernelName, std::move(codes));
+  return completeLaunchKernel(policy, module.getName(), std::move(codes));
 }
 
 CUDAQ_REGISTER_TYPE(cudaq::QPU, cudaq::FermioniqQPU, fermioniq)
diff --git a/runtime/cudaq/platform/fermioniq/FermioniqQPU.h b/runtime/cudaq/platform/fermioniq/FermioniqQPU.h
index 70eab9ef111..0902f30f8bf 100644
--- a/runtime/cudaq/platform/fermioniq/FermioniqQPU.h
+++ b/runtime/cudaq/platform/fermioniq/FermioniqQPU.h
@@ -43,19 +43,13 @@ class FermioniqQPU : public BaseRemoteRESTQPU {
     return target;
   }
 
-  sample_result launchKernel(const sample_policy &policy,
-                             const AnyModule &module, KernelArgs args) override;
-
-  async_sample_result launchKernel(const async_sample_policy &policy,
-                                   const AnyModule &module,
-                                   KernelArgs args) override;
-
+  using BaseRemoteRESTQPU::launchKernel;
   observe_result launchKernel(const observe_policy &policy,
-                              const AnyModule &module,
+                              const CompiledModule &module,
                               KernelArgs args) override;
 
-  async_observe_result launchKernel(async_observe_policy &policy,
-                                    const AnyModule &module,
+  async_observe_result launchKernel(const async_observe_policy &policy,
+                                    const CompiledModule &module,
                                     KernelArgs args) override;
 };
 
diff --git a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.h b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.h
index 63dfa106ca8..d078db3f975 100644
--- a/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.h
+++ b/runtime/cudaq/platform/mqpu/custatevec/GPUEmulatedQPU.h
@@ -22,7 +22,7 @@ class GPUEmulatedQPU : public QPU {
 
   void enqueue(QuantumTask &task) override;
 
-  KernelThunkResultType unifiedLaunchModule(const cudaq::AnyModule &src,
+  KernelThunkResultType unifiedLaunchModule(const cudaq::AnyModule &module,
                                             cudaq::KernelArgs args) override;
 
   void configureExecutionContext(ExecutionContext &context) const override;
diff --git a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
index df3bc23de88..cf9368c631b 100644
--- a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
+++ b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
@@ -62,8 +62,9 @@ void cudaq::OrcaRemoteRESTQPU::setTargetBackend(const std::string &backend) {
   executor->setServerHelper(serverHelper.get());
 }
 
-KernelThunkResultType cudaq::OrcaRemoteRESTQPU::launchKernelCommon(
-    const std::string &kernelName, KernelThunkType kernelFunc, void *args) {
+KernelThunkResultType
+cudaq::OrcaRemoteRESTQPU::launchKernelCommon(const std::string &kernelName,
+                                             void *args) {
 
   CUDAQ_INFO("OrcaRemoteRESTQPU: Launch kernel named '{}' remote QPU {}",
              kernelName, qpu_id);
diff --git a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
index f92d89fa2e8..acca8013262 100644
--- a/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
+++ b/runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "OrcaExecutor.h"
+#include "common/CompiledModule.h"
 #include "cudaq/platform/qpu.h"
 #include "cudaq/utils/cudaq_utils.h"
 #include "cudaq/utils/owning_ptr.h"
@@ -91,8 +92,7 @@ class OrcaRemoteRESTQPU : public cudaq::QPU {
   void setTargetBackend(const std::string &backend) override;
 
   [[nodiscard]] KernelThunkResultType
-  launchKernelCommon(const std::string &kernelName, KernelThunkType kernelFunc,
-                     void *args);
+  launchKernelCommon(const std::string &kernelName, void *args);
 
   /// @brief Launch the kernel. Handle all pertinent modifications for the
   /// execution context.
@@ -103,11 +103,9 @@ class OrcaRemoteRESTQPU : public cudaq::QPU {
           "OrcaRemoteRESTQPU does not support pre-compiled module launch.");
 
     const auto &src = std::get<SourceModule>(module);
-    auto rawFn = src.getFunctionPtr();
-    KernelThunkType kernelFunc = rawFn ? rawFn->getFn() : nullptr;
     auto packed = args.getPacked();
     void *argData = packed ? packed->data.data() : nullptr;
-    return launchKernelCommon(src.getName(), kernelFunc, argData);
+    return launchKernelCommon(src.getName(), argData);
   }
 };
 } // namespace cudaq
diff --git a/runtime/cudaq/platform/qpu.cpp b/runtime/cudaq/platform/qpu.cpp
index c438ec30986..8f89e9b999d 100644
--- a/runtime/cudaq/platform/qpu.cpp
+++ b/runtime/cudaq/platform/qpu.cpp
@@ -26,39 +26,34 @@ using namespace cudaq;
 
 cudaq::KernelThunkResultType
 cudaq::QPU::unifiedLaunchModule(const AnyModule &module, KernelArgs args) {
-  if (std::holds_alternative<SourceModule>(module))
-    throw std::runtime_error(
-        "This QPU does not support launching uncompiled SourceModule kernels; "
-        "subclasses must override unifiedLaunchModule.");
-
-  const auto &compiled = std::get<CompiledModule>(module);
-  return runJITCompiledModule(compiled, args);
+  throw std::runtime_error(
+      "This QPU does not support launching the other_policies.");
 }
 
 sample_result cudaq::QPU::launchKernel(const sample_policy &policy,
-                                       const AnyModule &module,
+                                       const CompiledModule &module,
                                        KernelArgs args) {
   throw std::runtime_error(
       "This QPU does not support launching the sample_policy.");
 }
 
 async_sample_result cudaq::QPU::launchKernel(const async_sample_policy &policy,
-                                             const AnyModule &module,
+                                             const CompiledModule &module,
                                              KernelArgs args) {
   throw std::runtime_error(
       "This QPU does not support launching the async_sample_policy.");
 }
 
 observe_result cudaq::QPU::launchKernel(const observe_policy &policy,
-                                        const AnyModule &module,
+                                        const CompiledModule &module,
                                         KernelArgs args) {
   throw std::runtime_error(
       "This QPU does not support launching the observe_policy.");
 }
 
-async_observe_result cudaq::QPU::launchKernel(async_observe_policy &policy,
-                                              const AnyModule &module,
-                                              KernelArgs args) {
+async_observe_result
+cudaq::QPU::launchKernel(const async_observe_policy &policy,
+                         const CompiledModule &module, KernelArgs args) {
   throw std::runtime_error(
       "This QPU does not support launching the async_observe_policy.");
 }
diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h
index 79175272b1e..ff43d384147 100644
--- a/runtime/cudaq/platform/qpu.h
+++ b/runtime/cudaq/platform/qpu.h
@@ -139,17 +139,19 @@ class QPU : public registry::RegisteredType<QPU> {
                          const std::size_t shots) {}
 
   virtual sample_result launchKernel(const sample_policy &policy,
-                                     const AnyModule &module, KernelArgs args);
+                                     const CompiledModule &module,
+                                     KernelArgs args);
 
   virtual async_sample_result launchKernel(const async_sample_policy &policy,
-                                           const AnyModule &module,
+                                           const CompiledModule &module,
                                            KernelArgs args);
 
   virtual observe_result launchKernel(const observe_policy &policy,
-                                      const AnyModule &module, KernelArgs args);
+                                      const CompiledModule &module,
+                                      KernelArgs args);
 
-  virtual async_observe_result launchKernel(async_observe_policy &policy,
-                                            const AnyModule &module,
+  virtual async_observe_result launchKernel(const async_observe_policy &policy,
+                                            const CompiledModule &module,
                                             KernelArgs args);
 
   [[nodiscard]] virtual KernelThunkResultType
diff --git a/runtime/cudaq/platform/quantum_platform.cpp b/runtime/cudaq/platform/quantum_platform.cpp
index bd04812a2bc..b8a2286a9e6 100644
--- a/runtime/cudaq/platform/quantum_platform.cpp
+++ b/runtime/cudaq/platform/quantum_platform.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "cudaq/platform/quantum_platform.h"
+#include "algorithms/policies.h"
 #include "common/CompiledModule.h"
 #include "common/Environment.h"
 #include "common/ExecutionContext.h"
@@ -16,7 +17,6 @@
 #include "cudaq/algorithms/policy_dispatch.h"
 #include "cudaq/platform/qpu.h"
 #include "cudaq/runtime/logger/logger.h"
-#include "mlir/IR/BuiltinOps.h"
 #include <string>
 
 using namespace cudaq_internal::compiler;
@@ -122,17 +123,21 @@ getDefaultPythonCompileTargetImpl() {
 }
 
 std::unique_ptr<cudaq::CompileTarget>
-getDefaultPythonCompileTarget(const sample_policy &) {
-  return getDefaultPythonCompileTargetImpl();
+getDefaultCompileTarget(const sample_policy &) {
+  auto ct = getDefaultPythonCompileTargetImpl();
+  ct->overrideAOTCompilation = false;
+  return ct;
 }
 std::unique_ptr<cudaq::CompileTarget>
-getDefaultPythonCompileTarget(const observe_policy &) {
-  return getDefaultPythonCompileTargetImpl();
+getDefaultCompileTarget(const observe_policy &) {
+  auto ct = getDefaultPythonCompileTargetImpl();
+  ct->overrideAOTCompilation = false;
+  return ct;
 }
 std::unique_ptr<cudaq::CompileTarget>
-getDefaultPythonCompileTarget(const other_policies &,
-                              ExecutionContext *context) {
+getDefaultCompileTarget(const other_policies &, ExecutionContext *context) {
   auto ct = getDefaultPythonCompileTargetImpl();
+  ct->overrideAOTCompilation = false;
 
   if (context && context->name == "dem") {
     ct->emitJit = true;
@@ -274,8 +279,8 @@ quantum_platform::get_remote_capabilities(std::size_t qpu_id) const {
 }
 
 KernelThunkResultType
-quantum_platform::unifiedLaunchModule(const AnyModule &module, KernelArgs args,
-                                      std::size_t qpu_id) {
+quantum_platform::unifiedLaunchModule(const CompiledModule &module,
+                                      KernelArgs args, std::size_t qpu_id) {
   validateQpuId(qpu_id);
   auto &qpu = platformQPUs[qpu_id];
   return qpu->unifiedLaunchModule(module, args);
@@ -338,13 +343,17 @@ cudaq::altLaunchKernel(const char *kernelName,
   std::string kernName = kernelName;
   KernelArgs args{KernelArgs::PackedArgs{kernelArgs, argsSize, resultOffset}};
   SourceModule src{kernName, kernelFunc};
   auto ctx = cudaq::getExecutionContext();
   if (ctx && ctx->executeKernelApi) {
     ctx->executeKernelApi(src, args);
     return {};
   }
+
+  // TODO: we are bypassing the compiler to avoid adding a dependency on the
+  // compiler here. This delays compilation until inside the QPU.
+  CompiledModule compiled{src};
   std::size_t qpu_id = cudaq::getCurrentQpuId();
-  return platform.unifiedLaunchModule(src, args, qpu_id);
+  return platform.unifiedLaunchModule(compiled, args, qpu_id);
 }
 
 cudaq::KernelThunkResultType
@@ -360,9 +372,14 @@ cudaq::streamlinedLaunchKernel(const char *kernelName,
     ctx->executeKernelApi(src, args);
     return {};
   }
+
+  // TODO: we are bypassing the compiler to avoid adding a dependency on the
+  // compiler here. This delays compilation until inside the QPU.
+  CompiledModule compiled{src};
   auto &platform = *getQuantumPlatformInternal();
   std::size_t qpu_id = cudaq::getCurrentQpuId();
-  [[maybe_unused]] auto r = platform.unifiedLaunchModule(src, args, qpu_id);
+  [[maybe_unused]] auto r =
+      platform.unifiedLaunchModule(compiled, args, qpu_id);
   // NB: The streamlined launch will never return results. Use alt or hybrid if
   // the kernel returns results.
   return {};
@@ -379,6 +396,7 @@ cudaq::streamlinedLaunchModule(const CompiledModule &compiled,
     ctx->executeKernelApi(compiled, {rawArgs});
     return {};
   }
+
   auto &platform = *getQuantumPlatformInternal();
   std::size_t qpu_id = getCurrentQpuId();
   return platform.unifiedLaunchModule(compiled, {rawArgs}, qpu_id);
@@ -405,10 +423,15 @@ cudaq::hybridLaunchKernel(const char *kernelName, cudaq::KernelThunkType kernel,
     return {};
   }
 
+  // TODO: we are bypassing the compiler to avoid adding a dependency on the
+  // compiler here. This delays compilation until inside the QPU.
+  CompiledModule compiled{src};
+
   if (platform.is_remote(qpu_id)) {
     // This path should never call a kernel that returns results.
-    [[maybe_unused]] auto r = platform.unifiedLaunchModule(src, kargs, qpu_id);
+    [[maybe_unused]] auto r =
+        platform.unifiedLaunchModule(compiled, kargs, qpu_id);
     return {};
   }
-  return platform.unifiedLaunchModule(src, kargs, qpu_id);
+  return platform.unifiedLaunchModule(compiled, kargs, qpu_id);
 }
diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h
index 46811abccce..512d9c706da 100644
--- a/runtime/cudaq/platform/quantum_platform.h
+++ b/runtime/cudaq/platform/quantum_platform.h
@@ -204,7 +204,7 @@ class quantum_platform {
                  const std::size_t shots, std::size_t qpu_id = 0);
 
   [[nodiscard]] KernelThunkResultType
-  unifiedLaunchModule(const AnyModule &module, KernelArgs args,
+  unifiedLaunchModule(const CompiledModule &module, KernelArgs args,
                       std::size_t qpu_id = 0);
 
   template <typename Policy>
diff --git a/runtime/internal/compiler/CMakeLists.txt b/runtime/internal/compiler/CMakeLists.txt
index 2787ddbf279..bb7c47b5819 100644
--- a/runtime/internal/compiler/CMakeLists.txt
+++ b/runtime/internal/compiler/CMakeLists.txt
@@ -8,14 +8,6 @@
 
 include(HandleLLVMOptions)
 
-# Header-only target: public headers under include/cudaq_internal/compiler/.
-# Link this when you need declarations only (no cudaq-mlir-runtime .so).
-add_library(cudaq-mlir-runtime-headers INTERFACE)
-target_include_directories(cudaq-mlir-runtime-headers INTERFACE
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  $<INSTALL_INTERFACE:include>
-)
-
 # MLIR/LLVM runtime support shared with cudaq-builder and remote platforms.
 # Single library so LLVM static Options are initialized at most once.
 
@@ -38,9 +30,13 @@ set_source_files_properties(
 )
 set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS cudaq-mlir-runtime)
 
-target_link_libraries(cudaq-mlir-runtime
+target_include_directories(cudaq-mlir-runtime
   PUBLIC
-    cudaq-mlir-runtime-headers
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:include>
+)
+
+target_link_libraries(cudaq-mlir-runtime
   PRIVATE
     cudaq-common
     $<$<NOT:$<PLATFORM_ID:Darwin>>:cudaq-qir-verifier>
@@ -84,9 +80,6 @@ if(APPLE)
     "LINKER:-flat_namespace")
 endif()
 
-install(TARGETS cudaq-mlir-runtime-headers
-        EXPORT cudaq-mlir-runtime-targets)
-
 install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
         DESTINATION include)
 
diff --git a/runtime/internal/compiler/CompiledModuleHelper.cpp b/runtime/internal/compiler/CompiledModuleHelper.cpp
index 0218d50f843..0bd2609b9dc 100644
--- a/runtime/internal/compiler/CompiledModuleHelper.cpp
+++ b/runtime/internal/compiler/CompiledModuleHelper.cpp
@@ -7,11 +7,12 @@
  ******************************************************************************/
 
 #include "cudaq_internal/compiler/CompiledModuleHelper.h"
+#include "common/CompiledModule.h"
+#include "cudaq_internal/compiler/Compiler.h"
 #include "cudaq_internal/compiler/LayoutInfo.h"
 #include "cudaq/Optimizer/Builder/RuntimeNames.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/Operation.h"
 #include "mlir/IR/Types.h"
 
 using namespace mlir;
@@ -95,4 +96,15 @@ CompiledModule CompiledModuleHelper::createCompiledModule(
   return compiled;
 }
 
+void CompiledModuleHelper::loadMlirArtifacts(cudaq::SourceModule &module) {
+  if (!module.getMlirArtifacts().empty())
+    return;
+
+  auto [moduleOp, context] =
+      cudaq_internal::compiler::Compiler::loadQuakeCodeByName(module.getName());
+  cudaq::FatQuakeModule::MlirArtifact mlirArtifact(moduleOp,
+                                                   std::move(context));
+  module.addArtifact(module.getName(), std::move(mlirArtifact));
+}
+
 } // namespace cudaq_internal::compiler
diff --git a/runtime/internal/compiler/Compiler.cpp b/runtime/internal/compiler/Compiler.cpp
index 3d1430d6a2e..2681efdfcd1 100644
--- a/runtime/internal/compiler/Compiler.cpp
+++ b/runtime/internal/compiler/Compiler.cpp
@@ -25,6 +25,8 @@
 #include "cudaq/Optimizer/Transforms/AddMetadata.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
 #include "cudaq/Optimizer/Transforms/ResourceCount.h"
+#include "cudaq/algorithms/observe/policy.h"
+#include "cudaq/algorithms/sample/policy.h"
 #include "cudaq/runtime/logger/logger.h"
 #include "cudaq/utils/cudaq_utils.h"
 #include "llvm/ADT/SmallSet.h"
@@ -703,3 +705,24 @@ mlir::ModuleOp cudaq_internal::compiler::Compiler::lowerQuakeCodeBuildModule(
   }
   return moduleOp;
 }
+
+// Explicit template instantiations, required to link the calls to the compiler
+// from `cudaq::detail::launch` (see runtime/cudaq/algorithms/launch.h).
+template cudaq::CompiledModule
+cudaq_internal::compiler::compileModule<cudaq::sample_policy>(
+    const cudaq::sample_policy &, std::unique_ptr<cudaq::CompileTarget>,
+    const cudaq::SourceModule &, cudaq::KernelArgs, bool);
+template cudaq::CompiledModule
+cudaq_internal::compiler::compileModule<cudaq::observe_policy>(
+    const cudaq::observe_policy &, std::unique_ptr<cudaq::CompileTarget>,
+    const cudaq::SourceModule &, cudaq::KernelArgs, bool);
+template cudaq::CompiledModule cudaq_internal::compiler::compileModule<
+    cudaq::async_policy_wrapper<cudaq::sample_policy>>(
+    const cudaq::async_policy_wrapper<cudaq::sample_policy> &,
+    std::unique_ptr<cudaq::CompileTarget>, const cudaq::SourceModule &,
+    cudaq::KernelArgs, bool);
+template cudaq::CompiledModule cudaq_internal::compiler::compileModule<
+    cudaq::async_policy_wrapper<cudaq::observe_policy>>(
+    const cudaq::async_policy_wrapper<cudaq::observe_policy> &,
+    std::unique_ptr<cudaq::CompileTarget>, const cudaq::SourceModule &,
+    cudaq::KernelArgs, bool);
diff --git a/runtime/internal/compiler/include/cudaq_internal/compiler/CompiledModuleHelper.h b/runtime/internal/compiler/include/cudaq_internal/compiler/CompiledModuleHelper.h
index e04383d1214..d11fabdb87b 100644
--- a/runtime/internal/compiler/include/cudaq_internal/compiler/CompiledModuleHelper.h
+++ b/runtime/internal/compiler/include/cudaq_internal/compiler/CompiledModuleHelper.h
@@ -18,6 +18,8 @@ class ModuleOp;
 
 namespace cudaq_internal::compiler {
 
+class Compiler;
+
 /// Compiler-side helper for `cudaq::CompiledModule`: static factory methods and
 /// utilities that depend on MLIR but pair with the MLIR-free `CompiledModule`
 /// API in `common/CompiledModule.h`.
@@ -76,6 +78,9 @@ class CompiledModuleHelper {
       std::string name, cudaq::ResultInfo resultInfo,
       std::vector<NamedCompiledArtifact> compiledArtifacts,
       cudaq::CompiledModule::CompilationMetadata metadata = {});
+
+  /// Load the module's source MLIR using `Compiler::loadQuakeCodeByName`.
+  static void loadMlirArtifacts(cudaq::SourceModule &module);
 };
 
 } // namespace cudaq_internal::compiler
diff --git a/runtime/internal/compiler/include/cudaq_internal/compiler/Compiler.h b/runtime/internal/compiler/include/cudaq_internal/compiler/Compiler.h
index 3600a878f74..ebbf646877b 100644
--- a/runtime/internal/compiler/include/cudaq_internal/compiler/Compiler.h
+++ b/runtime/internal/compiler/include/cudaq_internal/compiler/Compiler.h
@@ -143,6 +143,11 @@ compileModule(const Policy &policy,
               std::unique_ptr<cudaq::CompileTarget> target,
               const cudaq::SourceModule &src, cudaq::KernelArgs args,
               bool isEntryPoint = true) {
+  if (!target->overrideAOTCompilation && src.getFunctionPtr()) {
+    // We are allowed to use the AOT-compiled module as-is, so nothing to do.
+    return cudaq::CompiledModule{src};
+  }
+
   const auto &kernelName = src.getName();
   auto modulePtr = src.getMlirOpaqueModulePtr();
   assert(modulePtr && "Compiler::compileModule requires an MLIR artifact");
diff --git a/targettests/TargetConfig/check_disable_mlir_links.cpp b/targettests/TargetConfig/check_disable_mlir_links.cpp
index ffffdb3db4d..066d75702ad 100644
--- a/targettests/TargetConfig/check_disable_mlir_links.cpp
+++ b/targettests/TargetConfig/check_disable_mlir_links.cpp
@@ -7,6 +7,9 @@
  ******************************************************************************/
 
 // RUN: nvq++ --disable-mlir-links %s -o %s.x && ! ldd %s.x | grep -q libcudaq-mlir-runtime.so 
+// RUN: nvq++ --disable-mlir-links %s -o %s.x && %s.x
+// We expect a failure when emulating a target that requires JIT compilation.
+// RUN: nvq++ --disable-mlir-links --target quantinuum --emulate %s -o %s.x && %s.x 2>&1 | FileCheck %s --check-prefix=FAIL
 
 #include "cudaq.h"
 
@@ -21,3 +24,5 @@ int main() {
   counts.dump();
   return 0;
 }
+
+// FAIL: QPU does not support launching a CompiledModule without MLIR artifacts
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index c9ca33e7d32..a9344cc1f49 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -23,6 +23,10 @@ SET(CMAKE_SHARED_LINKER_FLAGS "")
 # survives as an internal implementation detail.
 add_compile_definitions(CUDAQ_LIBRARY_MODE)
 
+# Disable JIT compilation as it is not needed (and we would otherwise have to
+# link in the compiler library for no reason)
+add_compile_definitions(CUDAQ_DISABLE_JIT_COMPILER)
+
 # ctest's PROCESSORS property tells the scheduler how many CPU slots each test
 # occupies. Without it, `ctest -j N` launches N OpenMP-parallel tests at once
 # (2N threads competing for N cores). This value is only applied to tests that