diff --git a/cudaq/include/cudaq/Target/CompileTarget.h b/cudaq/include/cudaq/Target/CompileTarget.h index ffd1371f394..be25cec470c 100644 --- a/cudaq/include/cudaq/Target/CompileTarget.h +++ b/cudaq/include/cudaq/Target/CompileTarget.h @@ -100,6 +100,10 @@ class CompileTarget { /// Whether to fully specialize the kernel. bool fullySpecialize = true; + /// Whether this target is a local simulator (not remote, not emulated). On + /// this path `i1` vector args are packed as bit-packed `std::vector`. + bool isLocalSimulator = false; + /// Set the `changeSemantics` flag for the argument synthesis pass. bool argumentSynthChangeSemantics = true; diff --git a/python/tests/builder/test_kernel_builder.py b/python/tests/builder/test_kernel_builder.py index 6c09ca9c101..650b85db7f3 100644 --- a/python/tests/builder/test_kernel_builder.py +++ b/python/tests/builder/test_kernel_builder.py @@ -13,6 +13,8 @@ import random import numpy as np import os +import subprocess +import sys from typing import List import cudaq @@ -1535,6 +1537,41 @@ def test_call_invalid_attribute_on_a_kernel(): assert "not supported on PyKernel" in str(e.value) +def test_repeated_builder_launch_no_segfault(): + """A ``list[bool]`` arg used to corrupt the heap during argument synthesis, + crashing a repeated ``make_kernel`` + ``sample`` loop at a random iteration. + + The crash only surfaces when the bit-packed ``std::vector`` padding is + nonzero, so run under ``MALLOC_PERTURB_`` (set before the process starts) in + a subprocess. See ``runtime/test/test_argument_conversion.cpp`` for the + unit-level regression. + """ + script = ( + "import cudaq\n" + "from typing import List\n" + "cudaq.set_target('qpp-cpu')\n" + "for _ in range(512):\n" + " kernel, *_ = cudaq.make_kernel(bool, list[bool], List[int], list[float])\n" + " kernel.qalloc(1)\n" + " cudaq.sample(kernel, False, [False], [3], [3.5])\n" + " cudaq.sample(kernel, False, [], [], [])\n" + "print('OK')\n") + env = dict(os.environ) + # Dirty fresh allocations so the std::vector padding is never zero, + # otherwise the corruption stays latent and the test passes with the bug. + env["MALLOC_PERTURB_"] = "165" + proc = subprocess.run([sys.executable, "-c", script], + env=env, + capture_output=True, + text=True, + timeout=900) + assert proc.returncode == 0, ("repeated make_kernel/sample crashed " + f"(returncode={proc.returncode}).\n" + f"stdout:\n{proc.stdout}\n" + f"stderr (tail):\n{proc.stderr[-3000:]}") + assert "OK" in proc.stdout + + # leave for gdb debugging if __name__ == "__main__": loc = os.path.abspath(__file__) diff --git a/runtime/cudaq/platform/default/python/QPU.cpp b/runtime/cudaq/platform/default/python/QPU.cpp index 4801d8e53bc..57ee82d560b 100644 --- a/runtime/cudaq/platform/default/python/QPU.cpp +++ b/runtime/cudaq/platform/default/python/QPU.cpp @@ -62,6 +62,7 @@ getCompileTarget(cudaq::ExecutionContext *context) { !(cudaq::is_remote_platform() || cudaq::is_emulated_platform()); ct->fullySpecialize = !isLocalSimulator; + ct->isLocalSimulator = isLocalSimulator; ct->supportDeviceCalls = true; ct->emitResourceCounts = context && context->name == "resource-count"; ct->argumentSynthChangeSemantics = false; diff --git a/runtime/internal/compiler/ArgumentConversion.cpp b/runtime/internal/compiler/ArgumentConversion.cpp index dcb0f09efab..32ec4e72864 100644 --- a/runtime/internal/compiler/ArgumentConversion.cpp +++ b/runtime/internal/compiler/ArgumentConversion.cpp @@ -97,15 +97,15 @@ static Value genConstant(OpBuilder &builder, const std::string &v, // Forward declare aggregate type builder as they can be recursive. static Value genRecursiveSpan(OpBuilder &, cudaq::cc::StdvecType, void *, - ModuleOp, llvm::DataLayout &); + ModuleOp, llvm::DataLayout &, bool); static Value genConstant(OpBuilder &, cudaq::cc::StdvecType, void *, ModuleOp, - llvm::DataLayout &); + llvm::DataLayout &, bool); static Value genConstant(OpBuilder &, cudaq::cc::StructType, void *, ModuleOp, - llvm::DataLayout &); + llvm::DataLayout &, bool); static Value genConstant(OpBuilder &, cudaq::cc::ArrayType, void *, ModuleOp, - llvm::DataLayout &); + llvm::DataLayout &, bool); static Value genConstant(OpBuilder &, cudaq::cc::CallableType, void *, ModuleOp, - llvm::DataLayout &); + llvm::DataLayout &, bool); /// Create callee.init_N that initializes the state /// @@ -524,7 +524,7 @@ static bool isSupportedRecursiveSpan(cudaq::cc::StdvecType ty) { // Recursive step processing of aggregates. Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod, - llvm::DataLayout &layout) { + llvm::DataLayout &layout, bool boolVecBitPacked = false) { auto *ctx = builder.getContext(); return TypeSwitch(ty) .Case([&](IntegerType intTy) -> Value { @@ -567,16 +567,16 @@ Value dispatchSubtype(OpBuilder &builder, Type ty, void *p, ModuleOp substMod, substMod); }) .Case([&](cudaq::cc::StdvecType ty) { - return genConstant(builder, ty, p, substMod, layout); + return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked); }) .Case([&](cudaq::cc::StructType ty) { - return genConstant(builder, ty, p, substMod, layout); + return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked); }) .Case([&](cudaq::cc::ArrayType ty) { - return genConstant(builder, ty, p, substMod, layout); + return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked); }) .Case([&](cudaq::cc::CallableType ty) { - return genConstant(builder, ty, p, substMod, layout); + return genConstant(builder, ty, p, substMod, layout, boolVecBitPacked); }) .Default({}); } @@ -597,21 +597,40 @@ static std::size_t getHostSideElementSize(Type eleTy, } /// Recursively builds an `ArrayAttr` containing the constants. +/// +/// Set \p boolVecBitPacked when an `i1` vector arg is a host +/// `std::vector` (bit-packed; not the `{begin, end, capacity}` triple). ArrayAttr genRecursiveConstantArray(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p, - llvm::DataLayout &layout) { + llvm::DataLayout &layout, + bool boolVecBitPacked = false) { + auto eleTy = vecTy.getElementType(); + + // Bit-packed `std::vector`: read via the container API, not a triple. + if (boolVecBitPacked && eleTy.isInteger(1)) { + auto *boolVec = reinterpret_cast *>(p); + if (boolVec->empty()) + return {}; + auto intTy = cast(eleTy); + SmallVector members; + members.reserve(boolVec->size()); + for (bool bit : *boolVec) + members.push_back(IntegerAttr::get(intTy, bit ? 1 : 0)); + return ArrayAttr::get(builder.getContext(), members); + } + typedef const char *VectorType[3]; VectorType *vecPtr = static_cast(p); auto delta = (*vecPtr)[1] - (*vecPtr)[0]; if (!delta) return {}; - auto eleTy = vecTy.getElementType(); unsigned stepBy = 0; std::function genAttr; if (auto innerTy = dyn_cast(eleTy)) { stepBy = sizeof(VectorType); genAttr = [&, innerTy](char *p) -> Attribute { - return genRecursiveConstantArray(builder, innerTy, p, layout); + return genRecursiveConstantArray(builder, innerTy, p, layout, + boolVecBitPacked); }; } else if (auto stringTy = dyn_cast(eleTy)) { stepBy = sizeof(std::string); @@ -688,8 +707,10 @@ static Type convertRecursiveSpanType(Type ty) { /// constant propagation through the recursive span structure. The reify /// operation will be lowered to more primitive ops on an as-needed basis. Value genRecursiveSpan(OpBuilder &builder, cudaq::cc::StdvecType ty, void *p, - ModuleOp substMod, llvm::DataLayout &layout) { - ArrayAttr constants = genRecursiveConstantArray(builder, ty, p, layout); + ModuleOp substMod, llvm::DataLayout &layout, + bool boolVecBitPacked = false) { + ArrayAttr constants = + genRecursiveConstantArray(builder, ty, p, layout, boolVecBitPacked); auto loc = builder.getUnknownLoc(); if (!constants) { // Empty vector. Not much to contemplate here. @@ -705,9 +726,11 @@ Value genRecursiveSpan(OpBuilder &builder, cudaq::cc::StdvecType ty, void *p, } Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p, - ModuleOp substMod, llvm::DataLayout &layout) { + ModuleOp substMod, llvm::DataLayout &layout, + bool boolVecBitPacked = false) { if (isSupportedRecursiveSpan(vecTy)) - return genRecursiveSpan(builder, vecTy, p, substMod, layout); + return genRecursiveSpan(builder, vecTy, p, substMod, layout, + boolVecBitPacked); typedef const char *VectorType[3]; VectorType *vecPtr = static_cast(p); auto delta = (*vecPtr)[1] - (*vecPtr)[0]; @@ -727,7 +750,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p, for (std::int32_t i = 0; i < vecSize; ++i) { if (Value val = dispatchSubtype( builder, eleTy, static_cast(const_cast(cursor)), - substMod, layout)) { + substMod, layout, boolVecBitPacked)) { auto atLoc = cudaq::cc::ComputePtrOp::create( builder, loc, elePtrTy, buffer, ArrayRef{i}); @@ -740,7 +763,8 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StdvecType vecTy, void *p, } Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p, - ModuleOp substMod, llvm::DataLayout &layout) { + ModuleOp substMod, llvm::DataLayout &layout, + bool boolVecBitPacked = false) { if (strTy.getMembers().empty()) return {}; const char *cursor = static_cast(p); @@ -752,7 +776,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p, builder, iter.value(), static_cast(const_cast( cursor + cudaq::opt::getDataOffset(layout, strTy, i))), - substMod, layout)) + substMod, layout, boolVecBitPacked)) aggie = cudaq::cc::InsertValueOp::create(builder, loc, strTy, aggie, v, i); } @@ -760,7 +784,8 @@ Value genConstant(OpBuilder &builder, cudaq::cc::StructType strTy, void *p, } Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p, - ModuleOp substMod, llvm::DataLayout &layout) { + ModuleOp substMod, llvm::DataLayout &layout, + bool boolVecBitPacked = false) { if (!p) return {}; auto loc = builder.getUnknownLoc(); @@ -787,7 +812,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p, if (hasLiftedArgs) { for (unsigned i = liftedPos, j = 0; i < liftedArity; ++i, ++j) { Value v = dispatchSubtype(builder, calleeInpTys[i], closureArgs[j], - substMod, layout); + substMod, layout, boolVecBitPacked); assert(v && "lifted argument must be handled"); args.push_back(v); } @@ -806,7 +831,8 @@ Value genConstant(OpBuilder &builder, cudaq::cc::CallableType callTy, void *p, } Value genConstant(OpBuilder &builder, cudaq::cc::ArrayType arrTy, void *p, - ModuleOp substMod, llvm::DataLayout &layout) { + ModuleOp substMod, llvm::DataLayout &layout, + bool boolVecBitPacked = false) { if (arrTy.isUnknownSize()) return {}; auto eleTy = arrTy.getElementType(); @@ -818,7 +844,7 @@ Value genConstant(OpBuilder &builder, cudaq::cc::ArrayType arrTy, void *p, for (std::size_t i = 0; i < arrSize; ++i) { if (Value v = dispatchSubtype( builder, eleTy, static_cast(const_cast(cursor)), - substMod, layout)) + substMod, layout, boolVecBitPacked)) aggie = cudaq::cc::InsertValueOp::create(builder, loc, arrTy, aggie, v, i); cursor += eleSize; @@ -854,8 +880,9 @@ Value genConstant(OpBuilder &builder, cudaq::cc::IndirectCallableType indCallTy, //===----------------------------------------------------------------------===// cudaq_internal::compiler::ArgumentConverter::ArgumentConverter( - StringRef kernelName, ModuleOp sourceModule) - : sourceModule(sourceModule), kernelName(kernelName) {} + StringRef kernelName, ModuleOp sourceModule, bool boolVecBitPacked) + : sourceModule(sourceModule), kernelName(kernelName), + boolVecBitPacked(boolVecBitPacked) {} void cudaq_internal::compiler::ArgumentConverter::gen( std::span arguments) { @@ -954,19 +981,23 @@ void cudaq_internal::compiler::ArgumentConverter::gen( return {}; }) .Case([&](cudaq::cc::StdvecType ty) { - return buildSubst(ty, argPtr, substModule, dataLayout); + return buildSubst(ty, argPtr, substModule, dataLayout, + boolVecBitPacked); }) .Case([&](cudaq::cc::StructType ty) { - return buildSubst(ty, argPtr, substModule, dataLayout); + return buildSubst(ty, argPtr, substModule, dataLayout, + boolVecBitPacked); }) .Case([&](cudaq::cc::ArrayType ty) { - return buildSubst(ty, argPtr, substModule, dataLayout); + return buildSubst(ty, argPtr, substModule, dataLayout, + boolVecBitPacked); }) .Case([&](cudaq::cc::IndirectCallableType ty) { return buildSubst(ty, argPtr, substModule, dataLayout); }) .Case([&](cudaq::cc::CallableType ty) { - return buildSubst(ty, argPtr, substModule, dataLayout); + return buildSubst(ty, argPtr, substModule, dataLayout, + boolVecBitPacked); }) .Default({}); if (subst) diff --git a/runtime/internal/compiler/Compiler.cpp b/runtime/internal/compiler/Compiler.cpp index ab6e6c16986..aaf01878c91 100644 --- a/runtime/internal/compiler/Compiler.cpp +++ b/runtime/internal/compiler/Compiler.cpp @@ -224,7 +224,11 @@ cudaq_internal::compiler::Compiler::prepareModule(const std::string &kernelName, // For quantum devices, we generate a collection of `init` and // `num_qubits` functions and their substitutions created // from a kernel and arguments that generated a state argument. - cudaq_internal::compiler::ArgumentConverter argCon(kernelName, moduleOp); + // Local simulators marshal `i1` vectors as bit-packed `std::vector` + // (argsCreator); remote/emulated targets use `std::vector`. + const bool boolVecBitPacked = target->isLocalSimulator; + cudaq_internal::compiler::ArgumentConverter argCon(kernelName, moduleOp, + boolVecBitPacked); // Must stay in scope as `eraseNonCallableArguments` may populate it std::vector closureArgs; if (cudaq::opt::marshal::isFullySynthesized(epFunc)) { diff --git a/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h b/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h index 8abf11f7096..d83a2c673db 100644 --- a/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h +++ b/runtime/internal/compiler/include/cudaq_internal/compiler/ArgumentConversion.h @@ -51,7 +51,11 @@ class ArgumentConverter { public: /// Build an instance to create argument substitutions for a specified \p /// kernelName in \p sourceModule. - ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule); + /// + /// Set \p boolVecBitPacked when `i1` vector args are host `std::vector` + /// (local-simulator launch path), not `std::vector`. + ArgumentConverter(mlir::StringRef kernelName, mlir::ModuleOp sourceModule, + bool boolVecBitPacked = false); ~ArgumentConverter() { for (auto *kInfo : kernelSubstitutions) { @@ -110,6 +114,10 @@ class ArgumentConverter { /// Kernel we are substituting the arguments for. mlir::StringRef kernelName; + + /// Whether `i1` vector args are bit-packed `std::vector` (vs + /// `std::vector`). See the constructor. + bool boolVecBitPacked; }; /// Merge modules from any CallableClosureArgument arguments into \p intoModule. diff --git a/runtime/test/test_argument_conversion.cpp b/runtime/test/test_argument_conversion.cpp index 82f3e3f814a..863a0d710bc 100644 --- a/runtime/test/test_argument_conversion.cpp +++ b/runtime/test/test_argument_conversion.cpp @@ -152,7 +152,8 @@ void dumpSubstitutionModules(ArgumentConverter &con) { void doSimpleTest(mlir::MLIRContext *ctx, const std::string &typeName, std::vector args, - const std::string &additionalCode = "") { + const std::string &additionalCode = "", + bool boolVecBitPacked = false) { std::string code = additionalCode + R"#( func.func private @callee(%0: )#" + typeName + R"#() @@ -166,7 +167,7 @@ func.func @__nvqpp__mlirgen__testy(%0: )#" + // Create the Module auto mod = mlir::parseSourceString(code, ctx); llvm::outs() << "Source module:\n" << *mod << '\n'; - ArgumentConverter ab{"testy", *mod}; + ArgumentConverter ab{"testy", *mod, boolVecBitPacked}; // Create the argument conversions ab.gen(args); // Dump all conversions @@ -400,6 +401,22 @@ void test_vectors(mlir::MLIRContext *ctx) { // CHECK: } // clang-format on + { + // Real bit-packed `std::vector`, as the local-simulator launch path + // passes it. Reading this as a `{begin, end, capacity}` triple corrupts the + // heap; `boolVecBitPacked` selects the correct reader. + std::vector x = {true, false, true, true}; + std::vector v = {static_cast(&x)}; + doSimpleTest(ctx, "!cc.stdvec", v, /*additionalCode=*/"", + /*boolVecBitPacked=*/true); + } + // clang-format off +// CHECK-LABEL: cc.arg_subst[0] { +// CHECK: %[[VAL_0:.*]] = cc.const_array [true, false, true, true] : !cc.array +// CHECK: %[[VAL_1:.*]] = cc.reify_span %[[VAL_0]] : (!cc.array) -> !cc.stdvec + // CHECK: } + // clang-format on + { std::vector> x = { {cudaq::pauli_word{"XX"}, cudaq::pauli_word{"XY"}},