diff --git a/docs/sphinx/examples/cpp/other/builder/builder.cpp b/docs/sphinx/examples/cpp/other/builder/builder.cpp
index 8c9a109864..79afbc0bbd 100644
--- a/docs/sphinx/examples/cpp/other/builder/builder.cpp
+++ b/docs/sphinx/examples/cpp/other/builder/builder.cpp
@@ -17,6 +17,14 @@
 #include
 #include
 
+static bool results_are_close(const cudaq::sample_result &f1,
+                              const cudaq::sample_result &f2) {
+  // Stub for a fancy comparison.
+  f1.dump();
+  f2.dump();
+  return true;
+}
+
 // This example demonstrates various uses for the `cudaq::builder`
 // type. This type enables one to dynamically construct callable
 // CUDA Quantum kernels via just-in-time compilation. The typical workflow
@@ -114,6 +122,37 @@ int main() {
     counts.dump();
   }
 
+  {
+    // In a simulated environment, it is sometimes useful to be able to
+    // specify an initial state vector. The length of the initial state
+    // vector must be 2 to the power `n`, where `n` is the number of qubits.
+
+    // In this example, we create a kernel builder `sim_builder` that captures
+    // the variable `init_state` by reference.
+    auto sim_builder = cudaq::make_kernel();
+    std::vector<std::complex<double>> init_state;
+    auto q = sim_builder.qalloc(init_state);
+    // Build the quantum circuit template here.
+    sim_builder.mz(q);
+
+    // Now we are ready to instantiate the kernel and invoke it. So we can set
+    // the `init_state` to a vector with 2 complex values (1 qubit) and
+    // get the results.
+    init_state = {{0.0, 1.0}, {1.0, 0.0}};
+    auto counts0 = cudaq::sample(sim_builder);
+
+    // Now suppose we have a different initial state with 4 complex values (2
+    // qubits). Let's rerun the kernel with the new `init_state`.
+    init_state = {{1.0, 0.0}, {0.0, 1.0}, {0.0, 1.0}, {1.0, 0.0}};
+    auto counts1 = cudaq::sample(sim_builder);
+
+    // Finally, in this wholly contrived example, we test the results to make
+    // sure they are "close".
+    if (results_are_close(counts0, counts1)) {
+      printf("The two initial states generated results that are \"close\".\n");
+    }
+  }
+
   {
     // Let's do a final sampling task. Let's
diff --git a/include/cudaq/Frontend/nvqpp/ASTBridge.h b/include/cudaq/Frontend/nvqpp/ASTBridge.h
index a65c3fed95..51a3d8e2e8 100644
--- a/include/cudaq/Frontend/nvqpp/ASTBridge.h
+++ b/include/cudaq/Frontend/nvqpp/ASTBridge.h
@@ -249,13 +249,8 @@ class QuakeBridgeVisitor
   bool TraverseConditionalOperator(clang::ConditionalOperator *x,
                                    DataRecursionQueue *q = nullptr);
   bool VisitReturnStmt(clang::ReturnStmt *x);
-  bool VisitCXXFunctionalCastExpr(clang::CXXFunctionalCastExpr *x) {
-    return true;
-  }
   bool TraverseInitListExpr(clang::InitListExpr *x,
                             DataRecursionQueue *q = nullptr);
-  bool TraverseCXXTemporaryObjectExpr(clang::CXXTemporaryObjectExpr *x,
-                                      DataRecursionQueue *q = nullptr);
 
   // These misc. statements are not (yet) handled by lowering.
   bool TraverseAsmStmt(clang::AsmStmt *x, DataRecursionQueue *q = nullptr);
@@ -287,16 +282,58 @@ class QuakeBridgeVisitor
   bool TraverseCXXConstructExpr(clang::CXXConstructExpr *x,
                                 DataRecursionQueue *q = nullptr);
   bool VisitCXXConstructExpr(clang::CXXConstructExpr *x);
-  bool VisitCXXTemporaryObjectExpr(clang::CXXTemporaryObjectExpr *x);
   bool VisitCXXOperatorCallExpr(clang::CXXOperatorCallExpr *x);
   bool WalkUpFromCXXOperatorCallExpr(clang::CXXOperatorCallExpr *x);
   bool TraverseDeclRefExpr(clang::DeclRefExpr *x,
                            DataRecursionQueue *q = nullptr);
   bool VisitDeclRefExpr(clang::DeclRefExpr *x);
   bool VisitFloatingLiteral(clang::FloatingLiteral *x);
+
+  // Cast operations.
+  bool TraverseCastExpr(clang::CastExpr *x, DataRecursionQueue *q = nullptr);
+  bool VisitCastExpr(clang::CastExpr *x);
+
   bool TraverseImplicitCastExpr(clang::ImplicitCastExpr *x,
-                                DataRecursionQueue *q = nullptr);
-  bool VisitImplicitCastExpr(clang::ImplicitCastExpr *x);
+                                DataRecursionQueue *q = nullptr) {
+    return TraverseCastExpr(x, q);
+  }
+  bool TraverseExplicitCastExpr(clang::ExplicitCastExpr *x,
+                                DataRecursionQueue *q = nullptr) {
+    return TraverseCastExpr(x, q);
+  }
+  bool TraverseCStyleCastExpr(clang::CStyleCastExpr *x,
+                              DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXFunctionalCastExpr(clang::CXXFunctionalCastExpr *x,
+                                     DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXAddrspaceCastExpr(clang::CXXAddrspaceCastExpr *x,
+                                    DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXConstCastExpr(clang::CXXConstCastExpr *x,
+                                DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXDynamicCastExpr(clang::CXXDynamicCastExpr *x,
+                                  DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXReinterpretCastExpr(clang::CXXReinterpretCastExpr *x,
+                                      DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXStaticCastExpr(clang::CXXStaticCastExpr *x,
+                                 DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseBuiltinBitCastExpr(clang::BuiltinBitCastExpr *x,
+                                  DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+
   bool VisitInitListExpr(clang::InitListExpr *x);
   bool VisitIntegerLiteral(clang::IntegerLiteral *x);
   bool VisitCXXBoolLiteralExpr(clang::CXXBoolLiteralExpr *x);
@@ -579,7 +616,6 @@ class QuakeBridgeVisitor
   llvm::DenseMap records;
 
   // State Flags
-  bool skipCompoundScope : 1 = false;
   bool isEntry : 1 = false;
 
   /// If there is a catastrophic error in the bridge (there is no rational way
@@ -589,6 +625,7 @@ class QuakeBridgeVisitor
   bool visitImplicitCode : 1 = false;
   bool inRecType : 1 = false;
   bool allowUnknownRecordType : 1 = false;
+  bool initializerIsGlobal : 1 = false;
 };
 
 } // namespace details
diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h
index 9da117f169..8e07d9418f 100644
--- a/include/cudaq/Optimizer/Builder/Factory.h
+++ b/include/cudaq/Optimizer/Builder/Factory.h
@@ -159,6 +159,12 @@ inline mlir::Value createF64Constant(mlir::Location loc,
   return createFloatConstant(loc, builder, value, builder.getF64Type());
 }
 
+/// Return the integer value if \p v is an integer constant.
+std::optional<std::int64_t> maybeValueOfIntConstant(mlir::Value v);
+
+/// Return the floating point value if \p v is a floating-point constant.
+std::optional<double> maybeValueOfFloatConstant(mlir::Value v);
+
 //===----------------------------------------------------------------------===//
 
 inline mlir::Block *addEntryBlock(mlir::LLVM::GlobalOp initVar) {
diff --git a/include/cudaq/Optimizer/CodeGen/CMakeLists.txt b/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
index 123158afea..5c2c15f8d7 100644
--- a/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
+++ b/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
@@ -6,6 +6,9 @@
 # the terms of the Apache License 2.0 which accompanies this distribution.  #
 # ============================================================================ #
 
+add_cudaq_dialect(CodeGen codegen)
+add_cudaq_dialect_doc(CodeGenDialect codegen)
+
 set(LLVM_TARGET_DEFINITIONS Passes.td)
 mlir_tablegen(Passes.h.inc -gen-pass-decls -name OptCodeGen)
 add_public_tablegen_target(OptCodeGenPassIncGen)
@@ -13,4 +16,3 @@ add_public_tablegen_target(OptCodeGenPassIncGen)
 set(LLVM_TARGET_DEFINITIONS Peephole.td)
 mlir_tablegen(Peephole.inc -gen-rewriters)
 add_public_tablegen_target(OptPeepholeIncGen)
-
diff --git a/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td b/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td
new file mode 100644
index 0000000000..6a0964fc9d
--- /dev/null
+++ b/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td
@@ -0,0 +1,34 @@
+/********************************************************** -*- tablegen -*- ***
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#ifndef CUDAQ_OPTIMIZER_CODEGEN_DIALECT
+#define CUDAQ_OPTIMIZER_CODEGEN_DIALECT
+
+include "mlir/Interfaces/SideEffectInterfaces.td"
+
+//===----------------------------------------------------------------------===//
+// Dialect definition.
+//===----------------------------------------------------------------------===//
+
+def CodeGenDialect : Dialect {
+  let name = "codegen";
+  let summary = "Code generation helpers";
+  let description = [{
+    Do not use this dialect outside of code generation.
+  }];
+
+  let cppNamespace = "cudaq::codegen";
+  let useDefaultTypePrinterParser = 1;
+  let useFoldAPI = kEmitFoldAdaptorFolder;
+
+  let extraClassDeclaration = [{
+    void registerTypes(); // register at least a bogo type.
+  }];
+}
+
+#endif
diff --git a/include/cudaq/Optimizer/CodeGen/CodeGenOps.td b/include/cudaq/Optimizer/CodeGen/CodeGenOps.td
new file mode 100644
index 0000000000..29f0ccbefc
--- /dev/null
+++ b/include/cudaq/Optimizer/CodeGen/CodeGenOps.td
@@ -0,0 +1,47 @@
+/********************************************************** -*- tablegen -*- ***
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#ifndef CUDAQ_OPTIMIZER_CODEGEN_OPS
+#define CUDAQ_OPTIMIZER_CODEGEN_OPS
+
+include "cudaq/Optimizer/CodeGen/CodeGenDialect.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "cudaq/Optimizer/Dialect/Common/Traits.td"
+include "cudaq/Optimizer/Dialect/CC/CCTypes.td"
+include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.td"
+
+//===----------------------------------------------------------------------===//
+// The codegen quake dialect is a transitory set of operations used exclusively
+// during codegen. The ops defined here make the process of converting the
+// Quake code to another target dialect (e.g. LLVM-IR) easier. They should not
+// be used outside of the codegen passes.
+//===----------------------------------------------------------------------===//
+
+class CGQOp<string mnemonic, list<Trait> traits = []>
+    : Op<CodeGenDialect, mnemonic, traits>;
+
+def cgq_RAIIOp : CGQOp<"qmem_raii", [MemoryEffects<[MemAlloc, MemWrite]>]> {
+  let summary = "Combine allocation and initialization of a set of qubits.";
+  let description = [{
+    Used only in QIR code generation.
+  }];
+
+  let arguments = (ins
+    cc_PointerType:$initState,
+    TypeAttr:$allocType,
+    Optional<AnySignlessInteger>:$allocSize
+  );
+  let results = (outs VeqType);
+
+  let assemblyFormat = [{
+    $initState `(` $allocType ( `[` $allocSize^ `]` )? `)` `:`
+    functional-type(operands, results) attr-dict
+  }];
+}
+
+#endif
diff --git a/include/cudaq/Optimizer/CodeGen/CodeGenTypes.td b/include/cudaq/Optimizer/CodeGen/CodeGenTypes.td
new file mode 100644
index 0000000000..e268494f34
--- /dev/null
+++ b/include/cudaq/Optimizer/CodeGen/CodeGenTypes.td
@@ -0,0 +1,34 @@
+/********************************************************** -*- tablegen -*- ***
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#ifndef CUDAQ_OPTIMIZER_CODEGEN_TYPES
+#define CUDAQ_OPTIMIZER_CODEGEN_TYPES
+
+include "cudaq/Optimizer/CodeGen/CodeGenDialect.td"
+include "mlir/Interfaces/DataLayoutInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
+
+//===----------------------------------------------------------------------===//
+// BaseType
+//===----------------------------------------------------------------------===//
+
+class CodeGenType<string name, string typeMnemonic, list<Trait> traits = [],
+                  string base = "mlir::Type">
+    : TypeDef<CodeGenDialect, name, traits, base> {
+  let mnemonic = typeMnemonic;
+}
+
+// There are no codegen dialect types, but the dialect tablegen generates type
+// boilerplate for the dialect anyway.
+
+def codegen_DoNotUseType : CodeGenType<"DoNotUse", "dummy"> {
+  let summary = "";
+  let description = [{ }];
+}
+
+#endif // CUDAQ_OPTIMIZER_CODEGEN_TYPES
diff --git a/include/cudaq/Optimizer/CodeGen/Passes.h b/include/cudaq/Optimizer/CodeGen/Passes.h
index 7496965ab2..61c297175c 100644
--- a/include/cudaq/Optimizer/CodeGen/Passes.h
+++ b/include/cudaq/Optimizer/CodeGen/Passes.h
@@ -52,6 +52,9 @@ std::unique_ptr createRemoveMeasurementsPass();
 /// Register target pipelines.
 void registerTargetPipelines();
 
+/// Register CodeGenDialect with the provided DialectRegistry.
+void registerCodeGenDialect(mlir::DialectRegistry &registry);
+
 // declarative passes
 #define GEN_PASS_DECL
 #define GEN_PASS_REGISTRATION
diff --git a/include/cudaq/Optimizer/CodeGen/Passes.td b/include/cudaq/Optimizer/CodeGen/Passes.td
index 22930e542b..fa450745b1 100644
--- a/include/cudaq/Optimizer/CodeGen/Passes.td
+++ b/include/cudaq/Optimizer/CodeGen/Passes.td
@@ -19,7 +19,9 @@ def QuakeToQIR : Pass<"quake-to-qir", "mlir::ModuleOp"> {
     QIR qubits.
   }];
 
-  let dependentDialects = ["mlir::LLVM::LLVMDialect"];
+  let dependentDialects = [
+    "cudaq::codegen::CodeGenDialect", "mlir::LLVM::LLVMDialect"
+  ];
   let constructor = "cudaq::opt::createConvertToQIRPass()";
 }
 
diff --git a/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h b/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
index 43b6cf35da..52595c14b2 100644
--- a/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
+++ b/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
@@ -43,6 +43,10 @@ constexpr static const char QIRArrayGetElementPtr1d[] =
     "__quantum__rt__array_get_element_ptr_1d";
 constexpr static const char QIRArrayQubitAllocateArray[] =
     "__quantum__rt__qubit_allocate_array";
+constexpr static const char QIRArrayQubitAllocateArrayWithStateFP64[] =
+    "__quantum__rt__qubit_allocate_array_with_state_fp64";
+constexpr static const char QIRArrayQubitAllocateArrayWithStateFP32[] =
+    "__quantum__rt__qubit_allocate_array_with_state_fp32";
 constexpr static const char QIRQubitAllocate[] =
     "__quantum__rt__qubit_allocate";
 constexpr static const char QIRArrayQubitReleaseArray[] =
diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
index 6d77169329..dec070384c 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
@@ -794,6 +794,35 @@ def cc_AddressOfOp : CCOp<"address_of", [Pure,
   }];
 }
 
+def cc_GlobalOp : CCOp<"global", [IsolatedFromAbove, Symbol]> {
+  let summary = "Create a global constant or variable";
+  let description = [{
+    A GlobalOp is used to create a global variable or constant that can be
+    referenced by symbol using the AddressOfOp. The type of this op is always
+    implicitly a cc::PointerType.
+
+    For example, this op may be used to define arrays of doubles, which may in
+    turn be used as initial state vectors for quantum memory (VeqType).
+  }];
+
+  let arguments = (ins
+    TypeAttr:$global_type,
+    StrAttr:$sym_name,
+    OptionalAttr<AnyAttr>:$value,
+    UnitAttr:$constant,
+    UnitAttr:$external
+  );
+
+  let hasCustomAssemblyFormat = 1;
+
+  let extraClassDeclaration = [{
+    cudaq::cc::PointerType getType() {
+      auto globalTy = getGlobalType();
+      return cudaq::cc::PointerType::get(globalTy);
+    }
+  }];
+}
+
 def cc_ExtractValueOp : CCOp<"extract_value", [Pure]> {
   let summary = "Extract a value from an aggregate value.";
   let description = [{
diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index b1e8ea39eb..802be8d2e4 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -71,8 +71,12 @@ def quake_AllocaOp : QuakeOp<"alloca", [MemoryEffects<[MemAlloc, MemWrite]>]> {
     See DeallocOp.
   }];
 
-  let arguments = (ins Optional<AnySignlessInteger>:$size);
-  let results = (outs AnyRefType:$ref_or_vec);
+  let arguments = (ins
+    Optional<AnySignlessInteger>:$size
+  );
+  let results = (outs
+    AnyRefType:$ref_or_vec
+  );
 
   let builders = [
     OpBuilder<(ins ), [{
@@ -92,6 +96,16 @@ def quake_AllocaOp : QuakeOp<"alloca", [MemoryEffects<[MemAlloc, MemWrite]>]> {
 
   let hasCanonicalizer = 1;
   let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    bool hasInitializedState() {
+      auto *self = getOperation();
+      return self->hasOneUse() &&
+             mlir::isa<quake::InitializeStateOp>(*self->getUsers().begin());
+    }
+
+    quake::InitializeStateOp getInitializedState();
+  }];
 }
 
 def quake_ConcatOp : QuakeOp<"concat", [Pure]> {
@@ -281,6 +295,31 @@ def quake_ExtractRefOp : QuakeOp<"extract_ref", [Pure]> {
   }];
 }
 
+def quake_InitializeStateOp : QuakeOp<"init_state", [MemoryEffects<[MemAlloc,
+    MemWrite]>]> {
+  let summary = "Initialize the quantum state to a specific complex vector.";
+  let description = [{
+    Given a !cc.ptr pointing to a complex data array of size 2**N, where N is
+    the number of qubits in the targets operand, initialize the state of those
+    target qubits to the provided state vector. This operation returns a new
+    quake.veq instance. There should be no other uses of the input veq value,
+    \em{targets}, that was allocated. This supports RAII (resource acquisition
+    is initialization) semantics on the qubits in the vector.
+  }];
+
+  let arguments = (ins
+    VeqType:$targets,
+    cc_PointerType:$state
+  );
+  let results = (outs VeqType);
+
+  let assemblyFormat = [{
+    $targets `,` $state `:` functional-type(operands, results) attr-dict
+  }];
+
+  let hasVerifier = 1;
+}
+
 def quake_RelaxSizeOp : QuakeOp<"relax_size", [Pure]> {
   let summary = "Relax the constant size on a !veq to be unknown.";
   let description = [{
diff --git a/lib/Frontend/nvqpp/ConvertDecl.cpp b/lib/Frontend/nvqpp/ConvertDecl.cpp
index d099ea1ecc..c456e42ba2 100644
--- a/lib/Frontend/nvqpp/ConvertDecl.cpp
+++ b/lib/Frontend/nvqpp/ConvertDecl.cpp
@@ -178,9 +178,9 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) {
   if (name.equals("basic_string"))
     return pushType(cc::CharspanType::get(ctx));
   if (name.equals("vector")) {
-    auto *cts = cast<clang::ClassTemplateSpecializationDecl>(x);
+    auto *cts = dyn_cast<clang::ClassTemplateSpecializationDecl>(x);
     // Traverse template argument 0 to get the vector's element type.
-    if (!TraverseType(cts->getTemplateArgs()[0].getAsType()))
+    if (!cts || !TraverseType(cts->getTemplateArgs()[0].getAsType()))
       return false;
     return pushType(cc::StdvecType::get(ctx, popType()));
   }
@@ -191,6 +191,22 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) {
   }
   if (name.equals("_Bit_type"))
     return pushType(builder.getI64Type());
+  if (name.equals("complex")) {
+    auto *cts = dyn_cast<clang::ClassTemplateSpecializationDecl>(x);
+    // Traverse template argument 0 to get the complex's element type.
+    if (!cts || !TraverseType(cts->getTemplateArgs()[0].getAsType()))
+      return false;
+    auto memTy = popType();
+    return pushType(ComplexType::get(memTy));
+  }
+  if (name.equals("initializer_list")) {
+    auto *cts = dyn_cast<clang::ClassTemplateSpecializationDecl>(x);
+    // Traverse template argument 0, the initializer list's element type.
+    if (!cts || !TraverseType(cts->getTemplateArgs()[0].getAsType()))
+      return false;
+    auto memTy = popType();
+    return pushType(cc::ArrayType::get(memTy));
+  }
   if (name.equals("function")) {
     auto *cts = cast<clang::ClassTemplateSpecializationDecl>(x);
     // Traverse template argument 0 to get the function's signature.
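
For orientation, the qubit-count rule used by the qvector lowering in ConvertExpr.cpp below can be stated directly: a state vector of length 2**n initializes n qubits, and any other length is rejected. A hedged, self-contained C++20 sketch of that arithmetic (the helper name is hypothetical; only the power-of-2 diagnostic text comes from this diff):

    #include <bit>
    #include <cassert>
    #include <complex>
    #include <cstddef>
    #include <vector>

    // Illustrative only: mirrors the checks the bridge performs when lowering
    // cudaq::qvector(std::vector<std::complex<double>>).
    std::size_t qubitsForState(const std::vector<std::complex<double>> &state) {
      const std::size_t len = state.size();
      // The same condition the bridge reports via reportClangError below.
      assert(std::has_single_bit(len) &&
             "state vector must be a power of 2 in length");
      // 2**n amplitudes -> n qubits, i.e. n = countr_zero(len) for a power of 2.
      return static_cast<std::size_t>(std::countr_zero(len));
    }

    // Usage sketch: qubitsForState({{1.0, 0.0}, {0.0, 0.0}}) == 1 (one qubit).
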
diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
index 09d534ae4b..ddc05456c3 100644
--- a/lib/Frontend/nvqpp/ConvertExpr.cpp
+++ b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -53,8 +53,7 @@ static clang::NamedDecl *getNamedDecl(clang::Expr *expr) {
 static std::pair<SmallVector<Value>, SmallVector<Value>>
 maybeUnpackOperands(OpBuilder &builder, Location loc, ValueRange operands,
                     bool isControl = false) {
-  // If this is not a controlled op, then we just keep all operands
-  // as targets.
+  // If this is not a controlled op, then we just keep all operands as targets.
   if (!isControl)
     return std::make_pair(operands, SmallVector<Value>{});
 
@@ -599,30 +598,30 @@ Value QuakeBridgeVisitor::integerCoercion(Location loc,
   TODO_loc(loc, "Integer conversion but not integer types");
 }
 
-bool QuakeBridgeVisitor::TraverseImplicitCastExpr(clang::ImplicitCastExpr *x,
-                                                  DataRecursionQueue *) {
-  // RecursiveASTVisitor is tuned for dumping surface syntax so doesn't visit
-  // the type. Override so that the casted to type is visited and pushed on the
-  // stack.
+bool QuakeBridgeVisitor::TraverseCastExpr(clang::CastExpr *x,
+                                          DataRecursionQueue *) {
+  // RecursiveASTVisitor is tuned for dumping surface syntax so doesn't
+  // necessarily visit the type. Override so that the casted-to type is visited
+  // and pushed on the stack.
   [[maybe_unused]] auto typeStackDepth = typeStack.size();
   LLVM_DEBUG(llvm::dbgs() << "%% "; x->dump());
   if (!TraverseType(x->getType()))
     return false;
   assert(typeStack.size() == typeStackDepth + 1 && "must push a type");
-  auto result = Base::TraverseImplicitCastExpr(x);
+  for (auto *sub : getStmtChildren(x))
+    if (!TraverseStmt(sub))
+      return false;
+  bool result = WalkUpFromCastExpr(x);
   if (result) {
     assert(typeStack.size() == typeStackDepth && "must be original depth");
   }
   return result;
 }
 
-bool QuakeBridgeVisitor::VisitImplicitCastExpr(clang::ImplicitCastExpr *x) {
+bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) {
   // The type to cast the expression into is pushed during the traversal of the
-  // ImplicitCastExpr in non-error cases.
+  // cast expression in non-error cases.
   auto castToTy = popType();
-  if (x->getCastKind() == clang::CastKind::CK_FunctionToPointerDecay)
-    return true; // NOP
-
   auto loc = toLocation(x);
   auto intToIntCast = [&](Location locSub, Value mlirVal) {
     clang::QualType srcTy = x->getSubExpr()->getType();
@@ -654,9 +653,10 @@ bool QuakeBridgeVisitor::VisitImplicitCastExpr(clang::ImplicitCastExpr *x) {
     assert(result && "integer conversion failed");
     return result;
   }
+  case clang::CastKind::CK_FunctionToPointerDecay:
   case clang::CastKind::CK_ArrayToPointerDecay:
-    return true;
   case clang::CastKind::CK_NoOp:
+  case clang::CastKind::CK_ToVoid:
     return true;
   case clang::CastKind::CK_FloatingToIntegral: {
     auto qualTy = x->getType();
@@ -720,6 +720,9 @@ bool QuakeBridgeVisitor::VisitImplicitCastExpr(clang::ImplicitCastExpr *x) {
       if (cxxExpr->getNumArgs() == 1)
         return true;
     }
+    if (isa(castToTy) && isa(peekValue().getType())) {
+      return true;
+    }
     if (auto funcTy = peelPointerFromFunction(castToTy))
       if (auto fromTy = dyn_cast(peekValue().getType())) {
         auto inputs = funcTy.getInputs();
@@ -1082,8 +1085,8 @@ bool QuakeBridgeVisitor::TraverseLambdaExpr(clang::LambdaExpr *x,
 bool QuakeBridgeVisitor::TraverseMemberExpr(clang::MemberExpr *x,
                                             DataRecursionQueue *) {
   if (auto *methodDecl = dyn_cast<clang::CXXMethodDecl>(x->getMemberDecl())) {
-    // For function members, we want to push the type of the function, since the
-    // visit to CallExpr requires a type to have been pushed.
+    // For function members, we want to push the type of the function, since
+    // the visit to CallExpr requires a type to have been pushed.
     [[maybe_unused]] auto typeStackDepth = typeStack.size();
     if (!TraverseType(methodDecl->getType()))
       return false;
@@ -1426,7 +1429,6 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
       } else if (auto load = v.getDefiningOp<cudaq::cc::LoadOp>()) {
         processedArgs.push_back(load.getPtrvalue());
       } else {
-        v.dump();
         reportClangError(x, mangler, "could not determine string argument");
       }
     };
@@ -1576,8 +1578,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
 
     SymbolRefAttr calleeSymbol;
     auto *ctx = builder.getContext();
 
-    // Expand the negations inline around the quake.apply. This will result
-    // in less duplication of code than threading the negated sense of the
+    // Expand the negations inline around the quake.apply. This will result in
+    // less duplication of code than threading the negated sense of the
     // control recursively through the callable.
     auto inlinedStartControlNegations = [&]() {
       if (!negations.empty()) {
@@ -2044,8 +2046,8 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr(
     }
     if (typeName == "_Bit_reference" || typeName == "__bit_reference") {
       // For vector<bool>, on the kernel side this is represented as a sequence
-      // of byte-sized boolean values (true and false). On the host side, C++
-      // is likely going to pack the booleans as bits in words.
+      // of byte-sized boolean values (true and false). On the host side, C++ is
+      // likely going to pack the booleans as bits in words.
       auto indexVar = popValue();
       auto svec = popValue();
       assert(svec.getType().isa<cc::StdvecType>());
@@ -2136,22 +2138,6 @@ void QuakeBridgeVisitor::maybeAddCallOperationSignature(clang::Decl *x) {
   }
 }
 
-bool QuakeBridgeVisitor::TraverseCXXTemporaryObjectExpr(
-    clang::CXXTemporaryObjectExpr *x, DataRecursionQueue *) {
-  if (auto *ctor = x->getConstructor())
-    maybeAddCallOperationSignature(ctor);
-  if (!TraverseType(x->getType()))
-    return false;
-  return WalkUpFromCXXTemporaryObjectExpr(x);
-}
-
-bool QuakeBridgeVisitor::VisitCXXTemporaryObjectExpr(
-    clang::CXXTemporaryObjectExpr *x) {
-  // We probably want a distinctive op here instead of just leaving a type.
-  // Really this means allocating at least 1 byte and calling the default ctor.
-  return true;
-}
-
 bool QuakeBridgeVisitor::TraverseInitListExpr(clang::InitListExpr *x,
                                               DataRecursionQueue *) {
   if (x->isSyntacticForm()) {
@@ -2228,12 +2214,12 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) {
     return pushValue(last[0]);
   }
 
-  // These initializer expressions are not quantum references. In this case,
-  // we allocate some memory for a variable and store the init list elements
-  // there. Add the array size value
+  // These initializer expressions are not quantum references. In this case, we
+  // allocate some memory for a variable and store the init list elements there.
   auto structTy = dyn_cast<cc::StructType>(initListTy);
   std::int32_t structMems = structTy ? structTy.getMembers().size() : 0;
   std::int32_t numEles = structMems ? size / structMems : size;
+  // Generate the array size value.
   Value arrSize = builder.create<arith::ConstantIntOp>(loc, numEles, 64);
 
   // Allocate the required memory chunk.
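
The hunk that follows teaches VisitInitListExpr to fold fully constant initializer lists into module-level constants instead of stack allocations. A hedged sketch of the intended effect, with the IR spelling approximated from the cc.global printer defined later in this diff (the kernel fragment is hypothetical):

    // Hypothetical kernel fragment:
    //   cudaq::qvector q({0.5, 0.5, 0.5, 0.5});
    //
    // Before: a cc.alloca of !cc.array<f64 x 4> plus one store per element.
    // After (roughly), at module scope:
    //   cc.global constant @__nvqpp__rodata_init_0 (dense<[0.5, 0.5, 0.5, 0.5]>
    //       : tensor<4xf64>) : !cc.array<f64 x 4>
    // and inside the kernel body:
    //   %0 = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
    //
    // The rewrite is guarded twice: `initializerIsGlobal` must have been set by
    // TraverseCXXConstructExpr, and every element must fold to a constant via
    // factory::maybeValueOfFloatConstant.
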
@@ -2242,6 +2228,40 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) {
       return arrTy.getElementType();
     return initListTy;
   }();
+
+  bool lastValuesAreConstant = [&]() {
+    for (Value v : last)
+      if (!opt::factory::maybeValueOfFloatConstant(v))
+        return false;
+    return true;
+  }();
+
+  if (initializerIsGlobal && lastValuesAreConstant) {
+    static unsigned counter = 0;
+    auto *ctx = builder.getContext();
+    auto globalTy = cc::ArrayType::get(ctx, eleTy, size);
+    SmallVector<Attribute> values;
+    auto f64Ty = builder.getF64Type();
+    for (Value v : last) {
+      auto fp = opt::factory::maybeValueOfFloatConstant(v);
+      values.push_back(FloatAttr::get(f64Ty, *fp));
+    }
+    // NB: Unfortunately, the LLVM-IR dialect doesn't lower DenseF64ArrayAttr
+    // to LLVM IR without throwing errors.
+    auto tensorTy = RankedTensorType::get(size, eleTy);
+    auto f64Attr = DenseElementsAttr::get(tensorTy, values);
+    // Create a unique name.
+    std::string name = "__nvqpp__rodata_init_" + std::to_string(counter++);
+    {
+      OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToEnd(module.getBody());
+      builder.create<cc::GlobalOp>(loc, globalTy, name, f64Attr,
+                                   /*constant=*/true);
+    }
+    auto ptrTy = cc::PointerType::get(globalTy);
+    auto globalInit = builder.create<cc::AddressOfOp>(loc, ptrTy, name);
+    return pushValue(globalInit);
+  }
   Value alloca = (numEles > 1)
                      ? builder.create<cc::AllocaOp>(loc, eleTy, arrSize)
                      : builder.create<cc::AllocaOp>(loc, eleTy);
@@ -2283,12 +2303,25 @@ bool QuakeBridgeVisitor::TraverseCXXConstructExpr(clang::CXXConstructExpr *x,
   if (x->isElidable())
     return true;
 
   [[maybe_unused]] auto typeStackDepth = typeStack.size();
+  bool saveInitializerIsGlobal = initializerIsGlobal;
   if (x->getConstructor()) {
     if (!TraverseType(x->getType()))
       return false;
     assert(typeStack.size() == typeStackDepth + 1);
+    if (x->isStdInitListInitialization() && isa<cc::ArrayType>(peekType()))
+      initializerIsGlobal = true;
   }
+  auto *ctor = x->getConstructor();
+  // FIXME: this implicit code visit setting is a hack to only visit a default
+  // argument value when constructing a complex value. We should always be able
+  // to visit default arguments, but we currently trip over default allocators,
+  // etc.
+  bool saveVisitImplicitCode = visitImplicitCode;
+  if (isInClassInNamespace(ctor, "complex", "std"))
+    visitImplicitCode = true;
   auto result = Base::TraverseCXXConstructExpr(x);
+  visitImplicitCode = saveVisitImplicitCode;
+  initializerIsGlobal = saveInitializerIsGlobal;
   assert(typeStack.size() == typeStackDepth || raisedError);
   return result;
 }
@@ -2301,7 +2334,8 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
   }
   // The ctor type is the class for which the ctor is a member.
   auto ctorTy = popType();
 
-  auto ctorName = ctor->getNameAsString();
+  // FIXME: not every constructor has a name.
+  std::string ctorName = ctor->getNameAsString();
   if (isInNamespace(ctor, "cudaq")) {
     if (x->getNumArgs() == 0) {
       if (ctorName == "qudit") {
@@ -2323,17 +2357,81 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
         return pushValue(builder.create<quake::AllocaOp>(loc, veq1Ty));
       }
     } else if (x->getNumArgs() == 1) {
-      if (ctorName == "qreg" || ctorName == "qvector") {
+      if (ctorName == "qreg") {
         // This is a cudaq::qreg(std::size_t).
         auto sizeVal = popValue();
         assert(isa<IntegerType>(sizeVal.getType()));
         return pushValue(builder.create<quake::AllocaOp>(
             loc, quake::VeqType::getUnsized(builder.getContext()), sizeVal));
       }
+      if (ctorName == "qudit") {
+        auto initials = popValue();
+        bool ok = false;
+        if (auto ptrTy = dyn_cast<cc::PointerType>(initials.getType()))
+          if (auto arrTy = dyn_cast<cc::ArrayType>(ptrTy.getElementType()))
+            ok = isa<ComplexType>(arrTy.getElementType());
+        if (!ok) {
+          // Invalid initializer ignored, but emit an error.
+          reportClangError(x, mangler, "invalid qudit initial value");
+          return pushValue(builder.create<quake::AllocaOp>(loc));
+        }
+        auto *ctx = builder.getContext();
+        auto veqTy = quake::VeqType::get(ctx, 1);
+        auto alloc = builder.create<quake::AllocaOp>(loc, veqTy);
+        auto init = builder.create<quake::InitializeStateOp>(loc, veqTy, alloc,
+                                                             initials);
+        return pushValue(builder.create<quake::ExtractRefOp>(loc, init, 0));
+      }
+      if (ctorName == "qvector") {
+        auto initials = popValue();
+        auto *ctx = builder.getContext();
+        if (isa<IntegerType>(initials.getType())) {
+          // This is the cudaq::qvector(std::size_t) ctor.
+          return pushValue(builder.create<quake::AllocaOp>(
+              loc, quake::VeqType::getUnsized(ctx), initials));
+        }
+        // Otherwise, it is the cudaq::qvector(std::vector<complex>) ctor.
+        Value numQubits;
+        Type initialsTy = initials.getType();
+        if (auto ptrTy = dyn_cast<cc::PointerType>(initialsTy)) {
+          if (auto arrTy = dyn_cast<cc::ArrayType>(ptrTy.getElementType())) {
+            if (arrTy.isUnknownSize()) {
+              if (auto allocOp = initials.getDefiningOp<cc::AllocaOp>())
+                if (auto size = allocOp.getSeqSize())
+                  numQubits =
+                      builder.create<math::CountTrailingZerosOp>(loc, size);
+            } else {
+              std::size_t arraySize = arrTy.getSize();
+              if (!std::has_single_bit(arraySize)) {
+                reportClangError(x, mangler,
+                                 "state vector must be a power of 2 in length");
+              }
+              numQubits = builder.create<arith::ConstantIntOp>(
+                  loc, std::countr_zero(arraySize), 64);
+            }
+          }
+        } else if (auto stdvecTy = dyn_cast<cc::StdvecType>(initialsTy)) {
+          Value vecLen = builder.create<cc::StdvecSizeOp>(
+              loc, builder.getI64Type(), initials);
+          numQubits = builder.create<math::CountTrailingZerosOp>(loc, vecLen);
+          auto ptrTy = cc::PointerType::get(stdvecTy.getElementType());
+          initials = builder.create<cc::StdvecDataOp>(loc, ptrTy, initials);
+        }
+        if (!numQubits) {
+          reportClangError(
+              x, mangler,
+              "internal error: could not determine the number of qubits");
+          return false;
+        }
+        auto veqTy = quake::VeqType::getUnsized(ctx);
+        auto alloc = builder.create<quake::AllocaOp>(loc, veqTy, numQubits);
+        return pushValue(builder.create<quake::InitializeStateOp>(
+            loc, veqTy, alloc, initials));
+      }
       if ((ctorName == "qspan" || ctorName == "qview") &&
           isa<quake::VeqType>(peekValue().getType())) {
-        // One of the qspan ctors, which effectively just makes a copy. Here
-        // we omit making a copy and just forward the veq argument.
+        // One of the qspan ctors, which effectively just makes a copy. Here we
+        // omit making a copy and just forward the veq argument.
         assert(isa<quake::VeqType>(ctorTy));
         return true;
       }
@@ -2352,6 +2450,12 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
     }();
     if (isVectorOfQubitRefs)
       return true;
+    if (ctorName == "complex") {
+      Value imag = popValue();
+      Value real = popValue();
+      return pushValue(builder.create<complex::CreateOp>(
+          loc, ComplexType::get(real.getType()), real, imag));
+    }
     if (ctorName == "function") {
       // Are we converting a lambda expr to a std::function?
       auto backVal = peekValue();
@@ -2408,15 +2512,14 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
       return true;
     }
 
-    // We check for vector constructors with 2 args, the first
-    // could be an initializer_list or an integer, while the
-    // second is the allocator
+    // We check for vector constructors with 2 args, the first could be an
+    // initializer_list or an integer, while the second is the allocator.
     if (ctorName == "vector") {
       if (x->getNumArgs() == 2) {
-        // This is a std::vector constructor, first we'll check if it
-        // is constructed from a constant initializer list, in that case
-        // we'll have a AllocaOp at the top of the stack that allocates a
-        // ptr>, where C is constant / known
+        // This is a std::vector constructor; first we'll check if it is
+        // constructed from a constant initializer list. In that case we'll
+        // have an AllocaOp at the top of the stack that allocates a
+        // ptr<array<T x C>>, where C is constant / known.
         auto desugared = x->getArg(0)->getType().getCanonicalType();
         if (auto recordType =
                 dyn_cast<clang::RecordType>(desugared.getTypePtr()))
                            allocation, definingOp.getSeqSize()));
 
-        // Next check if its created from a size integer
-        // Let's do a check on the first argument, make sure that when
-        // we peel off all the typedefs that it is an integer
+        // Next check if it's created from a size integer. Let's do a check on
+        // the first argument, make sure that when we peel off all the typedefs
+        // that it is an integer.
         if (auto builtInType =
                 dyn_cast<clang::BuiltinType>(desugared.getTypePtr()))
           if (builtInType->isInteger() &&
            return ctorTy;
          }();
 
-          // create stdvec init op without a buffer.
-          // Allocate the required memory chunk
+          // Create stdvec init op without a buffer. Allocate the required
+          // memory chunk.
           Value alloca = builder.create<cc::AllocaOp>(loc, eleTy, arrSize);
 
           // Create the stdvec_init op
           return pushValue(builder.create<cc::StdvecInitOp>(
               loc, cc::StdvecType::get(eleTy), alloca, arrSize));
         }
       }
-      // Disallow any default vector construction bc we don't
-      // want any .push_back
+      // Disallow any default vector construction bc we don't want any
+      // .push_back.
       if (ctor->isDefaultConstructor())
         reportClangError(ctor, mangler,
                          "Default std::vector constructor within quantum "
@@ -2468,15 +2571,15 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
 
     auto *parent = ctor->getParent();
     if (ctor->isCopyConstructor() && parent->isLambda()) {
-      // Copy-ctor on a lambda. For now on the QPU device side, we do not
-      // make a copy of a lambda. Any capture data will be marshalled at
-      // runtime and passed as ordinary arguments via lambda lifting.
+      // Copy-ctor on a lambda. For now on the QPU device side, we do not make a
+      // copy of a lambda. Any capture data will be marshalled at runtime and
+      // passed as ordinary arguments via lambda lifting.
       return true;
     }
 
     if (ctor->isCopyOrMoveConstructor() && parent->isPOD()) {
-      // Copy or move constructor on a POD struct. The value stack should
-      // contain the object to load the value from.
+      // Copy or move constructor on a POD struct. The value stack should contain
+      // the object to load the value from.
       auto fromStruct = popValue();
       assert(isa<cc::StructType>(ctorTy) && "POD must be a struct type");
       return pushValue(builder.create<cc::LoadOp>(loc, fromStruct));
@@ -2494,11 +2597,10 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
   //    one byte.
   // 2) Allocate a new object.
   // 3) Call the constructor passing the address of the allocation as `this`.
-  auto mem = builder.create<cc::AllocaOp>(loc, ctorTy);
-  // FIXME: Using Ctor_Complete for mangled name generation blindly here.
-  // Is there a programmatic way of determining which enum to use from the
-  // AST?
+
+  // FIXME: Using Ctor_Complete for mangled name generation blindly here. Is
+  // there a programmatic way of determining which enum to use from the AST?
   auto mangledName =
       cxxMangledDeclName(clang::GlobalDecl{ctor, clang::Ctor_Complete});
   auto funcTy =
diff --git a/lib/Frontend/nvqpp/ConvertType.cpp b/lib/Frontend/nvqpp/ConvertType.cpp
index 7db5bd6398..46488590a7 100644
--- a/lib/Frontend/nvqpp/ConvertType.cpp
+++ b/lib/Frontend/nvqpp/ConvertType.cpp
@@ -11,12 +11,16 @@
 #include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
 #include "cudaq/Todo.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/TargetParser/Triple.h"
 
 #define DEBUG_TYPE "lower-ast-type"
 
 using namespace mlir;
 
-static bool isArithmeticType(Type t) { return isa<IntegerType, FloatType>(t); }
+static bool isArithmeticType(Type t) {
+  return isa<IntegerType, FloatType, ComplexType>(t);
+}
 
 /// Is \p t a quantum reference type. In the bridge, quantum types are always
 /// reference types.
@@ -321,9 +325,15 @@ Type QuakeBridgeVisitor::builtinTypeToType(const clang::BuiltinType *t) {
     return builder.getF32Type();
   case BuiltinType::Double:
     return builder.getF64Type();
-  case BuiltinType::LongDouble:
-    return astContext->getTypeSize(t) == 64 ? builder.getF64Type()
-                                            : builder.getF128Type();
+  case BuiltinType::LongDouble: {
+    auto bitWidth = astContext->getTargetInfo().getLongDoubleWidth();
+    if (bitWidth == 64)
+      return builder.getF64Type();
+    llvm::Triple triple(astContext->getTargetInfo().getTargetOpts().Triple);
+    if (triple.isX86())
+      return builder.getF80Type();
+    return builder.getF128Type();
+  }
   case BuiltinType::Float128:
   case BuiltinType::Ibm128: /* double double format -> {double, double} */
     return builder.getF128Type();
diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp
index f599113781..7980537f51 100644
--- a/lib/Optimizer/Builder/Factory.cpp
+++ b/lib/Optimizer/Builder/Factory.cpp
@@ -11,6 +11,7 @@
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "llvm/TargetParser/Triple.h"
+#include "mlir/IR/Matchers.h"
 
 using namespace mlir;
 
@@ -151,6 +152,20 @@ func::FuncOp factory::createFunction(StringRef name, ArrayRef retTypes,
   return func;
 }
 
+std::optional<std::int64_t> factory::maybeValueOfIntConstant(Value v) {
+  APInt cst;
+  if (matchPattern(v, m_ConstantInt(&cst)))
+    return {cst.getZExtValue()};
+  return std::nullopt;
+}
+
+std::optional<double> factory::maybeValueOfFloatConstant(Value v) {
+  APFloat cst(0.0);
+  if (matchPattern(v, m_ConstantFloat(&cst)))
+    return {cst.convertToDouble()};
+  return std::nullopt;
+}
+
 void factory::createGlobalCtorCall(ModuleOp mod, FlatSymbolRefAttr ctor) {
   auto *ctx = mod.getContext();
   auto loc = mod.getLoc();
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index f07c0396c7..60bf6619c0 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -92,6 +92,23 @@ static constexpr IntrinsicCode intrinsicTable[] = {
     return %9 : !cc.struct<{!cc.ptr<i8>, i64}>
   })#"},
 
+    {"__nvqpp_getStateVectorData_fp32", {}, R"#(
+  func.func private @__nvqpp_getStateVectorData_fp32(%p : i64, %o : i64) -> !cc.ptr<complex<f32>>)#"},
+    {"__nvqpp_getStateVectorData_fp64", {}, R"#(
+  func.func private @__nvqpp_getStateVectorData_fp64(%p : i64, %o : i64) -> !cc.ptr<complex<f64>>)#"},
+
+    {"__nvqpp_getStateVectorLength_fp32",
+     {},
+     R"#(
+  func.func private @__nvqpp_getStateVectorLength_fp32(%p : i64, %o : i64) -> i64)#"},
+    {"__nvqpp_getStateVectorLength_fp64",
+     {},
+     R"#(
+  func.func private @__nvqpp_getStateVectorLength_fp64(%p : i64, %o : i64) -> i64)#"},
+
     // __nvqpp_initializer_list_to_vector_bool
     {cudaq::stdvecBoolCtorFromInitList,
      {},
diff --git a/lib/Optimizer/CodeGen/CMakeLists.txt b/lib/Optimizer/CodeGen/CMakeLists.txt
index 701881cece..5c44565903 100644
--- a/lib/Optimizer/CodeGen/CMakeLists.txt
+++ b/lib/Optimizer/CodeGen/CMakeLists.txt
@@ -11,6 +11,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 endif()
 
 add_cudaq_library(OptCodeGen
+  CodeGenDialect.cpp
+  CodeGenOps.cpp
+  CodeGenTypes.cpp
   LowerToQIRProfile.cpp
   LowerToQIR.cpp
   Passes.cpp
@@ -19,6 +22,9 @@ add_cudaq_library(OptCodeGen
 
   DEPENDS
     CCDialect
+    CodeGenDialectIncGen
+    CodeGenOpsIncGen
+    CodeGenTypesIncGen
     OptCodeGenPassIncGen
     OptPeepholeIncGen
     OptTransformsPassIncGen
diff --git a/lib/Optimizer/CodeGen/CodeGenDialect.cpp b/lib/Optimizer/CodeGen/CodeGenDialect.cpp
new file mode 100644
index 0000000000..1689e1eaf3
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenDialect.cpp
@@ -0,0 +1,27 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "CodeGenDialect.h"
+#include "CodeGenOps.h"
+#include "mlir/IR/DialectImplementation.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+
+void cudaq::codegen::CodeGenDialect::initialize() {
+  registerTypes();
+  addOperations<
+#define GET_OP_LIST
+#include "cudaq/Optimizer/CodeGen/CodeGenOps.cpp.inc"
+      >();
+}
diff --git a/runtime/host_config.h b/lib/Optimizer/CodeGen/CodeGenDialect.h
similarity index 60%
rename from runtime/host_config.h
rename to lib/Optimizer/CodeGen/CodeGenDialect.h
index b930c834ad..a61d3db6bf 100644
--- a/runtime/host_config.h
+++ b/lib/Optimizer/CodeGen/CodeGenDialect.h
@@ -1,5 +1,5 @@
 /****************************************************************-*- C++ -*-****
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
  * All rights reserved.                                                        *
 *                                                                            *
 * This source code and the accompanying materials are made available under  *
 * the terms of the Apache License 2.0 which accompanies this distribution.  *
 ******************************************************************************/
@@ -8,5 +8,10 @@
 
 #pragma once
 
-#define CUDAQ_USE_STD20 (__cplusplus >= 202002L)
-#define CUDAQ_APPLE_CLANG (defined(__apple_build_version__))
+#include "mlir/IR/Dialect.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h.inc"
diff --git a/lib/Optimizer/CodeGen/CodeGenOps.cpp b/lib/Optimizer/CodeGen/CodeGenOps.cpp
new file mode 100644
index 0000000000..502657ecf1
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenOps.cpp
@@ -0,0 +1,22 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "CodeGenOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "cudaq/Optimizer/CodeGen/CodeGenOps.cpp.inc"
diff --git a/lib/Optimizer/CodeGen/CodeGenOps.h b/lib/Optimizer/CodeGen/CodeGenOps.h
new file mode 100644
index 0000000000..63d1a8154f
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenOps.h
@@ -0,0 +1,23 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
+#include "cudaq/Optimizer/Dialect/Common/Traits.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/OpImplementation.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "cudaq/Optimizer/CodeGen/CodeGenOps.h.inc"
diff --git a/lib/Optimizer/CodeGen/CodeGenTypes.cpp b/lib/Optimizer/CodeGen/CodeGenTypes.cpp
new file mode 100644
index 0000000000..94bc9e71b6
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenTypes.cpp
@@ -0,0 +1,24 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "CodeGenTypes.h"
+#include "CodeGenDialect.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#define GET_TYPEDEF_CLASSES
+#include "cudaq/Optimizer/CodeGen/CodeGenTypes.cpp.inc"
+
+void cudaq::codegen::CodeGenDialect::registerTypes() {
+  addTypes<DoNotUseType>();
+}
diff --git a/lib/Optimizer/CodeGen/CodeGenTypes.h b/lib/Optimizer/CodeGen/CodeGenTypes.h
new file mode 100644
index 0000000000..ab6f49d033
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenTypes.h
@@ -0,0 +1,20 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#define GET_TYPEDEF_CLASSES
+#include "cudaq/Optimizer/CodeGen/CodeGenTypes.h.inc"
diff --git a/lib/Optimizer/CodeGen/LowerToQIR.cpp b/lib/Optimizer/CodeGen/LowerToQIR.cpp
index 2f7e70e88e..02270ca40c 100644
--- a/lib/Optimizer/CodeGen/LowerToQIR.cpp
+++ b/lib/Optimizer/CodeGen/LowerToQIR.cpp
@@ -16,6 +16,7 @@
 #pragma GCC diagnostic pop
 #endif
 
+#include "CodeGenOps.h"
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/CodeGen/Peephole.h"
@@ -39,6 +40,9 @@
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "lower-to-qir"
 
 using namespace mlir;
 
@@ -81,6 +85,12 @@ class AllocaOpRewrite : public ConvertOpToLLVMPattern {
       return success();
     }
 
+    if (alloca.hasInitializedState()) {
+      // If the allocation has an initialized state, the sub-graph should
+      // already have been folded. Seeing it here is an error.
+      return alloca.emitOpError("initialize state must be folded");
+    }
+
     // Create a QIR call to allocate the qubits.
     StringRef qir_qubit_array_allocate = cudaq::opt::QIRArrayQubitAllocateArray;
    auto array_qbit_type = cudaq::opt::getArrayType(context);
@@ -111,6 +121,78 @@ class AllocaOpRewrite : public ConvertOpToLLVMPattern {
   }
 };
 
+// Lower quake.init_state to a QIR function to allocate the
+// qubits with the provided state vector.
+class QmemRAIIOpRewrite
+    : public ConvertOpToLLVMPattern<cudaq::codegen::RAIIOp> {
+public:
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(cudaq::codegen::RAIIOp raii, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = raii->getLoc();
+    auto parentModule = raii->getParentOfType<ModuleOp>();
+    auto array_qbit_type = cudaq::opt::getArrayType(rewriter.getContext());
+
+    // Get the CC pointer for the state.
+    auto ccState = adaptor.getInitState();
+
+    // Inspect the element type of the complex data; we need to know whether
+    // it is f32 or f64.
+    Type eleTy = rewriter.getF64Type();
+    if (auto llvmPtrTy = dyn_cast<LLVM::LLVMPointerType>(ccState.getType())) {
+      auto ptrEleTy = llvmPtrTy.getElementType();
+      if (auto llvmStructTy = dyn_cast<LLVM::LLVMStructType>(ptrEleTy))
+        if (llvmStructTy.getBody().size() == 2 &&
+            llvmStructTy.getBody()[0] == llvmStructTy.getBody()[1] &&
+            llvmStructTy.getBody()[0].isF32())
+          eleTy = rewriter.getF32Type();
+    }
+
+    if (!isa<FloatType>(eleTy))
+      return raii.emitOpError("invalid type on initialize state operation, "
+                              "must be complex floating point.");
+
+    // Get the size of the qubit register.
+    Type allocTy = adaptor.getAllocType();
+    auto allocSize = adaptor.getAllocSize();
+    Value sizeOperand;
+    auto i64Ty = rewriter.getI64Type();
+    if (allocSize) {
+      sizeOperand = allocSize;
+      auto sizeTy = cast<IntegerType>(sizeOperand.getType());
+      if (sizeTy.getWidth() < 64)
+        sizeOperand = rewriter.create<LLVM::ZExtOp>(loc, i64Ty, sizeOperand);
+      else if (sizeTy.getWidth() > 64)
+        sizeOperand = rewriter.create<LLVM::TruncOp>(loc, i64Ty, sizeOperand);
+    } else {
+      auto type = cast<quake::VeqType>(allocTy);
+      auto constantSize = type.getSize();
+      sizeOperand =
+          rewriter.create<arith::ConstantIntOp>(loc, constantSize, 64);
+    }
+
+    // Create the QIR allocation-with-initializer function.
+    auto *ctx = rewriter.getContext();
+    auto ptrTy = cudaq::opt::factory::getPointerType(ctx);
+    StringRef functionName =
+        eleTy.isF64() ? cudaq::opt::QIRArrayQubitAllocateArrayWithStateFP64
+                      : cudaq::opt::QIRArrayQubitAllocateArrayWithStateFP32;
+    FlatSymbolRefAttr raiiSymbolRef =
+        cudaq::opt::factory::createLLVMFunctionSymbol(
+            functionName, array_qbit_type, {i64Ty, ptrTy}, parentModule);
+
+    // Call the allocation function.
+    Value castedInitState =
+        rewriter.create<LLVM::BitcastOp>(loc, ptrTy, ccState);
+    rewriter.replaceOpWithNewOp<LLVM::CallOp>(
+        raii, array_qbit_type, raiiSymbolRef,
+        ArrayRef<Value>{sizeOperand, castedInitState});
+    return success();
+  }
+};
+
 /// Lower Quake Dealloc Ops to QIR function calls.
 class DeallocOpRewrite : public ConvertOpToLLVMPattern<quake::DeallocOp> {
 public:
@@ -139,9 +221,9 @@ class DeallocOpRewrite : public ConvertOpToLLVMPattern {
         cudaq::opt::factory::createLLVMFunctionSymbol(
             qirQuantumDeallocateFunc, retType, {operandType}, parentModule);
 
-    rewriter.replaceOpWithNewOp<LLVM::CallOp>(
-        dealloc, ArrayRef<Type>({}), deallocSymbolRef,
-        adaptor.getOperands().front());
+    rewriter.replaceOpWithNewOp<LLVM::CallOp>(dealloc, ArrayRef<Type>({}),
                                              deallocSymbolRef,
                                              adaptor.getOperands().front());
     return success();
   }
 };
@@ -976,6 +1058,22 @@ class GetVeqSizeOpRewrite : public OpConversionPattern {
 // Conversion patterns for CC dialect ops.
 //===----------------------------------------------------------------------===//
 
+class AddressOfOpPattern
+    : public ConvertOpToLLVMPattern<cudaq::cc::AddressOfOp> {
+public:
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  // One-to-one conversion to llvm.addressof op.
+  LogicalResult
+  matchAndRewrite(cudaq::cc::AddressOfOp addr, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type type = getTypeConverter()->convertType(addr.getType());
+    auto name = addr.getGlobalName();
+    rewriter.replaceOpWithNewOp<LLVM::AddressOfOp>(addr, type, name);
+    return success();
+  }
+};
+
 class AllocaOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::AllocaOp> {
 public:
   using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
@@ -1264,6 +1362,30 @@ class FuncToPtrOpPattern
   }
 };
 
+class GlobalOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::GlobalOp> {
+public:
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  // Replace the cc.global with an llvm.global, updating the types, etc.
+  LogicalResult
+  matchAndRewrite(cudaq::cc::GlobalOp global, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = global.getLoc();
+    auto ptrTy = cast<cudaq::cc::PointerType>(global.getType());
+    auto eleTy = ptrTy.getElementType();
+    Type type = getTypeConverter()->convertType(eleTy);
+    auto name = global.getSymName();
+    bool isReadOnly = global.getConstant();
+    Attribute initializer = global.getValue().value_or(Attribute{});
+    auto linkage =
+        global.getExternal() ? LLVM::Linkage::Linkonce : LLVM::Linkage::Private;
+    rewriter.create<LLVM::GlobalOp>(loc, type, isReadOnly, linkage, name,
+                                    initializer, /*alignment=*/0);
+    rewriter.eraseOp(global);
+    return success();
+  }
+};
+
 class InsertValueOpPattern
     : public ConvertOpToLLVMPattern<cudaq::cc::InsertValueOp> {
 public:
@@ -1563,8 +1685,8 @@ class ReturnBitRewrite : public OpConversionPattern {
   }
 };
 
-/// In case we still have a RelaxSizeOp, we can just remove it,
-/// since QIR works on Array * for all sized veqs.
+/// In case we still have a RelaxSizeOp, we can just remove it, since QIR works
+/// on `Array*` for all sized veqs.
 class RemoveRelaxSizeRewrite : public OpConversionPattern<quake::RelaxSizeOp> {
 public:
   using OpConversionPattern::OpConversionPattern;
@@ -1577,6 +1699,24 @@ class RemoveRelaxSizeRewrite : public OpConversionPattern {
   }
 };
 
+class CodeGenRAIIPattern : public OpRewritePattern<quake::InitializeStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::InitializeStateOp init,
+                                PatternRewriter &rewriter) const override {
+    Value mem = init.getTargets();
+    auto alloc = mem.getDefiningOp<quake::AllocaOp>();
+    if (!alloc)
+      return init.emitOpError("init_state must have alloca as input");
+    rewriter.replaceOpWithNewOp<cudaq::codegen::RAIIOp>(
+        init, init.getType(), init.getState(), alloc.getType(),
+        alloc.getSize());
+    rewriter.eraseOp(alloc);
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // Code generation: converts the Quake IR to QIR.
 //===----------------------------------------------------------------------===//
@@ -1650,7 +1790,20 @@ class QuakeToQIRRewrite : public cudaq::opt::QuakeToQIRBase {
     return ok ? success() : failure();
   }
 
+  /// Greedy pass to match subgraphs in the IR and replace them with codegen
+  /// ops. This step makes converting a DAG of nodes in the conversion step
+  /// simpler.
+  void fuseSubgraphPatterns() {
+    auto *ctx = &getContext();
+    RewritePatternSet patterns(ctx);
+    patterns.insert<CodeGenRAIIPattern>(ctx);
+    if (failed(applyPatternsAndFoldGreedily(getModule(), std::move(patterns))))
+      signalPassFailure();
+  }
+
   void runOnOperation() override final {
+    fuseSubgraphPatterns();
+
     auto *context = &getContext();
 
     // Ad hoc deal with ConstantArrayOp transformation.
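
Taken together, the patterns above implement a two-step lowering for state-initialized qubit allocations. A hedged sketch of the flow (IR spelling approximate, derived from the assemblyFormat declarations earlier in this diff):

    // Step 1: the greedy pre-pass (CodeGenRAIIPattern) fuses the two-op DAG
    //   %v = quake.alloca !quake.veq<2>
    //   %w = quake.init_state %v, %p : (!quake.veq<2>, !cc.ptr<...>) -> !quake.veq<2>
    // into the transitory codegen op
    //   %w = codegen.qmem_raii %p (!quake.veq<2>) : (!cc.ptr<...>) -> !quake.veq<2>
    //
    // Step 2: dialect conversion (QmemRAIIOpRewrite) lowers that op 1:1 to a
    // call to __quantum__rt__qubit_allocate_array_with_state_fp64 (or _fp32,
    // chosen by inspecting the complex element type). This is why
    // AllocaOpRewrite now treats a surviving initialized alloca as an error:
    // the fusion should already have consumed it.
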
@@ -1680,14 +1833,15 @@ class QuakeToQIRRewrite : public cudaq::opt::QuakeToQIRBase {
     patterns.insert(context);
+
     patterns.insert<
-        AllocaOpRewrite, AllocaOpPattern, CallableClosureOpPattern,
-        CallableFuncOpPattern, CallCallableOpPattern, CastOpPattern,
-        ComputePtrOpPattern, ConcatOpRewrite, DeallocOpRewrite,
+        AddressOfOpPattern, AllocaOpRewrite, AllocaOpPattern,
+        CallableClosureOpPattern, CallableFuncOpPattern, CallCallableOpPattern,
+        CastOpPattern, ComputePtrOpPattern, ConcatOpRewrite, DeallocOpRewrite,
         CreateStringLiteralOpPattern, DiscriminateOpPattern,
         ExtractQubitOpRewrite, ExtractValueOpPattern, FuncToPtrOpPattern,
-        InsertValueOpPattern, InstantiateCallableOpPattern, LoadOpPattern,
-        ExpPauliRewrite, OneTargetRewrite<quake::HOp>,
+        GlobalOpPattern, InsertValueOpPattern, InstantiateCallableOpPattern,
+        LoadOpPattern, ExpPauliRewrite, OneTargetRewrite<quake::HOp>,
         OneTargetRewrite<quake::SOp>, OneTargetRewrite<quake::TOp>,
         OneTargetRewrite<quake::XOp>, OneTargetRewrite<quake::YOp>,
         OneTargetRewrite<quake::ZOp>, OneTargetOneParamRewrite<quake::R1Op>,
         OneTargetOneParamRewrite<quake::RxOp>,
         OneTargetOneParamRewrite<quake::RyOp>,
         OneTargetOneParamRewrite<quake::RzOp>,
         OneTargetOneParamRewrite<quake::PhasedRxOp>,
         OneTargetTwoParamRewrite<quake::U2Op>,
-        OneTargetTwoParamRewrite<quake::U3Op>, PoisonOpPattern, ResetRewrite,
-        StdvecDataOpPattern, StdvecInitOpPattern, StdvecSizeOpPattern,
-        StoreOpPattern, SubveqOpRewrite, TwoTargetRewrite<quake::SwapOp>,
-        UndefOpPattern>(typeConverter);
+        OneTargetTwoParamRewrite<quake::U3Op>, PoisonOpPattern,
+        QmemRAIIOpRewrite, ResetRewrite, StdvecDataOpPattern,
+        StdvecInitOpPattern, StdvecSizeOpPattern, StoreOpPattern,
+        SubveqOpRewrite, TwoTargetRewrite<quake::SwapOp>, UndefOpPattern>(
+        typeConverter);
     patterns.insert<MeasureRewrite<quake::MzOp>>(typeConverter,
                                                  measureCounter);
 
     target.addLegalDialect<LLVM::LLVMDialect>();
     target.addLegalOp<ModuleOp>();
-    if (failed(applyFullConversion(getModule(), target, std::move(patterns))))
+    if (failed(applyFullConversion(getModule(), target, std::move(patterns)))) {
+      LLVM_DEBUG(getModule().dump());
       signalPassFailure();
+    }
   }
 };
diff --git a/lib/Optimizer/CodeGen/PassDetails.h b/lib/Optimizer/CodeGen/PassDetails.h
index 48ae6c5301..719c03b391 100644
--- a/lib/Optimizer/CodeGen/PassDetails.h
+++ b/lib/Optimizer/CodeGen/PassDetails.h
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "CodeGenDialect.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
diff --git a/lib/Optimizer/CodeGen/Passes.cpp b/lib/Optimizer/CodeGen/Passes.cpp
index 40e38f7d38..f11a0579eb 100644
--- a/lib/Optimizer/CodeGen/Passes.cpp
+++ b/lib/Optimizer/CodeGen/Passes.cpp
@@ -71,3 +71,7 @@ void cudaq::opt::registerTargetPipelines() {
                                      "Convert kernels to IonQ gate set.",
                                      addIonQPipeline);
 }
+
+void cudaq::opt::registerCodeGenDialect(DialectRegistry &registry) {
+  registry.insert<cudaq::codegen::CodeGenDialect>();
+}
diff --git a/lib/Optimizer/Dialect/CC/CCOps.cpp b/lib/Optimizer/Dialect/CC/CCOps.cpp
index 858925884b..e2b4a57acf 100644
--- a/lib/Optimizer/Dialect/CC/CCOps.cpp
+++ b/lib/Optimizer/Dialect/CC/CCOps.cpp
@@ -40,8 +40,7 @@ cudaq::cc::AddressOfOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
       getParentOfType<ModuleOp>(getOperation()), getGlobalNameAttr());
   // TODO: add globals?
-  auto function = dyn_cast_or_null<func::FuncOp>(op);
-  if (!function)
+  if (!isa_and_nonnull<func::FuncOp>(op))
     return emitOpError("must reference a global defined by 'func.func'");
   return success();
 }
@@ -408,7 +407,7 @@ OpFoldResult cudaq::cc::GetConstantElementOp::fold(FoldAdaptor adaptor) {
   auto conArr = getConstantArray().getDefiningOp<cudaq::cc::ConstantArrayOp>();
   if (!conArr)
     return nullptr;
-  cudaq::cc::ArrayType arrTy = conArr.getType();
+  cc::ArrayType arrTy = conArr.getType();
   if (arrTy.isUnknownSize())
     return nullptr;
   auto eleTy = arrTy.getElementType();
@@ -420,18 +419,75 @@ OpFoldResult cudaq::cc::GetConstantElementOp::fold(FoldAdaptor adaptor) {
   if (auto fltTy = dyn_cast<FloatType>(eleTy)) {
     auto floatConstVal =
         cast<FloatAttr>(conArr.getConstantValues()[offset]).getValue();
-    Value val =
-        builder.create<arith::ConstantFloatOp>(loc, floatConstVal, fltTy);
-    return val;
+    return builder.create<arith::ConstantFloatOp>(loc, floatConstVal, fltTy)
+        .getResult();
   }
   auto intConstVal =
       cast<IntegerAttr>(conArr.getConstantValues()[offset]).getInt();
   auto intTy = cast<IntegerType>(eleTy);
-  Value val = builder.create<arith::ConstantIntOp>(loc, intConstVal, intTy);
-  return val;
+  return builder.create<arith::ConstantIntOp>(loc, intConstVal, intTy)
+      .getResult();
   }
-  Value val = builder.create(loc, eleTy);
-  return val;
+  return builder.create(loc, eleTy).getResult();
+}
+
+//===----------------------------------------------------------------------===//
+// GlobalOp
+//===----------------------------------------------------------------------===//
+
+ParseResult cudaq::cc::GlobalOp::parse(OpAsmParser &parser,
+                                       OperationState &result) {
+  // Check for the `extern` optional keyword first.
+  if (succeeded(parser.parseOptionalKeyword("extern")))
+    result.addAttribute(getExternalAttrName(result.name),
+                        parser.getBuilder().getUnitAttr());
+
+  // Check for the `constant` optional keyword second.
+  if (succeeded(parser.parseOptionalKeyword("constant")))
+    result.addAttribute(getConstantAttrName(result.name),
+                        parser.getBuilder().getUnitAttr());
+
+  // Parse the rest of the global.
+  //   @<symbol> ( <initializer-attribute> )? : <type>
+  StringAttr name;
+  if (parser.parseSymbolName(name, getSymNameAttrName(result.name),
+                             result.attributes))
+    return failure();
+  if (succeeded(parser.parseOptionalLParen())) {
+    Attribute value;
+    if (parser.parseAttribute(value, getValueAttrName(result.name),
+                              result.attributes) ||
+        parser.parseRParen())
+      return failure();
+  }
+  SmallVector<Type> types;
+  if (parser.parseOptionalColonTypeList(types) ||
+      parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+  if (types.size() != 1)
+    return parser.emitError(parser.getNameLoc(), "expected exactly one type");
+  result.addAttribute(getGlobalTypeAttrName(result.name),
+                      TypeAttr::get(types[0]));
+  return success();
+}
+
+void cudaq::cc::GlobalOp::print(OpAsmPrinter &p) {
+  p << ' ';
+  if (getExternal())
+    p << "extern ";
+  if (getConstant())
+    p << "constant ";
+  p.printSymbolName(getSymName());
+  if (auto value = getValue()) {
+    p << " (";
+    p.printAttribute(*value);
+    p << ")";
+  }
+  p << " : " << getGlobalType();
+  p.printOptionalAttrDictWithKeyword(
+      (*this)->getAttrs(),
+      {getSymNameAttrName(), getValueAttrName(), getGlobalTypeAttrName(),
+       getConstantAttrName(), getExternalAttrName()});
 }
 
 //===----------------------------------------------------------------------===//
@@ -557,8 +613,8 @@ LogicalResult cudaq::cc::LoopOp::verify() {
     return emitOpError("size of init args and outputs must be equal");
   if (getWhileRegion().front().getArguments().size() != initArgsSize)
     return emitOpError("size of init args and while region args must be equal");
-  if (auto condOp = dyn_cast<cudaq::cc::ConditionOp>(
-          getWhileRegion().front().getTerminator())) {
+  if (auto condOp =
+          dyn_cast<ConditionOp>(getWhileRegion().front().getTerminator())) {
     if (condOp.getResults().size() != initArgsSize)
       return emitOpError("size of init args and condition op must be equal");
   } else {
@@ -570,8 +626,8 @@ LogicalResult cudaq::cc::LoopOp::verify() {
   if (getStepRegion().front().getArguments().size() != initArgsSize)
     return emitOpError(
         "size of init args and step region args must be equal");
-  if (auto contOp = dyn_cast<cudaq::cc::ContinueOp>(
-          getStepRegion().front().getTerminator())) {
+  if (auto contOp =
+          dyn_cast<ContinueOp>(getStepRegion().front().getTerminator())) {
     if (contOp.getOperands().size() != initArgsSize)
       return emitOpError("size of init args and continue op must be equal");
   } else {
@@ -1168,7 +1224,7 @@ ParseResult cudaq::cc::CreateLambdaOp::parse(OpAsmParser &parser,
 LogicalResult cudaq::cc::CallCallableOp::verify() {
   FunctionType funcTy;
   auto ty = getCallee().getType();
-  if (auto lambdaTy = dyn_cast<cudaq::cc::CallableType>(ty))
+  if (auto lambdaTy = dyn_cast<cc::CallableType>(ty))
     funcTy = lambdaTy.getSignature();
   else if (auto fTy = dyn_cast<FunctionType>(ty))
     funcTy = fTy;
@@ -1246,7 +1302,7 @@ struct ReplaceInFunc : public OpRewritePattern {
 void cudaq::cc::ReturnOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
-  patterns.add<ReplaceInFunc<cudaq::cc::ReturnOp, func::ReturnOp>>(context);
+  patterns.add<ReplaceInFunc<ReturnOp, func::ReturnOp>>(context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1316,8 +1372,7 @@ struct ReplaceInLoop : public OpRewritePattern {
 void cudaq::cc::UnwindBreakOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
-  patterns.add<ReplaceInLoop<cudaq::cc::UnwindBreakOp, cudaq::cc::BreakOp>>(
-      context);
+  patterns.add<ReplaceInLoop<UnwindBreakOp, BreakOp>>(context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1346,9 +1401,7 @@ LogicalResult cudaq::cc::UnwindContinueOp::verify() {
 void cudaq::cc::UnwindContinueOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
-  patterns
-      .add<ReplaceInLoop<cudaq::cc::UnwindContinueOp, cudaq::cc::ContinueOp>>(
-          context);
+  patterns.add<ReplaceInLoop<UnwindContinueOp, ContinueOp>>(context);
 }
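+// For reference, the assembly forms accepted by GlobalOp::parse and emitted
+// by GlobalOp::print above look roughly like the following (an illustrative
+// sketch; the symbol names, value attribute, and types are made up):
+//   cc.global extern @my.state : f64
+//   cc.global constant @my.data (dense<[0.5, 0.5]> : tensor<2xf64>) : !cc.array<f64 x 2>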
 //===----------------------------------------------------------------------===//
diff --git a/lib/Optimizer/Dialect/CC/CCTypes.cpp b/lib/Optimizer/Dialect/CC/CCTypes.cpp
index fa53604c1d..5d52c8baeb 100644
--- a/lib/Optimizer/Dialect/CC/CCTypes.cpp
+++ b/lib/Optimizer/Dialect/CC/CCTypes.cpp
@@ -150,35 +150,35 @@ void cc::ArrayType::print(AsmPrinter &printer) const {
 
 //===----------------------------------------------------------------------===//
 
-namespace cudaq {
+namespace cudaq::cc {
 
 Type cc::SpanLikeType::getElementType() const {
   return llvm::TypeSwitch(*this).Case(
      [](auto type) { return type.getElementType(); });
 }
 
-bool cc::isDynamicType(Type ty) {
-  if (isa<cc::SpanLikeType>(ty))
+bool isDynamicType(Type ty) {
+  if (isa<SpanLikeType>(ty))
     return true;
-  if (auto strTy = dyn_cast<cc::StructType>(ty)) {
+  if (auto strTy = dyn_cast<StructType>(ty)) {
     for (auto memTy : strTy.getMembers())
       if (isDynamicType(memTy))
         return true;
     return false;
   }
-  if (auto arrTy = dyn_cast<cc::ArrayType>(ty))
+  if (auto arrTy = dyn_cast<ArrayType>(ty))
     return arrTy.isUnknownSize() || isDynamicType(arrTy.getElementType());
   // Note: this isn't considering quake, builtin, etc. types.
   return false;
 }
 
-cc::CallableType cc::CallableType::getNoSignature(MLIRContext *ctx) {
+CallableType CallableType::getNoSignature(MLIRContext *ctx) {
   return CallableType::get(ctx, FunctionType::get(ctx, {}, {}));
 }
 
-void cc::CCDialect::registerTypes() {
+void CCDialect::registerTypes() {
   addTypes();
 }
 
-} // namespace cudaq
+} // namespace cudaq::cc
diff --git a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
index abea24d1d0..3a939f57af 100644
--- a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
+++ b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
@@ -94,7 +94,7 @@ bool quake::isSupportedMappingOperation(Operation *op) {
   return isa(op);
 }
 
-mlir::ValueRange quake::getQuantumTypesFromRange(mlir::ValueRange range) {
+ValueRange quake::getQuantumTypesFromRange(ValueRange range) {
   // Skip over classical types at the beginning
   int numClassical = 0;
@@ -105,7 +105,7 @@ mlir::ValueRange quake::getQuantumTypesFromRange(mlir::ValueRange range) {
     break;
   }
 
-  mlir::ValueRange retVals = range.drop_front(numClassical);
+  ValueRange retVals = range.drop_front(numClassical);
 
   // Make sure all remaining operands are quantum
   for (auto operand : retVals)
@@ -115,17 +115,16 @@ mlir::ValueRange quake::getQuantumTypesFromRange(mlir::ValueRange range) {
   return retVals;
 }
 
-mlir::ValueRange quake::getQuantumResults(Operation *op) {
+ValueRange quake::getQuantumResults(Operation *op) {
   return getQuantumTypesFromRange(op->getResults());
 }
 
-mlir::ValueRange quake::getQuantumOperands(Operation *op) {
+ValueRange quake::getQuantumOperands(Operation *op) {
   return getQuantumTypesFromRange(op->getOperands());
 }
 
 LogicalResult quake::setQuantumOperands(Operation *op, ValueRange quantumVals) {
-  mlir::ValueRange quantumOperands =
-      getQuantumTypesFromRange(op->getOperands());
+  ValueRange quantumOperands = getQuantumTypesFromRange(op->getOperands());
   if (quantumOperands.size() != quantumVals.size())
     return failure();
@@ -183,6 +182,14 @@ LogicalResult quake::AllocaOp::verify() {
       }
     }
   }
+
+  // Check the uses. If any use is an InitializeStateOp, then it must be the
+  // only use.
+  Operation *self = getOperation();
+  if (!self->getUsers().empty() && !self->hasOneUse())
+    for (auto *op : self->getUsers())
+      if (isa<quake::InitializeStateOp>(op))
+        return emitOpError("init_state must be the only use");
   return success();
 }
 
@@ -194,6 +201,15 @@ void quake::AllocaOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add(context);
 }
 
+quake::InitializeStateOp quake::AllocaOp::getInitializedState() {
+  auto *self = getOperation();
+  if (self->hasOneUse()) {
+    auto x = self->getUsers().begin();
+    return dyn_cast<quake::InitializeStateOp>(*x);
+  }
+  return {};
+}
+
 //===----------------------------------------------------------------------===//
 // Apply
 //===----------------------------------------------------------------------===//
@@ -480,6 +496,19 @@ LogicalResult quake::ExtractRefOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// InitializeStateOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult quake::InitializeStateOp::verify() {
+  auto veqTy = cast<quake::VeqType>(getTargets().getType());
+  if (veqTy.hasSpecifiedSize())
+    if (!std::has_single_bit(veqTy.getSize()))
+      return emitOpError("initialize state vector must be a power of 2, but "
+                         "is " + std::to_string(veqTy.getSize()) + " instead");
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // RelaxSizeOp
 //===----------------------------------------------------------------------===//
@@ -694,8 +723,8 @@ static LogicalResult getParameterAsDouble(Value parameter, double &result) {
   auto paramDefOp = parameter.getDefiningOp();
   if (!paramDefOp)
     return failure();
-  if (auto constOp = mlir::dyn_cast<arith::ConstantOp>(paramDefOp)) {
-    if (auto value = dyn_cast<FloatAttr>(constOp.getValue())) {
+  if (auto constOp = dyn_cast<arith::ConstantOp>(paramDefOp)) {
+    if (auto value = dyn_cast<FloatAttr>(constOp.getValue())) {
       result = value.getValueAsDouble();
       return success();
     }
diff --git a/lib/Optimizer/Transforms/AddDeallocs.cpp b/lib/Optimizer/Transforms/AddDeallocs.cpp
index d9c06ca05a..4fc41b83a8 100644
--- a/lib/Optimizer/Transforms/AddDeallocs.cpp
+++ b/lib/Optimizer/Transforms/AddDeallocs.cpp
@@ -49,7 +49,11 @@ struct DeallocationAnalysisInfo {
         deallocMap.insert(std::make_pair(&op, false));
       } else if (auto dealloc = dyn_cast<quake::DeallocOp>(op)) {
         auto val = dealloc.getReference();
-        Operation *alloc = cast<quake::AllocaOp>(val.getDefiningOp());
+        Operation *alloc = val.getDefiningOp();
+        if (!isa<quake::AllocaOp>(alloc)) {
+          auto initState = cast<quake::InitializeStateOp>(alloc);
+          alloc = initState.getTargets().getDefiningOp();
+        }
         if (deallocMap.count(alloc))
           deallocMap[alloc] = true;
         else
@@ -99,6 +103,8 @@ class DeallocationAnalysis {
         }
       } else if (auto dealloc = dyn_cast<quake::DeallocOp>(o)) {
         auto val = dealloc.getReference();
+        if (auto init = val.getDefiningOp<quake::InitializeStateOp>())
+          val = init.getTargets();
         if (auto alloc = val.getDefiningOp<quake::AllocaOp>()) {
           auto *op = alloc.getOperation();
           if (allocMap.count(op))
@@ -121,8 +127,16 @@ class DeallocationAnalysis {
 
 inline void generateDeallocsForSet(PatternRewriter &rewriter,
                                    llvm::DenseSet<Operation *> &allocSet) {
-  for (Operation *a : allocSet)
-    rewriter.create<quake::DeallocOp>(a->getLoc(), cast<quake::AllocaOp>(a));
+  for (Operation *a : allocSet) {
+    auto alloc = cast<quake::AllocaOp>(a);
+    Value v = alloc;
+    if (a->hasOneUse()) {
+      if (auto initState =
+              dyn_cast<quake::InitializeStateOp>(*a->getUsers().begin()))
+        v = initState;
+    }
+    rewriter.create<quake::DeallocOp>(a->getLoc(), v);
+  }
 }
 
 // The different rewrite cases involve the same work, but use different types.
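+// Illustrative effect of the analysis above (a sketch; op spellings assumed
+// from the quake dialect used in this PR): when an alloca's single use is an
+// init_state, the generated dealloc now refers to the init_state result
+// rather than the alloca itself:
+//   %q = quake.alloca !quake.veq<2>
+//   %v = quake.init_state %q, %data
+//   ...
+//   quake.dealloc %v   // previously: quake.dealloc %q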
diff --git a/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp b/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp
index 06cdc68161..abd04d6f58 100644
--- a/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp
+++ b/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp
@@ -165,7 +165,7 @@ class CombineQuantumAllocationsPass
     for (auto &block : func.getRegion())
       for (auto &op : block) {
         if (auto alloc = dyn_cast_or_null<quake::AllocaOp>(&op)) {
-          if (alloc.getSize())
+          if (alloc.getSize() || alloc.hasInitializedState())
             return;
           analysis.allocations.push_back(alloc);
           auto size = allocationSize(alloc);
diff --git a/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp b/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp
index 59fe5ef137..d597bad483 100644
--- a/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp
+++ b/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp
@@ -123,6 +123,8 @@ class FactorQuantumAllocationsPass
     ConversionTarget target(*ctx);
     target.addLegalDialect();
     target.addDynamicallyLegalOp<quake::DeallocOp>([](quake::DeallocOp d) {
+      if (d.getReference().getDefiningOp<quake::InitializeStateOp>())
+        return true;
       if (auto ty = dyn_cast<quake::VeqType>(d.getReference().getType()))
         return !ty.hasSpecifiedSize();
       return true;
@@ -147,6 +149,8 @@ class FactorQuantumAllocationsPass
                  allocations.end();
     });
     target.addDynamicallyLegalOp<quake::DeallocOp>([](quake::DeallocOp d) {
+      if (d.getReference().getDefiningOp<quake::InitializeStateOp>())
+        return true;
       if (auto ty = dyn_cast<quake::VeqType>(d.getReference().getType()))
         return !ty.hasSpecifiedSize();
       return true;
@@ -164,7 +168,8 @@ class FactorQuantumAllocationsPass
   LogicalResult runAnalysis(SmallVector<quake::AllocaOp> &allocations) {
     auto func = getOperation();
     func.walk([&](quake::AllocaOp alloc) {
-      if (!allocaOfVeq(alloc) || allocaOfUnspecifiedSize(alloc))
+      if (!allocaOfVeq(alloc) || allocaOfUnspecifiedSize(alloc) ||
+          alloc.hasInitializedState())
         return;
       bool usesAreConvertible = [&]() {
         for (auto *users : alloc->getUsers()) {
diff --git a/lib/Optimizer/Transforms/LowerUnwind.cpp b/lib/Optimizer/Transforms/LowerUnwind.cpp
index a9767e3834..2e3ba4f216 100644
--- a/lib/Optimizer/Transforms/LowerUnwind.cpp
+++ b/lib/Optimizer/Transforms/LowerUnwind.cpp
@@ -321,6 +321,16 @@ static bool anyPrimitiveAncestor(
   return false;
 }
 
+static Value adjustedDeallocArg(quake::AllocaOp alloc) {
+  if (auto init = alloc.getInitializedState())
+    return init.getResult();
+  return alloc.getResult();
+}
+
+static Value adjustedDeallocArg(Operation *op) {
+  return adjustedDeallocArg(cast<quake::AllocaOp>(op));
+}
+
 namespace {
 /// A scope op that contains an unwind op and is contained by a loop (for break
 /// or continue) or for return always, dictates that the unwind op must transfer
@@ -366,7 +376,7 @@ struct ScopeOpPattern : public OpRewritePattern {
       auto *contOp = pr.first;
       rewriter.setInsertionPoint(contOp);
       for (auto a : llvm::reverse(pr.second))
-        rewriter.create<quake::DeallocOp>(a.getLoc(), a.getResult());
+        rewriter.create<quake::DeallocOp>(a.getLoc(), adjustedDeallocArg(a));
       rewriter.replaceOpWithNewOp<cf::BranchOp>(contOp, nextBlock,
                                                 contOp->getOperands());
     }
@@ -380,7 +390,7 @@ struct ScopeOpPattern : public OpRewritePattern {
     if (Block *blk = blockInfo.continueBlock) {
       rewriter.setInsertionPointToEnd(blk);
       for (auto a : llvm::reverse(qallocas))
-        rewriter.create<quake::DeallocOp>(a->getLoc(), a->getResult(0));
+        rewriter.create<quake::DeallocOp>(a->getLoc(), adjustedDeallocArg(a));
       if (asPrimitive) {
         Block *landingPad = getLandingPad(infoMap, scope).continueBlock;
         rewriter.create<cf::BranchOp>(loc, landingPad, blk->getArguments());
@@ -393,7 +403,7 @@ struct ScopeOpPattern : public OpRewritePattern {
     if (Block *blk = blockInfo.breakBlock) {
       rewriter.setInsertionPointToEnd(blk);
       for (auto a : llvm::reverse(qallocas))
-        rewriter.create<quake::DeallocOp>(a->getLoc(), a->getResult(0));
+        rewriter.create<quake::DeallocOp>(a->getLoc(), adjustedDeallocArg(a));
       if (asPrimitive) {
         Block *landingPad = getLandingPad(infoMap, scope).breakBlock;
         rewriter.create<cf::BranchOp>(loc, landingPad, blk->getArguments());
@@ -406,7 +416,7 @@ struct ScopeOpPattern : public OpRewritePattern {
     if (Block *blk = blockInfo.returnBlock) {
       rewriter.setInsertionPointToEnd(blk);
       for (auto a : llvm::reverse(qallocas))
-        rewriter.create<quake::DeallocOp>(a->getLoc(), a->getResult(0));
+        rewriter.create<quake::DeallocOp>(a->getLoc(), adjustedDeallocArg(a));
       assert(asPrimitive);
       Block *landingPad = getLandingPad(infoMap, scope).returnBlock;
       rewriter.create<cf::BranchOp>(loc, landingPad, blk->getArguments());
@@ -458,7 +468,7 @@ struct FuncLikeOpPattern : public OpRewritePattern {
       auto *exitOp = pr.first;
       rewriter.setInsertionPoint(exitOp);
       for (auto a : llvm::reverse(pr.second))
-        rewriter.create<quake::DeallocOp>(a.getLoc(), a.getResult());
+        rewriter.create<quake::DeallocOp>(a.getLoc(), adjustedDeallocArg(a));
     }
 
     // Here, we handle the unwind return jumps.
@@ -477,7 +487,7 @@ struct FuncLikeOpPattern : public OpRewritePattern {
     if (Block *exitBlock = blockInfo.returnBlock) {
       rewriter.setInsertionPointToEnd(exitBlock);
       for (auto a : llvm::reverse(qallocas))
-        rewriter.create<quake::DeallocOp>(a->getLoc(), a->getResult(0));
+        rewriter.create<quake::DeallocOp>(a->getLoc(), adjustedDeallocArg(a));
       rewriter.create<func::ReturnOp>(func.getLoc(), exitBlock->getArguments());
       func.getBody().push_back(exitBlock);
     }
diff --git a/python/cudaq/__init__.py b/python/cudaq/__init__.py
index ee0f389a01..e6dc131886 100644
--- a/python/cudaq/__init__.py
+++ b/python/cudaq/__init__.py
@@ -6,7 +6,7 @@
 # the terms of the Apache License 2.0 which accompanies this distribution.    #
 # ============================================================================ #
-import sys, os
+import sys, os, numpy
 from ._packages import *
 from .kernel.kernel_decorator import kernel, PyKernelDecorator
 from .kernel.kernel_builder import make_kernel, QuakeValue, PyKernel
@@ -87,6 +87,26 @@ def synthesize(kernel, *args):
                                       kernelName=kernel.name)
 
 
+def simulation_dtype():
+    """
+    Return the data type for the current simulation backend,
+    either `numpy.complex128` or `numpy.complex64`.
+    """
+    target = get_target()
+    precision = target.get_precision()
+    if precision == cudaq_runtime.SimulationPrecision.fp64:
+        return numpy.complex128
+    return numpy.complex64
+
+
+def create_state(array_data):
+    """
+    Create a state array with the appropriate data type for the
+    current simulation backend target.
+    """
+    return numpy.array(array_data, dtype=simulation_dtype())
+
+
 def __clearKernelRegistries():
     global globalKernelRegistry, globalAstRegistry
     globalKernelRegistry.clear()
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index c07c737b56..5a74a35bb2 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -17,7 +17,7 @@
 from ..mlir.passmanager import *
 from ..mlir.dialects import quake, cc
 from ..mlir.dialects import builtin, func, arith, math, complex
-from ..mlir._mlir_libs._quakeDialects import cudaq_runtime, load_intrinsic
+from ..mlir._mlir_libs._quakeDialects import cudaq_runtime, load_intrinsic, register_all_dialects
 
 # This file implements the CUDA Quantum Python AST to MLIR conversion.
 # It provides a `PyASTBridge` class that implements the `ast.NodeVisitor` type
@@ -115,6 +115,7 @@ def __init__(self, **kwargs):
             self.loc = Location.unknown(context=self.ctx)
         else:
             self.ctx = Context()
+            register_all_dialects(self.ctx)
             quake.register_dialect(self.ctx)
             cc.register_dialect(self.ctx)
             cudaq_runtime.registerLLVMDialectTranslation(self.ctx)
@@ -245,11 +246,27 @@ def getFloatAttr(self, type, value):
     def getConstantFloat(self, value):
         """
         Create a constant float operation and return its MLIR result Value.
-        Takes as input the concrete float value. 
+        Takes as input the concrete float value.
         """
         ty = self.getFloatType()
         return arith.ConstantOp(ty, self.getFloatAttr(ty, value)).result
 
+    def getComplexType(self):
+        """
+        Return an MLIR complex type (double precision).
+        """
+        return ComplexType.get(self.getFloatType())
+
+    def getConstantComplex(self, value):
+        """
+        Create a constant complex operation and return its MLIR result Value.
+        Takes as input the concrete complex value.
+        """
+        ty = self.getComplexType()
+        return complex.CreateOp(ty,
+                                self.getConstantFloat(value.real),
+                                self.getConstantFloat(value.imag)).result
+
     def getConstantInt(self, value, width=64):
         """
         Create a constant integer operation and return its MLIR result Value.
@@ -884,7 +901,7 @@ def visit_Assign(self, node):
 
         self.visit(node.value)
         if len(self.valueStack) == 0:
-            self.emitFatalError("invalid assignement detected.", node)
+            self.emitFatalError("invalid assignment detected.", node)
 
         varNames = []
         varValues = []
@@ -930,7 +947,7 @@ def visit_Attribute(self, node):
         see from ubiquitous external modules like `numpy`.
         """
         if self.verbose:
-            print('[Visit Attribute]')
+            print(f'[Visit Attribute {node.attr}]')
 
         self.currentNode = node
 
         # Disallow list.append since we don't do dynamic memory allocation
@@ -2725,6 +2742,21 @@ def visit_Continue(self, node):
         else:
             cc.ContinueOp([])
 
+    def promote_operand_type(self, ty, operand):
+        if ComplexType.isinstance(ty):
+            if not ComplexType.isinstance(operand.type):
+                if IntegerType.isinstance(operand.type):
+                    operand = arith.SIToFPOp(self.getFloatType(), operand).result
+                operand = complex.CreateOp(
+                    ComplexType.get(self.getFloatType()), operand,
+                    self.getConstantFloat(0.0)).result
+
+        if F64Type.isinstance(ty):
+            if IntegerType.isinstance(operand.type):
+                operand = arith.SIToFPOp(ty, operand).result
+
+        return operand
+
     def visit_BinOp(self, node):
         """
         Visit binary operation nodes in the AST and map them to equivalents in the
@@ -2757,16 +2789,22 @@ def visit_BinOp(self, node):
             raise RuntimeError("Invalid type for Binary Op {} ({}, {})".format(
                 type(node.op), left, right))
 
-        # Basedon the op type and the leaf types, create the MLIR operator
+        # Type promotion for Add, Sub, Div, Mult operations
+        if isinstance(node.op, (ast.Add, ast.Sub, ast.Mult, ast.Div)):
+            right = self.promote_operand_type(left.type, right)
+            left = self.promote_operand_type(right.type, left)
+
+        # Based on the op type and the leaf types, create the MLIR operator
         if isinstance(node.op, ast.Add):
             if IntegerType.isinstance(left.type):
                 self.pushValue(arith.AddIOp(left, right).result)
                 return
             elif F64Type.isinstance(left.type):
-                if IntegerType.isinstance(right.type):
-                    right = arith.SIToFPOp(left.type, right).result
                 self.pushValue(arith.AddFOp(left, right).result)
                 return
+            elif ComplexType.isinstance(left.type):
+                self.pushValue(complex.AddOp(left, right).result)
+                return
             else:
                 self.emitFatalError("unhandled BinOp.Add types.", node)
 
@@ -2774,6 +2812,11 @@
             if IntegerType.isinstance(left.type):
                 self.pushValue(arith.SubIOp(left, right).result)
                 return
+            if F64Type.isinstance(left.type):
+                self.pushValue(arith.SubFOp(left, right).result)
+                return
+            if ComplexType.isinstance(left.type):
+                self.pushValue(complex.SubOp(left, right).result)
+                return
             else:
                 self.emitFatalError("unhandled BinOp.Sub types.", node)
         if isinstance(node.op, ast.FloorDiv):
@@ -2784,17 +2827,6 @@ def visit_BinOp(self, node):
                 self.emitFatalError("unhandled BinOp.FloorDiv types.", node)
         if isinstance(node.op, ast.Div):
             if ComplexType.isinstance(left.type):
-                if not ComplexType.isinstance(right.type):
-                    right = complex.CreateOp(
-                        ComplexType.get(self.getFloatType()), right,
-                        self.getConstantFloat(0.0)).result
-                self.pushValue(complex.DivOp(left, right).result)
-                return
-            if ComplexType.isinstance(right.type):
-                if not ComplexType.isinstance(left.type):
-                    left = complex.CreateOp(
-                        ComplexType.get(self.getFloatType()), left,
-                        self.getConstantFloat(0.0)).result
                 self.pushValue(complex.DivOp(left, right).result)
                 return
@@ -2828,26 +2860,16 @@ def visit_BinOp(self, node):
                 return
         if isinstance(node.op, ast.Mult):
             if ComplexType.isinstance(left.type):
-                if not ComplexType.isinstance(right.type):
-                    if IntegerType.isinstance(right.type):
-                        right = arith.SIToFPOp(self.getFloatType(),
-                                               right).result
-                    right = complex.CreateOp(left.type, right,
-                                             self.getConstantFloat(0.)).result
                 self.pushValue(complex.MulOp(left, right).result)
                 return
             if F64Type.isinstance(left.type):
-                if not F64Type.isinstance(right.type):
-                    right = arith.SIToFPOp(self.getFloatType(), right).result
+                self.pushValue(arith.MulFOp(left, right).result)
+                return
             if IntegerType.isinstance(left.type):
-                if not IntegerType.isinstance(right.type):
-                    right = arith.FPToSIOp(left.type, right).result
                 self.pushValue(arith.MulIOp(left, right).result)
                 return
-
-            self.pushValue(arith.MulFOp(left, right).result)
             return
         if isinstance(node.op, ast.Mod):
             if F64Type.isinstance(left.type):
@@ -2895,9 +2917,10 @@ def visit_Name(self, node):
 
         if node.id in self.capturedVars:
             # Only support a small subset of types here
+            complexType = type(1j)
             value = self.capturedVars[node.id]
             if isinstance(value, list) and isinstance(value[0],
-                                                      (int, bool, float)):
+                                                      (int, bool, float, complexType)):
                 elementValues = None
                 if isinstance(value[0], float):
                     elementValues = [self.getConstantFloat(el) for el in value]
@@ -2905,6 +2928,8 @@ def visit_Name(self, node):
                     elementValues = [self.getConstantInt(el) for el in value]
                 elif isinstance(value[0], bool):
                     elementValues = [self.getConstantInt(el, 1) for el in value]
+                elif isinstance(value[0], complexType):
+                    elementValues = [self.getConstantComplex(el) for el in value]
 
                 if elementValues != None:
                     self.dependentCaptureVars[node.id] = value
@@ -2922,6 +2947,8 @@ def visit_Name(self, node):
                     mlirValCreator = lambda: self.getConstantInt(value, 1)
                 elif isinstance(value, float):
                     mlirValCreator = lambda: self.getConstantFloat(value)
+                elif isinstance(value, complexType):
+                    mlirValCreator = lambda: self.getConstantComplex(value)
 
                 if mlirValCreator != None:
                     with InsertionPoint.at_block_begin(self.entry):
@@ -2935,8 +2962,12 @@ def visit_Name(self, node):
                     self.pushValue(stackSlot)
                     return
 
+            errorType = type(value).__name__
+            if isinstance(value, list):
+                errorType = f"{errorType}[{type(value[0]).__name__}]"
+
             self.emitFatalError(
-                f"Invalid type for variable ({node.id}) captured from parent scope (only int, bool, float, and list[int|bool|float] accepted, type was {type(value)}).",
+                f"Invalid type for variable ({node.id}) captured from parent scope (only int, bool, float, complex, and list[int|bool|float|complex] accepted, type was {errorType}).",
                 node)
 
         # Throw an exception for the case that the name is not
diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py
index 0b238065cb..30e0a266a2 100644
--- a/python/cudaq/kernel/kernel_builder.py
+++ b/python/cudaq/kernel/kernel_builder.py
@@ -7,10 +7,12 @@
 # ============================================================================ #
 
 from functools import partialmethod
+import hashlib
 import random
 import re
 import string
 import sys
+import numpy as np
 from typing import get_origin, List
 from .quake_value import QuakeValue
 from .kernel_decorator import PyKernelDecorator
@@ -22,8 +24,8 @@
 from ..mlir.passmanager import *
 from ..mlir.execution_engine import *
 from ..mlir.dialects import quake, cc
-from ..mlir.dialects import builtin, func, arith
-from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
+from ..mlir.dialects import builtin, func, arith, math
+from ..mlir._mlir_libs._quakeDialects import cudaq_runtime, register_all_dialects
 
 ## [PYTHON_VERSION_FIX]
@@ -175,6 +177,14 @@ def __singleTargetSingleParameterControlOperation(self,
                                 context=self.ctx)
 
 
+def supportCommonCast(mlirType, otherTy, arg, FromType, ToType, PyType):
+    argEleTy = cc.StdvecType.getElementType(mlirType)
+    eleTy = cc.StdvecType.getElementType(otherTy)
+    if ToType.isinstance(eleTy) and FromType.isinstance(argEleTy):
+        return [PyType(i) for i in arg]
+    return None
+
+
 class PyKernel(object):
     """
     The :class:`Kernel` provides an API for dynamically constructing quantum
@@ -191,10 +201,12 @@ class PyKernel(object):
 
     def __init__(self, argTypeList):
         self.ctx = Context()
+        register_all_dialects(self.ctx)
         quake.register_dialect(self.ctx)
         cc.register_dialect(self.ctx)
         cudaq_runtime.registerLLVMDialectTranslation(self.ctx)
 
+        self.stateHashes = []
         self.metadata = {'conditionalOnMeasure': False}
         self.regCounter = 0
         self.loc = Location.unknown(context=self.ctx)
@@ -234,6 +246,13 @@ def __init__(self, argTypeList):
 
             self.insertPoint = InsertionPoint.at_block_begin(e)
 
+    def __del__(self):
+        """
+        When a kernel builder is deleted we need to clean up
+        any state data if there is any.
+        """
+        cudaq_runtime.deletePointersToStateData(self.stateHashes)
+
     def __processArgType(self, ty):
         """
         Process input argument type. Specifically, try to infer the
@@ -247,6 +266,8 @@ def __processArgType(self, ty):
             'int': int,
             'bool': bool,
             'float': float,
+            'complex': complex,
+            'numpy.complex64': np.complex64,
             'pauli_word': cudaq_runtime.pauli_word
         }
         # Infer the slice type
@@ -516,13 +537,13 @@ def __str__(self, canonicalize=True):
             return str(cloned)
         return str(self.module)
 
-    def qalloc(self, size=None):
+    def qalloc(self, initializer=None):
         """
-        Allocate a register of qubits of size `qubit_count` and return a
-        handle to them as a :class:`QuakeValue`.
+        Allocate a register of qubits and return a handle to them as a
+        :class:`QuakeValue`.
 
         Args:
-            qubit_count (Union[`int`,`QuakeValue`): The number of qubits to allocate.
+            initializer (Union[`int`, `QuakeValue`, `list[T]`, `np.ndarray`]): The number of qubits to allocate, or a concrete state to allocate and initialize the qubits with.
 
         Returns:
             :class:`QuakeValue`: A handle to the allocated qubits in the MLIR.
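+        # Illustrative usage of the initializer overloads documented above (a
+        # sketch, not part of the builder API docs; assumes a simulator target
+        # is active and uses the `simulation_dtype` helper added in this PR):
+        #
+        #   import numpy as np, cudaq
+        #   kernel = cudaq.make_kernel()
+        #   qubits = kernel.qalloc(np.array([.70710678, 0., 0., .70710678],
+        #                                   dtype=cudaq.simulation_dtype()))
+        #   kernel.mz(qubits)
+        #   counts = cudaq.sample(kernel)   # expect '00' and '11' outcomes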
@@ -533,18 +554,150 @@ def qalloc(self, size=None):
         ```
         """
         with self.insertPoint, self.loc:
-            if size == None:
+            # If the initializer is an integer, create `veq`
+            if isinstance(initializer, int):
+                veqTy = quake.VeqType.get(self.ctx, initializer)
+                return self.__createQuakeValue(quake.AllocaOp(veqTy).result)
+
+            if isinstance(initializer, list):
+                initializer = np.array(initializer, dtype=type(initializer[0]))
+
+            if isinstance(initializer, np.ndarray):
+                if len(initializer.shape) != 1:
+                    raise RuntimeError(
+                        "invalid initializer for qalloc (np.ndarray must be 1D, vector-like)"
+                    )
+
+                if initializer.dtype not in [
+                        complex, np.complex128, np.complex64
+                ]:
+                    raise RuntimeError(
+                        "qalloc state data must be of complex dtype.")
+
+                # Get the current simulation precision
+                currentTarget = cudaq_runtime.get_target()
+                simulationPrecision = currentTarget.get_precision()
+                if initializer.dtype in [np.complex128, complex]:
+                    if simulationPrecision == cudaq_runtime.SimulationPrecision.fp32:
+                        raise RuntimeError(
+                            "qalloc input state is complex128 but the simulator uses complex64 (fp32) precision."
+                        )
+
+                if initializer.dtype == np.complex64:
+                    if simulationPrecision == cudaq_runtime.SimulationPrecision.fp64:
+                        raise RuntimeError(
+                            "qalloc input state is complex64 but the simulator uses complex128 (fp64) precision."
+                        )
+
+                # Compute a unique hash string for the state data
+                hashValue = hashlib.sha1(initializer).hexdigest(
+                )[:10] + self.name.removeprefix('__nvqppBuilderKernel_')
+
+                # Get the size of the array
+                size = len(initializer)
+
+                floatType = F64Type.get(
+                    self.ctx
+                ) if simulationPrecision == cudaq_runtime.SimulationPrecision.fp64 else F32Type.get(
+                    self.ctx)
+                complexType = ComplexType.get(floatType)
+                ptrComplex = cc.PointerType.get(self.ctx, complexType)
+                i32Ty = self.getIntegerType(32)
+                globalTy = cc.StructType.get(self.ctx, [ptrComplex, i32Ty])
+                globalName = f'nvqpp.state.{hashValue}'
+                setStateName = f'nvqpp.set.state.{hashValue}'
+                with InsertionPoint.at_block_begin(self.module.body):
+                    cc.GlobalOp(TypeAttr.get(globalTy),
+                                globalName,
+                                external=True)
+                    setStateFunc = func.FuncOp(setStateName,
+                                               FunctionType.get(
+                                                   inputs=[ptrComplex],
+                                                   results=[]),
+                                               loc=self.loc)
+                    entry = setStateFunc.add_entry_block()
+                    kDynamicPtrIndex: int = -2147483648
+                    with InsertionPoint(entry):
+                        zero = self.getConstantInt(0)
+                        address = cc.AddressOfOp(
+                            cc.PointerType.get(self.ctx, globalTy),
+                            FlatSymbolRefAttr.get(globalName))
+                        ptr = cc.ComputePtrOp(
+                            cc.PointerType.get(self.ctx, ptrComplex), address,
+                            [zero, zero],
+                            DenseI32ArrayAttr.get(
+                                [kDynamicPtrIndex, kDynamicPtrIndex],
+                                context=self.ctx))
+                        cc.StoreOp(entry.arguments[0], ptr)
+                        func.ReturnOp([])
+
+                zero = self.getConstantInt(0)
+                numQubits = np.log2(size)
+                if not numQubits.is_integer():
+                    raise RuntimeError(
+                        "invalid input state size for qalloc (not a power of 2)"
+                    )
+
+                # Check that the state is normalized
+                norm = sum([np.conj(a) * a for a in initializer])
+                if np.abs(norm.imag) > 1e-4 or np.abs(1. - norm.real) > 1e-4:
+                    raise RuntimeError(
+                        "invalid input state for qalloc (not normalized)")
+
+                veqTy = quake.VeqType.get(self.ctx, int(numQubits))
+                qubits = quake.AllocaOp(veqTy).result
+                address = cc.AddressOfOp(cc.PointerType.get(self.ctx, globalTy),
+                                         FlatSymbolRefAttr.get(globalName))
+                ptr = cc.ComputePtrOp(
+                    cc.PointerType.get(self.ctx, ptrComplex), address,
+                    [zero, zero],
+                    DenseI32ArrayAttr.get([kDynamicPtrIndex, kDynamicPtrIndex],
+                                          context=self.ctx))
+                loaded = cc.LoadOp(ptr)
+                qubits = quake.InitializeStateOp(qubits.type, qubits,
+                                                 loaded).result
+
+                # Record the unique hash value
+                if hashValue not in self.stateHashes:
+                    self.stateHashes.append(hashValue)
+
+                # Store the pointer to the array data, tagged with the
+                # simulation precision validated above.
+                cudaq_runtime.storePointerToStateData(
+                    self.name, hashValue, initializer, simulationPrecision)
+
+                return self.__createQuakeValue(qubits)
+
+            # If the initializer is a QuakeValue, see if it is
+            # an integer or a `stdvec` type
+            if isinstance(initializer, QuakeValue):
+                veqTy = quake.VeqType.get(self.ctx)
+                if IntegerType.isinstance(initializer.mlirValue.type):
+                    # This is an integer size
+                    return self.__createQuakeValue(
+                        quake.AllocaOp(veqTy,
+                                       size=initializer.mlirValue).result)
+
+                if cc.StdvecType.isinstance(initializer.mlirValue.type):
+                    # This is a state to initialize to
+                    size = cc.StdvecSizeOp(self.getIntegerType(),
+                                           initializer.mlirValue).result
+                    numQubits = math.CountTrailingZerosOp(size).result
+                    qubits = quake.AllocaOp(veqTy, size=numQubits).result
+                    ptrTy = cc.PointerType.get(
+                        self.ctx,
+                        cc.StdvecType.getElementType(
+                            initializer.mlirValue.type))
+                    initials = cc.StdvecDataOp(ptrTy, initializer.mlirValue)
+                    quake.InitializeStateOp(veqTy, qubits, initials)
+                    return self.__createQuakeValue(qubits)
+
+            # If no initializer, create a single qubit
+            if initializer is None:
                 qubitTy = quake.RefType.get(self.ctx)
                 return self.__createQuakeValue(quake.AllocaOp(qubitTy).result)
-            else:
-                if isinstance(size, QuakeValue):
-                    veqTy = quake.VeqType.get(self.ctx)
-                    sizeVal = size.mlirValue
-                    return self.__createQuakeValue(
-                        quake.AllocaOp(veqTy, size=sizeVal).result)
-                else:
-                    veqTy = quake.VeqType.get(self.ctx, size)
-                    return self.__createQuakeValue(quake.AllocaOp(veqTy).result)
+
+            raise RuntimeError("invalid initializer argument for qalloc.")
 
     def __isPauliWordType(self, ty):
         """
@@ -1132,6 +1285,26 @@ def getListType(eleType: type):
                 continue
             listType = getListType(type(arg[0]))
             mlirType = mlirTypeFromPyType(argType, self.ctx)
+
+            if cc.StdvecType.isinstance(mlirType):
+                # Support passing `list[int]` to a `list[float]` argument
+                if cc.StdvecType.isinstance(self.mlirArgTypes[i]):
+                    maybeCasted = supportCommonCast(mlirType,
+                                                    self.mlirArgTypes[i], arg,
+                                                    IntegerType, F64Type, float)
+                    if maybeCasted is not None:
+                        processedArgs.append(maybeCasted)
+                        continue
+
+                    # Support passing `list[float]` to a `list[complex]` argument
+                    maybeCasted = supportCommonCast(mlirType,
+                                                    self.mlirArgTypes[i], arg,
+                                                    F64Type, ComplexType,
+                                                    complex)
+                    if maybeCasted is not None:
+                        processedArgs.append(maybeCasted)
+                        continue
+
             if mlirType != self.mlirArgTypes[
                     i] and listType != mlirTypeToPyType(self.mlirArgTypes[i]):
                 emitFatalError(
diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py
index 203e66eb5c..61dc288510 100644
--- a/python/cudaq/kernel/utils.py
+++ b/python/cudaq/kernel/utils.py
@@ -209,9 +209,12 @@ def mlirTypeFromPyType(argType, ctx, **kwargs):
             )
             return cc.StdvecType.get(ctx, mlirTypeFromPyType(float, ctx))
-        if isinstance(argInstance[0], complex):
+        if isinstance(argInstance[0], (complex, np.complex128)):
             return cc.StdvecType.get(ctx, mlirTypeFromPyType(complex, ctx))
 
+        if isinstance(argInstance[0], np.complex64):
+            return cc.StdvecType.get(ctx, ComplexType.get(F32Type.get(ctx)))
+
         if isinstance(argInstance[0], pauli_word):
             return cc.StdvecType.get(ctx, cc.CharspanType.get(ctx))
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index ea60a147ed..ca58f36236 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -29,6 +29,7 @@
 #include "mlir/Target/LLVMIR/Export.h"
 
 #include
+#include
 #include
 
 namespace py = pybind11;
@@ -37,6 +38,16 @@ using namespace mlir;
 
 namespace cudaq {
 static std::unique_ptr<JITExecutionCache> jitCache;
 
+struct PyStateVectorData {
+  void *data = nullptr;
+  simulation_precision precision = simulation_precision::fp32;
+  std::string kernelName;
+};
+using PyStateVectorStorage = std::map<std::string, PyStateVectorData>;
+
+static std::unique_ptr<PyStateVectorStorage> stateStorage =
+    std::make_unique<PyStateVectorStorage>();
+
 std::tuple
 jitAndCreateArgs(const std::string &name, MlirModule module,
                  cudaq::OpaqueArguments &runtimeArgs,
@@ -185,6 +196,28 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module,
 
   std::string properName = name;
 
+  // If we have any state vector data, we need to extract the function pointer
+  // to set that data, and then set it.
+  for (auto &[stateHash, svdata] : *stateStorage) {
+    if (svdata.kernelName != name)
+      continue;
+    auto setStateFPtr = jit->lookup("nvqpp.set.state." + stateHash);
+    if (!setStateFPtr)
+      throw std::runtime_error(
+          "python alt_launch_kernel failed to get set state function.");
+
+    if (svdata.precision == simulation_precision::fp64) {
+      auto setStateFunc =
+          reinterpret_cast<void (*)(std::complex<double> *)>(*setStateFPtr);
+      setStateFunc(reinterpret_cast<std::complex<double> *>(svdata.data));
+      continue;
+    }
+
+    auto setStateFunc =
+        reinterpret_cast<void (*)(std::complex<float> *)>(*setStateFPtr);
+    setStateFunc(reinterpret_cast<std::complex<float> *>(svdata.data));
+  }
+
   // Need to first invoke the init_func()
   auto kernelInitFunc = properName + ".init_func";
   auto initFuncPtr = jit->lookup(kernelInitFunc);
@@ -362,6 +395,7 @@ void bindAltLaunchKernel(py::module &mod) {
       },
       py::arg("kernelName"), py::arg("module"), py::kw_only(),
       py::arg("callable_names") = std::vector<std::string>{}, "DOC STRING");
+
   mod.def(
       "pyAltLaunchKernelR",
       [&](const std::string &kernelName, MlirModule module, MlirType returnType,
@@ -398,5 +432,28 @@ void bindAltLaunchKernel(py::module &mod) {
         return getQIRLL(name, module, args, profile);
       },
      py::arg("kernel"), py::kw_only(), py::arg("profile") = "");
+
+  mod.def(
+      "storePointerToStateData",
+      [](const std::string &name, const std::string &hash, py::buffer data,
+         simulation_precision precision) {
+        auto ptr = data.request().ptr;
+        stateStorage->insert({hash, PyStateVectorData{ptr, precision, name}});
+      },
+      "Store qalloc state initialization array data.");
+
+  mod.def(
+      "deletePointersToStateData",
+      [](const std::vector<std::string> &hashes) {
+        for (auto iter = stateStorage->cbegin(); iter != stateStorage->end();) {
+          if (std::find(hashes.begin(), hashes.end(), iter->first) !=
+              hashes.end()) {
+            stateStorage->erase(iter++);
+            continue;
+          }
+          iter++;
+        }
+      },
+      "Remove our pointers to the qalloc array data.");
 }
 } // namespace cudaq
\ No newline at end of file
diff --git a/python/runtime/cudaq/target/py_runtime_target.cpp b/python/runtime/cudaq/target/py_runtime_target.cpp
index fd36dbc157..320ef516fa 100644
--- a/python/runtime/cudaq/target/py_runtime_target.cpp
+++ b/python/runtime/cudaq/target/py_runtime_target.cpp
@@ -17,6 +17,10 @@ namespace cudaq {
 
 void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder) {
+  py::enum_<simulation_precision>(mod, "SimulationPrecision")
+      .value("fp32", simulation_precision::fp32)
+      .value("fp64", simulation_precision::fp64);
+
   py::class_<cudaq::RuntimeTarget>(
       mod, "Target",
       "The `cudaq.Target` represents the underlying infrastructure that CUDA "
@@ -40,13 +44,17 @@ void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder) {
       .def("is_emulated", &cudaq::RuntimeTarget::is_emulated,
            "Returns true if the emulation mode for the target has been "
           "activated.")
+      .def("get_precision", &cudaq::RuntimeTarget::get_precision,
+           "Return the simulation precision for this target, either fp32 or "
+           "fp64.")
       .def(
           "__str__",
          [](cudaq::RuntimeTarget &self) {
-            return fmt::format("Target {}\n\tsimulator={}\n\tplatform={}"
-                               "\n\tdescription={}\n",
-                               self.name, self.simulatorName, self.platformName,
-                               self.description);
+            return fmt::format(
+                "Target {}\n\tsimulator={}\n\tplatform={}"
+                "\n\tdescription={}\n\tprecision={}\n",
+                self.name, self.simulatorName, self.platformName,
+                self.description,
+                self.get_precision() == simulation_precision::fp32 ? "fp32"
+                                                                   : "fp64");
          },
          "Persist the information in this `cudaq.Target` to a string.");
diff --git a/python/runtime/mlir/py_register_dialects.cpp b/python/runtime/mlir/py_register_dialects.cpp
index 0103f757ca..72301dff2c 100644
--- a/python/runtime/mlir/py_register_dialects.cpp
+++ b/python/runtime/mlir/py_register_dialects.cpp
@@ -12,9 +12,12 @@
 #include "cudaq/Optimizer/CAPI/Dialects.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/CodeGen/Pipelines.h"
+#include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/InitAllDialects.h"
 #include
 #include
 
@@ -232,5 +235,15 @@ void bindRegisterDialects(py::module &mod) {
         if (failed(builder.loadIntrinsic(unwrapped, name)))
           unwrapped.emitError("failed to load intrinsic " + name);
       });
+
+  mod.def("register_all_dialects", [](MlirContext context) {
+    DialectRegistry registry;
+    registry.insert<quake::QuakeDialect, cudaq::cc::CCDialect>();
+    cudaq::opt::registerCodeGenDialect(registry);
+    registerAllDialects(registry);
+    auto *mlirContext = unwrap(context);
+    mlirContext->appendDialectRegistry(registry);
+    mlirContext->loadAllAvailableDialects();
+  });
 }
 } // namespace cudaq
\ No newline at end of file
diff --git a/python/tests/builder/test_kernel_builder.py b/python/tests/builder/test_kernel_builder.py
index bf13e97b5e..2ba0313049 100644
--- a/python/tests/builder/test_kernel_builder.py
+++ b/python/tests/builder/test_kernel_builder.py
@@ -876,42 +876,139 @@ def test_recursive_calls():
     print(kernel3)
 
 
-## [SKIP_TEST]
-@pytest.mark.skip(
-    reason="AttributeError: module 'cudaq' has no attribute 'from_state'")
-def test_from_state():
+def can_set_target(name):
+    target_installed = True
+    try:
+        cudaq.set_target(name)
+    except RuntimeError:
+        target_installed = False
+    return target_installed
+
+
+skipIfNvidiaFP64NotInstalled = pytest.mark.skipif(
+    not can_set_target('nvidia-fp64'),
+    reason='Could not find nvidia-fp64 in installation')
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_from_state0():
     cudaq.reset_target()
-    state = np.asarray([.70710678, 0., 0., 0.70710678])
-    kernel = cudaq.make_kernel()
-    qubits = kernel.qalloc(2)
+    cudaq.set_target('nvidia-fp64')
 
-    cudaq.from_state(kernel, qubits, state)
+    kernel, initState = cudaq.make_kernel(list[complex])
+    qubits = kernel.qalloc(initState)
 
-    print(kernel)
+    # Test float64 list, casts to complex
+    state = [.70710678, 0., 0., 0.70710678]
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    # Test complex list
+    state = [.70710678j, 0., 0., 0.70710678]
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    # Test Numpy array
+    state = np.asarray([.70710678, 0., 0., 0.70710678])
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    # Now test constant array data, not kernel input
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=complex)
+    kernel = cudaq.make_kernel()
+    qubits = kernel.qalloc(state)
     counts = cudaq.sample(kernel)
     print(counts)
     assert '11' in counts
     assert '00' in counts
 
-    kernel = cudaq.from_state(state)
+    state = [.70710678 + 0j, 0., 0., 0.70710678]
+    kernel = cudaq.make_kernel()
+    qubits = kernel.qalloc(state)
     counts = cudaq.sample(kernel)
     print(counts)
     assert '11' in counts
     assert '00' in counts
 
-    hamiltonian = 5.907 - 2.1433 * spin.x(0) * spin.x(1) - 2.1433 * spin.y(
-        0) * spin.y(1) + .21829 * spin.z(0) - 6.125 * spin.z(1)
-    state = np.asarray([0., .292786, .956178, 0.])
+    state = np.array([.70710678, 0., 0., 0.70710678])
     kernel = cudaq.make_kernel()
-    qubits = kernel.qalloc(2)
-    cudaq.from_state(kernel, qubits, state)
-    energy = cudaq.observe(kernel, hamiltonian).expectation()
-    assert np.isclose(-1.748, energy, 1e-3)
+    with pytest.raises(RuntimeError) as e:
+        # float data and not complex data
+        qubits = kernel.qalloc(state)
+
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=np.complex64)
+    kernel = cudaq.make_kernel()
+    with pytest.raises(RuntimeError) as e:
+        # Wrong precision for fp64 simulator
+        qubits = kernel.qalloc(state)
+
+    with pytest.raises(RuntimeError) as e:
+        qubits = kernel.qalloc(np.array([1., 0., 0.], dtype=complex))
+
+
+skipIfNvidiaNotInstalled = pytest.mark.skipif(
+    not can_set_target('nvidia'),
+    reason='Could not find nvidia in installation')
+
+
+@skipIfNvidiaNotInstalled
+def test_from_state1():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=np.complex128)
     kernel = cudaq.make_kernel()
+    with pytest.raises(RuntimeError) as e:
+        qubits = kernel.qalloc(state)
+
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=np.complex64)
+    kernel2 = cudaq.make_kernel()
+    qubits = kernel2.qalloc(state)
+    counts = cudaq.sample(kernel2)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
 
-    ss = cudaq.get_state(kernel)
-    for i in range(4):
-        assert np.isclose(ss[i], state[i], 1e-3)
+    cudaq.reset_target()
+
+    # Regardless of the target precision, use
+    # cudaq.simulation_dtype() or cudaq.create_state()
+    state = np.array([.70710678, 0., 0., 0.70710678],
+                     dtype=cudaq.simulation_dtype())
+    kernel2 = cudaq.make_kernel()
+    qubits = kernel2.qalloc(state)
+    counts = cudaq.sample(kernel2)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    state = cudaq.create_state([.70710678, 0., 0., 0.70710678])
+    kernel2 = cudaq.make_kernel()
+    qubits = kernel2.qalloc(state)
+    counts = cudaq.sample(kernel2)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    state = cudaq.create_state(np.array([.5] * 4))
+    kernel2 = cudaq.make_kernel()
+    qubits = kernel2.qalloc(state)
+    counts = cudaq.sample(kernel2)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+    assert '01' in counts
+    assert '10' in counts
+
+    kernel, initState = cudaq.make_kernel(list[np.complex64])
+    qubits = kernel.qalloc(initState)
+    state = cudaq.create_state([.70710678, 0., 0., 0.70710678])
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
 
 
 @skipIfPythonLessThan39
@@ -925,12 +1022,13 @@ def test_pauli_word_input():
         1, 3, 3, -0.0454063, -0, 15
     ]
     h = cudaq.SpinOperator(h2_data, 4)
-    
+
     kernel, theta, paulis = cudaq.make_kernel(float, list[cudaq.pauli_word])
     q = kernel.qalloc(4)
     kernel.x(q[0])
     kernel.x(q[1])
-    kernel.for_loop(0, paulis.size(), lambda idx : kernel.exp_pauli(theta, q, paulis[idx]))
+    kernel.for_loop(0, paulis.size(),
+                    lambda idx: kernel.exp_pauli(theta, q, paulis[idx]))
 
     print(kernel)
     want_exp = cudaq.observe(kernel, h, .11, ['XXXY']).expectation()
diff --git a/python/tests/kernel/test_kernel_lists.py b/python/tests/kernel/test_kernel_lists.py
new file mode 100644
index 0000000000..f08f8190f4
--- /dev/null
+++ b/python/tests/kernel/test_kernel_lists.py
@@ -0,0 +1,87 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under    #
+# the terms of the Apache License 2.0 which accompanies this distribution.    #
+# ============================================================================ #
+
+import os, sys
+
+import pytest
+import numpy as np
+from typing import List
+
+import cudaq
+from cudaq import spin
+
+## [PYTHON_VERSION_FIX]
+skipIfPythonLessThan39 = pytest.mark.skipif(
+    sys.version_info < (3, 9),
+    reason="built-in collection types such as `list` not supported")
+
+
+def test_float_lists():
+    """Test that we can use float numbers inside kernel functions."""
+
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    f = [0., 1., 1., 0.]
+
+    # Pass list of float as a parameter
+    @cudaq.kernel
+    def test_float_vec_param(vec: list[float]):
+        f1 = vec
+
+    counts = cudaq.sample(test_float_vec_param, f)
+    assert len(counts) == 0
+
+    # Capture list of float
+    @cudaq.kernel
+    def test_float_vec_capture():
+        f1 = f
+
+    counts = cudaq.sample(test_float_vec_capture)
+    assert len(counts) == 0
+
+    # Define list of float inside kernel
+    @cudaq.kernel
+    def test_float_vec_definition():
+        f1 = [1.0, 0., 0., 1.]
+
+    counts = cudaq.sample(test_float_vec_definition)
+    assert len(counts) == 0
+
+
+def test_complex_lists():
+    """Test that we can use complex numbers inside kernel functions."""
+
+    # Pass list of complex as a parameter
+    c = [.70710678 + 0j, 0., 0., 0.70710678]
+
+    @cudaq.kernel
+    def test_complex_vec_param(vec: list[complex]):
+        f1 = vec
+
+    counts = cudaq.sample(test_complex_vec_param, c)
+    assert len(counts) == 0
+
+    # Capture list of complex
+    @cudaq.kernel
+    def test_complex_vec_capture():
+        f1 = c
+
+    counts = cudaq.sample(test_complex_vec_capture)
+    assert len(counts) == 0
+
+    # Define list of complex inside kernel
+    @cudaq.kernel
+    def test_complex_vec_definition():
+        f1 = [1.0 + 0j, 0., 0., 1.]
+
+
+    counts = cudaq.sample(test_complex_vec_definition)
+    assert len(counts) == 0
diff --git a/python/utils/LinkedLibraryHolder.cpp b/python/utils/LinkedLibraryHolder.cpp
index 9dacd8e57f..b7aff90760 100644
--- a/python/utils/LinkedLibraryHolder.cpp
+++ b/python/utils/LinkedLibraryHolder.cpp
@@ -29,6 +29,8 @@ constexpr static const char PLATFORM_LIBRARY[] = "PLATFORM_LIBRARY=";
 constexpr static const char NVQIR_SIMULATION_BACKEND[] =
     "NVQIR_SIMULATION_BACKEND=";
 constexpr static const char TARGET_DESCRIPTION[] = "TARGET_DESCRIPTION=";
+constexpr static const char IS_FP64_SIMULATION[] =
+    "CUDAQ_SIMULATION_SCALAR_FP64";
 
 /// @brief A utility function to check availability of Nvidia GPUs and return
 /// their count
@@ -75,6 +77,8 @@
 bool RuntimeTarget::is_emulated() { return platform.is_emulated(); }
 
+simulation_precision RuntimeTarget::get_precision() { return precision; }
+
 /// @brief Search the targets folder in the install for available targets.
 void findAvailableTargets(
     const std::filesystem::path &targetPath,
@@ -89,6 +93,7 @@ void findAvailableTargets(
     if (path.extension().string() == ".config") {
       bool isSimulationTarget = false;
       // Extract the target name from the file name
+      simulation_precision precision = simulation_precision::fp32;
       auto fileName = path.filename().string();
       auto targetName = std::regex_replace(fileName, std::regex(".config"), "");
       std::string platformName = "default", simulatorName = "qpp",
@@ -124,6 +129,8 @@ void findAvailableTargets(
           description.erase(
               std::remove(description.begin(), description.end(), '\"'),
               description.end());
+        } else if (line.find(IS_FP64_SIMULATION) != std::string::npos) {
+          precision = simulation_precision::fp64;
         }
       }
     }
@@ -131,14 +138,15 @@ void findAvailableTargets(
     cudaq::info("Found Target: {} -> (sim={}, platform={})", targetName,
                 simulatorName, platformName);
     // Add the target.
-    targets.emplace(targetName, RuntimeTarget{targetName, simulatorName,
-                                              platformName, description});
+    targets.emplace(targetName,
+                    RuntimeTarget{targetName, simulatorName, platformName,
+                                  description, precision});
     if (isSimulationTarget) {
       cudaq::info("Found Simulation target: {} -> (sim={}, platform={})",
                   targetName, simulatorName, platformName);
-      simulationTargets.emplace(targetName,
-                                RuntimeTarget{targetName, simulatorName,
-                                              platformName, description});
+      simulationTargets.emplace(
+          targetName, RuntimeTarget{targetName, simulatorName, platformName,
+                                    description, precision});
       isSimulationTarget = false;
     }
   }
diff --git a/python/utils/LinkedLibraryHolder.h b/python/utils/LinkedLibraryHolder.h
index 2e35ab7a2c..7477ca0c79 100644
--- a/python/utils/LinkedLibraryHolder.h
+++ b/python/utils/LinkedLibraryHolder.h
@@ -8,6 +8,7 @@
 #pragma once
 
+#include "cudaq/host_config.h"
 #include
 #include
 #include
@@ -30,11 +31,13 @@ struct RuntimeTarget {
   std::string simulatorName;
   std::string platformName;
   std::string description;
+  simulation_precision precision;
 
   /// @brief Return the number of QPUs this target exposes.
   std::size_t num_qpus();
   bool is_remote();
   bool is_emulated();
+  simulation_precision get_precision();
 };
 
 /// @brief The LinkedLibraryHolder provides a mechanism for
diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h
index 7bed619cb0..d1a6963088 100644
--- a/python/utils/OpaqueArguments.h
+++ b/python/utils/OpaqueArguments.h
@@ -323,17 +323,31 @@ packArgs(OpaqueArguments &argData, py::args args,
             return;
           })
           .Case([&](ComplexType type) {
-            genericVecAllocator.template operator()<std::complex<double>>(
-                [](py::handle element) -> std::complex<double> {
-                  if (!py::hasattr(element, "real"))
-                    throw std::runtime_error(
-                        "invalid complex element type");
-                  if (!py::hasattr(element, "imag"))
-                    throw std::runtime_error(
-                        "invalid complex element type");
-                  return {PyFloat_AsDouble(element.attr("real").ptr()),
-                          PyFloat_AsDouble(element.attr("imag").ptr())};
-                });
+            if (isa<Float64Type>(type.getElementType())) {
+              genericVecAllocator.template operator()<std::complex<double>>(
+                  [](py::handle element) -> std::complex<double> {
+                    if (!py::hasattr(element, "real"))
+                      throw std::runtime_error(
+                          "invalid complex element type");
+                    if (!py::hasattr(element, "imag"))
+                      throw std::runtime_error(
+                          "invalid complex element type");
+                    return {PyFloat_AsDouble(element.attr("real").ptr()),
+                            PyFloat_AsDouble(element.attr("imag").ptr())};
+                  });
+            } else {
+              genericVecAllocator.template operator()<std::complex<float>>(
+                  [](py::handle element) -> std::complex<float> {
+                    if (!py::hasattr(element, "real"))
+                      throw std::runtime_error(
+                          "invalid complex element type");
+                    if (!py::hasattr(element, "imag"))
+                      throw std::runtime_error(
+                          "invalid complex element type");
+                    return {element.attr("real").cast<float>(),
+                            element.attr("imag").cast<float>()};
+                  });
+            }
             return;
           })
           .Default([](Type ty) {
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 6998b3e4aa..ddfcbe5130 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -23,4 +23,4 @@ install (FILES nvqir/CircuitSimulator.h nvqir/QIRTypes.h nvqir/Gates.h
          DESTINATION include/nvqir)
 
-install (FILES cudaq.h host_config.h DESTINATION include)
+install (FILES cudaq.h DESTINATION include)
diff --git a/runtime/common/MeasureCounts.cpp b/runtime/common/MeasureCounts.cpp
index da456a6c43..d99d965546 100644
--- a/runtime/common/MeasureCounts.cpp
+++ b/runtime/common/MeasureCounts.cpp
@@ -455,7 +455,7 @@ void sample_result::clear() {
   totalShots = 0;
 }
 
-void sample_result::dump(std::ostream &os) {
+void sample_result::dump(std::ostream &os) const {
   os << "{ ";
   if (sampleResults.size() > 1) {
     os << "\n  ";
@@ -492,7 +492,7 @@ void sample_result::dump(std::ostream &os) {
   os << "}\n";
 }
 
-void sample_result::dump() { dump(std::cout); }
+void sample_result::dump() const { dump(std::cout); }
 
 bool sample_result::has_even_parity(std::string_view bitString) {
   int c = std::count(bitString.begin(), bitString.end(), '1');
diff --git a/runtime/common/MeasureCounts.h b/runtime/common/MeasureCounts.h
index 5bafa2541a..ee028e37c7 100644
--- a/runtime/common/MeasureCounts.h
+++ b/runtime/common/MeasureCounts.h
@@ -215,10 +215,10 @@ class sample_result {
   size(const std::string_view registerName = GlobalRegisterName) noexcept;
 
   /// @brief Dump this sample_result to standard out.
-  void dump();
+  void dump() const;
 
   /// @brief Dump this sample_result to the given output stream
-  void dump(std::ostream &os);
+  void dump(std::ostream &os) const;
 
   /// @brief Clear this sample_result.
   void clear();
diff --git a/runtime/common/RuntimeMLIR.cpp b/runtime/common/RuntimeMLIR.cpp
index 0b3f452137..c8f0d27117 100644
--- a/runtime/common/RuntimeMLIR.cpp
+++ b/runtime/common/RuntimeMLIR.cpp
@@ -87,9 +87,9 @@ std::unique_ptr<MLIRContext> initializeMLIR() {
   }
 
   DialectRegistry registry;
-  registry.insert();
+  registry.insert<quake::QuakeDialect, cudaq::cc::CCDialect>();
+  cudaq::opt::registerCodeGenDialect(registry);
+  registerAllDialects(registry);
   auto context = std::make_unique<MLIRContext>(registry);
   context->loadAllAvailableDialects();
   registerLLVMDialectTranslation(*context);
diff --git a/runtime/cudaq.h b/runtime/cudaq.h
index 9e52db4747..fe290c9fd1 100644
--- a/runtime/cudaq.h
+++ b/runtime/cudaq.h
@@ -9,8 +9,8 @@
 #pragma once
 
 #include "common/NoiseModel.h"
+#include "cudaq/host_config.h"
 #include "cudaq/qis/qubit_qis.h"
-#include "host_config.h"
 #include
 #include
diff --git a/runtime/cudaq/algorithms/broadcast.h b/runtime/cudaq/algorithms/broadcast.h
index a3b6bd008e..ad0b480aa9 100644
--- a/runtime/cudaq/algorithms/broadcast.h
+++ b/runtime/cudaq/algorithms/broadcast.h
@@ -8,8 +8,8 @@
 #pragma once
 
+#include "cudaq/host_config.h"
 #include "cudaq/platform.h"
-#include "host_config.h"
 
 namespace cudaq {
diff --git a/runtime/cudaq/algorithms/observe.h b/runtime/cudaq/algorithms/observe.h
index 445a47c1e6..e12afa1772 100644
--- a/runtime/cudaq/algorithms/observe.h
+++ b/runtime/cudaq/algorithms/observe.h
@@ -13,8 +13,8 @@
 #include "common/ObserveResult.h"
 #include "cudaq/algorithms/broadcast.h"
 #include "cudaq/concepts.h"
+#include "cudaq/host_config.h"
 #include "cudaq/spin_op.h"
-#include "host_config.h"
 #include
 #if CUDAQ_USE_STD20
 #include
diff --git a/runtime/cudaq/algorithms/sample.h b/runtime/cudaq/algorithms/sample.h
index b11ef47f79..c1f2e4a6c8 100644
--- a/runtime/cudaq/algorithms/sample.h
+++ b/runtime/cudaq/algorithms/sample.h
@@ -13,7 +13,7 @@
 #include "common/MeasureCounts.h"
 #include "cudaq/algorithms/broadcast.h"
 #include "cudaq/concepts.h"
-#include "host_config.h"
+#include "cudaq/host_config.h"
 
 namespace cudaq {
 bool kernelHasConditionalFeedback(const std::string &);
diff --git a/runtime/cudaq/algorithms/state.h b/runtime/cudaq/algorithms/state.h
index 55cb96a3d6..e2b0a27d4a 100644
--- a/runtime/cudaq/algorithms/state.h
+++ b/runtime/cudaq/algorithms/state.h
@@ -11,9 +11,9 @@
 #include "common/ExecutionContext.h"
 #include "common/KernelWrapper.h"
 #include "cudaq/concepts.h"
+#include "cudaq/host_config.h"
 #include "cudaq/platform.h"
 #include "cudaq/platform/QuantumExecutionQueue.h"
-#include "host_config.h"
 #include
 #include
diff --git a/runtime/cudaq/builder/QuakeValue.h b/runtime/cudaq/builder/QuakeValue.h
index 5527037623..c406a97232 100644
--- a/runtime/cudaq/builder/QuakeValue.h
+++ b/runtime/cudaq/builder/QuakeValue.h
@@ -8,7 +8,7 @@
 #pragma once
 
-#include "host_config.h"
+#include "cudaq/host_config.h"
 #include
 #include
 #include
diff --git a/runtime/cudaq/builder/kernel_builder.cpp b/runtime/cudaq/builder/kernel_builder.cpp
index d241a7742b..ccf4f05d65 100644
--- a/runtime/cudaq/builder/kernel_builder.cpp
+++ b/runtime/cudaq/builder/kernel_builder.cpp
@@ -9,6 +9,7 @@
 #include "kernel_builder.h"
 #include "common/Logger.h"
 #include "common/RuntimeMLIR.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
@@ -19,6 +20,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/ExecutionEngine/ExecutionEngine.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinOps.h" @@ -45,57 +47,75 @@ namespace cudaq::details { /// @brief Track unique measurement register names. static std::size_t regCounter = 0; -KernelBuilderType mapArgToType(double &e) { +KernelBuilderType convertArgumentTypeToMLIR(double &e) { return KernelBuilderType( [](MLIRContext *ctx) { return Float64Type::get(ctx); }); } -KernelBuilderType mapArgToType(float &e) { +KernelBuilderType convertArgumentTypeToMLIR(float &e) { return KernelBuilderType( [](MLIRContext *ctx) { return Float32Type::get(ctx); }); } -KernelBuilderType mapArgToType(int &e) { +KernelBuilderType convertArgumentTypeToMLIR(int &e) { return KernelBuilderType( [](MLIRContext *ctx) { return IntegerType::get(ctx, 32); }); } -KernelBuilderType mapArgToType(std::vector &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e) { return KernelBuilderType([](MLIRContext *ctx) { return cudaq::cc::StdvecType::get(ctx, Float64Type::get(ctx)); }); } -KernelBuilderType mapArgToType(std::size_t &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::size_t &e) { return KernelBuilderType( [](MLIRContext *ctx) { return IntegerType::get(ctx, 64); }); } -KernelBuilderType mapArgToType(std::vector &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e) { return KernelBuilderType([](MLIRContext *ctx) { return cudaq::cc::StdvecType::get(ctx, mlir::IntegerType::get(ctx, 32)); }); } -KernelBuilderType mapArgToType(std::vector &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e) { return KernelBuilderType([](MLIRContext *ctx) { return cudaq::cc::StdvecType::get(ctx, mlir::IntegerType::get(ctx, 64)); }); } /// Map a std::vector to a KernelBuilderType -KernelBuilderType mapArgToType(std::vector &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e) { return KernelBuilderType([](MLIRContext *ctx) { return cudaq::cc::StdvecType::get(ctx, Float32Type::get(ctx)); }); } -KernelBuilderType mapArgToType(cudaq::qubit &e) { +/// Map a std::vector> to a KernelBuilderType +KernelBuilderType +convertArgumentTypeToMLIR(std::vector> &e) { + return KernelBuilderType([](MLIRContext *ctx) { + return cudaq::cc::StdvecType::get(ctx, + ComplexType::get(Float64Type::get(ctx))); + }); +} + +/// Map a std::vector> to a KernelBuilderType +KernelBuilderType +convertArgumentTypeToMLIR(std::vector> &e) { + return KernelBuilderType([](MLIRContext *ctx) { + return cudaq::cc::StdvecType::get(ctx, + ComplexType::get(Float32Type::get(ctx))); + }); +} + +KernelBuilderType convertArgumentTypeToMLIR(cudaq::qubit &e) { return KernelBuilderType( [](MLIRContext *ctx) { return quake::RefType::get(ctx); }); } -KernelBuilderType mapArgToType(cudaq::qvector<> &e) { +KernelBuilderType convertArgumentTypeToMLIR(cudaq::qvector<> &e) { return KernelBuilderType( [](MLIRContext *ctx) { return quake::VeqType::getUnsized(ctx); }); } @@ -144,9 +164,8 @@ initializeBuilder(MLIRContext *context, cudaq::info("kernel_builder has {} arguments", arguments.size()); - // Every Kernel should have a ReturnOp terminator, - // then we'll set the insertion point to right - // before it. + // Every Kernel should have a ReturnOp terminator, then we'll set the + // insertion point to right before it. 
opBuilder->setInsertionPointToStart(entryBlock); auto terminator = opBuilder->create(); opBuilder->setInsertionPoint(terminator); @@ -192,10 +211,9 @@ void exp_pauli(ImplicitLocOpBuilder &builder, const QuakeValue &theta, } /// @brief Search the given `FuncOp` for all `CallOps` recursively. -/// If found, see if the called function is in the current `ModuleOp` -/// for this `kernel_builder`, if so do nothing. If it is not found, -/// then find it in the other `ModuleOp`, clone it, and add it to this -/// `ModuleOp`. +/// If found, see if the called function is in the current `ModuleOp` for this +/// `kernel_builder`; if so, do nothing. If it is not found, then find it in +/// the other `ModuleOp`, clone it, and add it to this `ModuleOp`. void addAllCalledFunctionRecursively( func::FuncOp &function, ModuleOp &currentModule, mlir::OwningOpRef &otherModule) { @@ -244,10 +262,10 @@ void addAllCalledFunctionRecursively( visitAllCallOps(function); } -/// @brief Get a the function with the given name. First look in the -/// current `ModuleOp` for this `kernel_builder`, if found return it as is. If -/// not found, find it in the other `kernel_builder` `ModuleOp` and return a -/// clone of it. Throw an exception if no kernel with the given name is found +/// @brief Get the function with the given name. First look in the current +/// `ModuleOp` for this `kernel_builder`; if found, return it as is. If not +/// found, find it in the other `kernel_builder` `ModuleOp` and return a clone +/// of it. Throw an exception if no kernel with the given name is found. func::FuncOp cloneOrGetFunction(StringRef name, ModuleOp &currentModule, mlir::OwningOpRef &otherModule) { @@ -276,8 +294,8 @@ void call(ImplicitLocOpBuilder &builder, std::string &name, auto function = block->getParentOp(); auto currentModule = function->getParentOfType(); - // We need to clone the function we care about, we need - // any other functions it calls, so store it in a vector + // We need to clone the function we care about, as well as any other + // functions it calls, so store them in a vector std::vector functions; // Get the function with the kernel name we care about.
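// The clone-and-recurse scheme described above boils down to a short walk
// over the call graph. A minimal sketch in terms of the standard MLIR API
// (unlike the real implementation, it omits missing-callee error handling
// and the guard against revisiting functions):
void cloneCalleesInto(mlir::func::FuncOp fn, mlir::ModuleOp dest,
                      mlir::ModuleOp src) {
  fn.walk([&](mlir::func::CallOp call) {
    // Already present in this kernel_builder's module? Nothing to do.
    if (dest.lookupSymbol<mlir::func::FuncOp>(call.getCallee()))
      return;
    // Otherwise clone the callee out of the other module and recurse.
    auto cloned =
        src.lookupSymbol<mlir::func::FuncOp>(call.getCallee()).clone();
    dest.push_back(cloned);
    cloneCalleesInto(cloned, dest, src);
  });
}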
@@ -285,8 +303,8 @@ void call(ImplicitLocOpBuilder &builder, std::string &name, auto otherFuncCloned = cloneOrGetFunction(properName, currentModule, otherModule); - // We need to recursively find all CallOps and - // add their Callee FuncOps to the current Module + // We need to recursively find all CallOps and add their Callee FuncOps to the + // current Module addAllCalledFunctionRecursively(otherFuncCloned, currentModule, otherModule); // Map the QuakeValues to MLIR Values @@ -339,8 +357,8 @@ void applyControlOrAdjoint(ImplicitLocOpBuilder &builder, std::string &name, auto otherFuncCloned = cloneOrGetFunction(properName, currentModule, otherModule); - // We need to recursively find all CallOps and - // add their Callee FuncOps to the current Module + // We need to recursively find all CallOps and add their Callee FuncOps to the + // current Module addAllCalledFunctionRecursively(otherFuncCloned, currentModule, otherModule); SmallVector mlirValues; @@ -460,21 +478,147 @@ QuakeValue qalloc(ImplicitLocOpBuilder &builder, const std::size_t nQubits) { return QuakeValue(builder, qubits); } -QuakeValue qalloc(ImplicitLocOpBuilder &builder, QuakeValue &size) { +QuakeValue qalloc(ImplicitLocOpBuilder &builder, QuakeValue &sizeOrVec) { cudaq::info("kernel_builder allocating qubits from quake value"); - auto value = size.getValue(); + auto value = sizeOrVec.getValue(); auto type = value.getType(); + auto context = builder.getContext(); + + if (auto stdvecTy = dyn_cast(type)) { + // get the size + Value size = builder.create(builder.getI64Type(), value); + Value numQubits = builder.create(size); + auto veqTy = quake::VeqType::getUnsized(context); + // allocate the number of qubits we need + Value qubits = builder.create(veqTy, numQubits); + + auto ptrTy = cc::PointerType::get(stdvecTy.getElementType()); + Value initials = builder.create(ptrTy, value); + builder.create(veqTy, qubits, initials); + return QuakeValue(builder, qubits); + } + if (!type.isIntOrIndex()) throw std::runtime_error( "Invalid parameter passed to qalloc (must be integer type)."); - auto context = builder.getContext(); Value qubits = builder.create( quake::VeqType::getUnsized(context), value); return QuakeValue(builder, qubits); } +template +std::size_t getStateVectorLength(StateVectorStorage &stateVectorStorage, + std::int64_t index) { + if (index >= static_cast(stateVectorStorage.size())) + throw std::runtime_error("index to state initializer is out of range"); + if (!std::get> *>(stateVectorStorage[index])) + throw std::runtime_error("state vector cannot be null"); + auto length = + std::get> *>(stateVectorStorage[index]) + ->size(); + if (!std::has_single_bit(length)) + throw std::runtime_error("state initializer must be a power of 2"); + return std::countr_zero(length); +} + +template +std::complex *getStateVectorData(StateVectorStorage &stateVectorStorage, + std::intptr_t index) { + // This foregoes all the checks found in getStateVectorLength because these + // two functions are called in tandem, this one second. + return std::get> *>(stateVectorStorage[index]) + ->data(); +} + +extern "C" { +/// Runtime callback to get the log2(size) of a captured state vector. 
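// The callbacks defined just below return the qubit count, not the vector
// length: a state over n qubits must hold exactly 2^n amplitudes, so log2 of
// the length recovers n. A standalone C++20 illustration of that check:
#include <bit>       // std::has_single_bit, std::countr_zero
#include <cstddef>
#include <stdexcept>

std::size_t qubitCountForLength(std::size_t length) {
  if (!std::has_single_bit(length))
    throw std::runtime_error("state initializer must be a power of 2");
  return std::countr_zero(length); // e.g. 8 amplitudes -> 3 qubits
}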
+std::size_t +__nvqpp_getStateVectorLength_fp64(StateVectorStorage &stateVectorStorage, + std::int64_t index) { + return getStateVectorLength(stateVectorStorage, index); +} + +std::size_t +__nvqpp_getStateVectorLength_fp32(StateVectorStorage &stateVectorStorage, + std::int64_t index) { + return getStateVectorLength(stateVectorStorage, index); +} + +/// Runtime callback to get the data array of a captured state vector. +std::complex * +__nvqpp_getStateVectorData_fp64(StateVectorStorage &stateVectorStorage, + std::intptr_t index) { + return getStateVectorData(stateVectorStorage, index); +} + +/// Runtime callback to get the data array of a captured state vector. +std::complex * +__nvqpp_getStateVectorData_fp32(StateVectorStorage &stateVectorStorage, + std::intptr_t index) { + return getStateVectorData(stateVectorStorage, index); +} +} + +QuakeValue qalloc(ImplicitLocOpBuilder &builder, + StateVectorStorage &stateVectorStorage, + StateVectorVariant &&state, simulation_precision precision) { + auto *context = builder.getContext(); + auto index = stateVectorStorage.size(); + stateVectorStorage.emplace_back(std::move(state)); + + // Deal with the single/double precision differences here. + const char *getLengthCallBack; + const char *getDataCallBack; + Type componentTy; + { + auto parentModule = + builder.getBlock()->getParentOp()->getParentOfType(); + IRBuilder irb(context); + if (precision == simulation_precision::fp64) { + getLengthCallBack = "__nvqpp_getStateVectorLength_fp64"; + getDataCallBack = "__nvqpp_getStateVectorData_fp64"; + componentTy = irb.getF64Type(); + } else { + getLengthCallBack = "__nvqpp_getStateVectorLength_fp32"; + getDataCallBack = "__nvqpp_getStateVectorData_fp32"; + componentTy = irb.getF32Type(); + } + if (failed(irb.loadIntrinsic(parentModule, getLengthCallBack)) || + failed(irb.loadIntrinsic(parentModule, getDataCallBack))) + throw std::runtime_error("loading callbacks should never fail"); + } + + static_assert(sizeof(std::intptr_t) * 8 == 64); + std::intptr_t vecStor = reinterpret_cast(&stateVectorStorage); + + auto vecPtr = builder.create(vecStor, 64); + auto idxOp = builder.create(index, 64); + + // Use callback to determine the size of the captured vector `state` at + // runtime. + auto i64Ty = builder.getI64Type(); + auto size = builder.create(i64Ty, getLengthCallBack, + ValueRange{vecPtr, idxOp}); + + // Allocate the qubits + Value qubits = builder.create( + quake::VeqType::getUnsized(context), size.getResult(0)); + + // Use callback to retrieve the data pointer of the captured vector `state` at + // runtime. 
+ auto complexTy = ComplexType::get(componentTy); + auto ptrComplexTy = cc::PointerType::get(complexTy); + auto dataPtr = builder.create(ptrComplexTy, getDataCallBack, + ValueRange{vecPtr, idxOp}); + + // Add the init_state op + qubits = builder.create(qubits.getType(), qubits, + dataPtr.getResult(0)); + return QuakeValue(builder, qubits); +} + QuakeValue constantVal(ImplicitLocOpBuilder &builder, double val) { llvm::APFloat d(val); Value constant = @@ -741,7 +885,8 @@ void tagEntryPoint(ImplicitLocOpBuilder &builder, ModuleOp &module, std::tuple jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, std::unordered_map &jitHash, - std::string kernelName, std::vector extraLibPaths) { + std::string kernelName, std::vector extraLibPaths, + StateVectorStorage &stateVectorStorage) { // Start off by getting the current ModuleOp auto *block = builder.getBlock(); @@ -757,10 +902,8 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, auto moduleHash = static_cast(hash); if (jit) { - // Have we added more instructions - // since the last time we jit the code? - // If so, we need to delete this JIT engine - // and create a new one. + // Have we added more instructions since the last time we jit the code? If + // so, we need to delete this JIT engine and create a new one. if (moduleHash == jitHash[jit]) return std::make_tuple(false, jit); else { @@ -799,9 +942,8 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); - // For some reason I get CFG ops from the LowerToCFGPass - // instead of the unrolled cc loop if I don't run - // the above manually. + // For some reason I get CFG ops from the LowerToCFGPass instead of the + // unrolled cc loop if I don't run the above manually. if (failed(pm.run(module))) throw std::runtime_error( "cudaq::builder failed to JIT compile the Quake representation."); @@ -810,10 +952,17 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader(/*genAsQuake=*/true)); pm.addPass(cudaq::opt::createGenerateKernelExecution()); optPM.addPass(cudaq::opt::createLowerToCFGPass()); - optPM.addPass(cudaq::opt::createCombineQuantumAllocations()); + // We want quantum allocations to stay where they are if + // we are simulating and have user-provided state vectors.
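// To recap the runtime contract behind those user-provided state vectors:
// the IR emitted by the qalloc overload above behaves like the following
// host-side sketch (hypothetical pseudocode, not code in the repository):
void sketchOfEmittedCode(cudaq::details::StateVectorStorage &storage) {
  // storage[0] points at the user's captured vector of amplitudes.
  std::size_t numQubits =
      __nvqpp_getStateVectorLength_fp64(storage, /*index=*/0);
  std::complex<double> *amps = __nvqpp_getStateVectorData_fp64(storage, 0);
  // ... allocate numQubits qubits, then hand `amps` to quake.init_state.
}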
+ // This check could probably be smarter, working in tandem with a synthesis + // strategy that rewrites init_state from the circuit synthesis result. + if (stateVectorStorage.empty()) + optPM.addPass(cudaq::opt::createCombineQuantumAllocations()); pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); pm.addPass(cudaq::opt::createConvertToQIRPass()); + pm.addPass(createCanonicalizerPass()); if (failed(pm.run(module))) throw std::runtime_error( @@ -861,8 +1010,8 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, cudaq::info("- JIT Engine created successfully."); - // Kernel names are __nvqpp__mlirgen__BuilderKernelPTRSTR - // for the following we want the proper name, BuilderKernelPTRST + // Kernel names are __nvqpp__mlirgen__BuilderKernelPTRSTR; for the following + // we want the proper name, BuilderKernelPTRSTR (see the sketch below). std::string properName = name(kernelName); // Need to first invoke the init_func() @@ -892,17 +1041,18 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, void invokeCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, std::string kernelName, void **argsArray, - std::vector extraLibPaths) { + std::vector extraLibPaths, + StateVectorStorage &storage) { assert(jit != nullptr && "JIT ExecutionEngine was null."); cudaq::info("kernel_builder invoke kernel with args."); - // Kernel names are __nvqpp__mlirgen__BuilderKernelPTRSTR - // for the following we want the proper name, BuilderKernelPTRST + // Kernel names are __nvqpp__mlirgen__BuilderKernelPTRSTR; for the following + // we want the proper name, BuilderKernelPTRSTR. std::string properName = name(kernelName); - // Incoming Args... have been converted to void **, - // now we convert to void * altLaunchKernel args. + // Incoming Args... have been converted to void **; now we convert them to + // the void * altLaunchKernel args. auto argCreatorName = properName + ".argsCreator"; auto expectedPtr = jit->lookup(argCreatorName); if (!expectedPtr) { @@ -937,9 +1087,9 @@ std::string to_quake(ImplicitLocOpBuilder &builder) { // Strategy - we want to clone this ModuleOp because we have to // add a valid terminator (func.return), but it is not guaranteed that - // the programmer is done building up the kernel even though they've asked to - // look at the quake code. So we'll clone here, and add the return op (we have - // to or the print out string will be invalid (verifier failed)). + // the programmer is done building up the kernel even though they've asked + // to look at the quake code. So we'll clone here, and add the return op + // (we have to, or the printed string will be invalid: the verifier fails).
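// The name() helper referenced in the comments above strips that prefix. A
// standalone sketch of the string manipulation (the prefix constant comes
// from the comments above; the helper body here is illustrative only):
#include <string>

std::string properKernelName(const std::string &mangled) {
  const std::string prefix = "__nvqpp__mlirgen__";
  if (mangled.rfind(prefix, 0) == 0) // starts_with, pre-C++20 spelling
    return mangled.substr(prefix.size());
  return mangled;
}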
auto clonedModule = module.clone(); func::FuncOp unwrappedParentFunc = llvm::cast(parentFunc); @@ -960,4 +1110,9 @@ std::string to_quake(ImplicitLocOpBuilder &builder) { return printOut; } +std::ostream &operator<<(std::ostream &stream, + const kernel_builder_base &builder) { + return stream << builder.to_quake(); +} + } // namespace cudaq::details diff --git a/runtime/cudaq/builder/kernel_builder.h b/runtime/cudaq/builder/kernel_builder.h index f42e96d396..170f8571b0 100644 --- a/runtime/cudaq/builder/kernel_builder.h +++ b/runtime/cudaq/builder/kernel_builder.h @@ -9,10 +9,10 @@ #pragma once #include "cudaq/builder/QuakeValue.h" +#include "cudaq/host_config.h" #include "cudaq/qis/modifiers.h" #include "cudaq/qis/qvector.h" #include "cudaq/utils/cudaq_utils.h" -#include "host_config.h" #include #include #include @@ -62,13 +62,16 @@ concept KernelBuilderArgTypeIsValid = std::disjunction_v...>; // If you want to add to the list of valid kernel argument types first add it -// here, then add `details::mapArgToType()` function +// here, then add `details::convertArgumentTypeToMLIR()` function #define CUDAQ_VALID_BUILDER_ARGS_FOLD() \ - requires(KernelBuilderArgTypeIsValid< \ - Args, float, double, std::size_t, int, std::vector, \ - std::vector, std::vector, \ - std::vector, cudaq::qubit, cudaq::qvector<>> && \ - ...) + requires( \ + KernelBuilderArgTypeIsValid< \ + Args, float, double, std::size_t, int, std::vector, \ + std::vector, std::vector, std::vector, \ + std::vector>, std::vector>, \ + std::vector, cudaq::qubit, \ + cudaq::qvector<>> && \ + ...) #else // Not C++ 2020: stub these out. #define QuakeValueOrNumericType typename @@ -76,6 +79,15 @@ concept KernelBuilderArgTypeIsValid = #endif namespace details { +/// Use parametric type: `initializations` must be vectors of complex float or +/// double. No other type is allowed. +using StateVectorVariant = std::variant> *, + std::vector> *>; + +/// Type describing user-provided state vector data. This is a list of the state +/// vector variables used in a kernel with at least one `qvector` with initial +/// state. 
+using StateVectorStorage = std::vector; // Define a `mlir::Type` generator in the `cudaq` namespace, this helps us keep // MLIR out of this public header @@ -98,34 +110,42 @@ class KernelBuilderType { }; /// Map a `double` to a `KernelBuilderType` -KernelBuilderType mapArgToType(double &e); +KernelBuilderType convertArgumentTypeToMLIR(double &e); /// Map a `float` to a `KernelBuilderType` -KernelBuilderType mapArgToType(float &e); +KernelBuilderType convertArgumentTypeToMLIR(float &e); /// Map a `int` to a `KernelBuilderType` -KernelBuilderType mapArgToType(int &e); +KernelBuilderType convertArgumentTypeToMLIR(int &e); /// Map a `size_t` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::size_t &e); +KernelBuilderType convertArgumentTypeToMLIR(std::size_t &e); /// Map a `std::vector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::vector &e); +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e); /// Map a `std::vector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::vector &e); +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e); /// Map a `std::vector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::vector &e); +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e); /// Map a `vector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::vector &e); +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e); + +/// Map a `vector>` to a `KernelBuilderType` +KernelBuilderType +convertArgumentTypeToMLIR(std::vector> &e); + +/// Map a `vector>` to a `KernelBuilderType` +KernelBuilderType +convertArgumentTypeToMLIR(std::vector> &e); /// Map a `qubit` to a `KernelBuilderType` -KernelBuilderType mapArgToType(cudaq::qubit &e); +KernelBuilderType convertArgumentTypeToMLIR(cudaq::qubit &e); /// @brief Map a `qvector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(cudaq::qvector<> &e); +KernelBuilderType convertArgumentTypeToMLIR(cudaq::qvector<> &e); /// @brief Initialize the `MLIRContext`, return the raw pointer which we'll wrap /// in an `unique_ptr`. @@ -159,6 +179,11 @@ QuakeValue qalloc(mlir::ImplicitLocOpBuilder &builder, /// @brief Allocate a `qvector` from existing `QuakeValue` size QuakeValue qalloc(mlir::ImplicitLocOpBuilder &builder, QuakeValue &size); +/// @brief Allocate a `qvector` from a user provided state vector. +QuakeValue qalloc(mlir::ImplicitLocOpBuilder &builder, + StateVectorStorage &stateVectorData, + StateVectorVariant &&state, simulation_precision precision); + /// @brief Create a QuakeValue representing a constant floating-point number QuakeValue constantVal(mlir::ImplicitLocOpBuilder &builder, double val); @@ -221,12 +246,13 @@ void applyPasses(mlir::PassManager &); std::tuple jitCode(mlir::ImplicitLocOpBuilder &, mlir::ExecutionEngine *, std::unordered_map &, std::string, - std::vector); + std::vector, StateVectorStorage &); /// @brief Invoke the function with the given kernel name. void invokeCode(mlir::ImplicitLocOpBuilder &builder, mlir::ExecutionEngine *jit, std::string kernelName, void **argsArray, - std::vector extraLibPaths); + std::vector extraLibPaths, + StateVectorStorage &storage); /// @brief Invoke the provided kernel function. void call(mlir::ImplicitLocOpBuilder &builder, std::string &name, @@ -326,10 +352,7 @@ class kernel_builder_base { /// @brief Write the kernel_builder to the given output stream. This outputs /// the Quake representation. 
friend std::ostream &operator<<(std::ostream &stream, - const kernel_builder_base &builder) { - stream << builder.to_quake(); - return stream; - } + const kernel_builder_base &builder); }; } // namespace details @@ -387,9 +410,12 @@ class kernel_builder : public details::kernel_builder_base { return std::get(term); } + /// @brief Storage for any user-provided state-vector data. + details::StateVectorStorage stateVectorStorage; + public: - /// @brief The constructor, takes the input `KernelBuilderType`s which is used - /// to create the MLIR function type + /// @brief The constructor, takes the input `KernelBuilderType`s which is + /// used to create the MLIR function type kernel_builder(std::vector &types) : context(details::initializeContext(), details::deleteContext), opBuilder(nullptr, [](mlir::ImplicitLocOpBuilder *) {}), @@ -431,6 +457,28 @@ class kernel_builder : public details::kernel_builder_base { return details::qalloc(*opBuilder.get(), size); } + /// Return a `QuakeValue` representing the allocated quantum register, + /// initialized to the given state vector, \p state. + /// + /// Note: input argument is a \e true reference here, the calling context has + /// to own the data. Specifically, the builder object will capture variables + /// by reference (implemented as a container of pointers for simplicity) but + /// the builder does not create, own, or copy these vectors. This implies that + /// if the captured vector goes out of scope before the kernel is invoked, the + /// reference may contain garbage. This behavior is identical to a C++ lambda + /// capture by reference. + QuakeValue qalloc(std::vector> &state) { + return details::qalloc(*opBuilder.get(), stateVectorStorage, + details::StateVectorVariant{&state}, + simulation_precision::fp64); + } + // Overload for complex vector. + QuakeValue qalloc(std::vector> &state) { + return details::qalloc(*opBuilder.get(), stateVectorStorage, + details::StateVectorVariant{&state}, + simulation_precision::fp32); + } + /// @brief Return a `QuakeValue` representing the constant floating-point /// value. QuakeValue constantVal(double val) { @@ -794,7 +842,7 @@ class kernel_builder : public details::kernel_builder_base { void jitCode(std::vector extraLibPaths = {}) override { auto [wasChanged, ptr] = details::jitCode(*opBuilder, jitEngine.get(), jitEngineToModuleHash, - kernelName, extraLibPaths); + kernelName, extraLibPaths, stateVectorStorage); // If we had a jitEngine, but the code changed, delete the one we had. 
if (jitEngine && wasChanged) details::deleteJitEngine(jitEngine.release()); @@ -818,7 +866,7 @@ class kernel_builder : public details::kernel_builder_base { jitCode(extraLibPaths); } details::invokeCode(*opBuilder, jitEngine.get(), kernelName, argsArray, - extraLibPaths); + extraLibPaths, stateVectorStorage); } /// @brief The call operator for the kernel_builder, takes as input the @@ -882,7 +930,7 @@ CUDAQ_VALID_BUILDER_ARGS_FOLD() auto make_kernel() { std::vector types; cudaq::tuple_for_each(std::tuple(), [&](auto &&el) { - types.push_back(details::mapArgToType(el)); + types.push_back(details::convertArgumentTypeToMLIR(el)); }); return kernel_builder(types); } diff --git a/runtime/cudaq/concepts.h b/runtime/cudaq/concepts.h index 75f7420086..52d9379734 100644 --- a/runtime/cudaq/concepts.h +++ b/runtime/cudaq/concepts.h @@ -8,7 +8,7 @@ #pragma once -#include "host_config.h" +#include "cudaq/host_config.h" #include #if CUDAQ_USE_STD20 diff --git a/runtime/cudaq/host_config.h b/runtime/cudaq/host_config.h new file mode 100644 index 0000000000..ccba759ebe --- /dev/null +++ b/runtime/cudaq/host_config.h @@ -0,0 +1,36 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include + +#define CUDAQ_USE_STD20 (__cplusplus >= 202002L) +#define CUDAQ_APPLE_CLANG (defined(__apple_build_version__)) + +namespace cudaq { + +/// @brief Define an enumeration of possible simulation +/// floating point precision types. +enum class simulation_precision { fp32, fp64 }; + +#if defined(CUDAQ_SIMULATION_SCALAR_FP64) && \ + defined(CUDAQ_SIMULATION_SCALAR_FP32) +#error "Simulation precision cannot be both double and float" +#elif defined(CUDAQ_SIMULATION_SCALAR_FP32) +using simulation_scalar = std::complex; +#elif defined(CUDAQ_SIMULATION_SCALAR_FP64) +using simulation_scalar = std::complex; +#else +// If neither precision is specified, assume double. +// Do NOT add the warning though as this fires hundreds of times during a build. 
+// #pragma message("Simulation precision is unspecified, assuming double") +using simulation_scalar = std::complex; +#endif + +} // namespace cudaq diff --git a/runtime/cudaq/platform/default/nvidia-fp64.config b/runtime/cudaq/platform/default/nvidia-fp64.config index 30bc83ebf6..b7fb8f4d49 100644 --- a/runtime/cudaq/platform/default/nvidia-fp64.config +++ b/runtime/cudaq/platform/default/nvidia-fp64.config @@ -8,6 +8,7 @@ msg="" +PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64" gpu_found=$(query_gpu) if ${gpu_found} && [ -f "${install_dir}/lib/libnvqir-custatevec-fp64.so" ]; then NVQIR_SIMULATION_BACKEND="custatevec-fp64" diff --git a/runtime/cudaq/platform/default/nvidia.config b/runtime/cudaq/platform/default/nvidia.config index c410061a46..8055293ba0 100644 --- a/runtime/cudaq/platform/default/nvidia.config +++ b/runtime/cudaq/platform/default/nvidia.config @@ -8,6 +8,7 @@ msg="" +PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP32" gpu_found=$(query_gpu) if ${gpu_found} && [ -f "${install_dir}/lib/libnvqir-custatevec-fp32.so" ]; then NVQIR_SIMULATION_BACKEND="custatevec-fp32" @@ -24,4 +25,4 @@ fi GPU_REQUIREMENTS="true" # This could be a string listing compatibility in the future -TARGET_DESCRIPTION="The NVIDIA Target provides a simulated QPU via single-GPU cuStateVec integration on FP32 types." \ No newline at end of file +TARGET_DESCRIPTION="The NVIDIA Target provides a simulated QPU via single-GPU cuStateVec integration on FP32 types." diff --git a/runtime/cudaq/qis/execution_manager.h b/runtime/cudaq/qis/execution_manager.h index 841ec1cdcf..1b5f2a9c87 100644 --- a/runtime/cudaq/qis/execution_manager.h +++ b/runtime/cudaq/qis/execution_manager.h @@ -9,6 +9,7 @@ #pragma once #include "common/QuditIdTracker.h" +#include "cudaq/host_config.h" #include "cudaq/spin_op.h" #include #include @@ -17,6 +18,7 @@ namespace cudaq { class ExecutionContext; using SpinMeasureResult = std::pair; +using complex = std::complex; /// A QuditInfo is a type encoding the number of \a levels and the \a id of the /// qudit to the ExecutionManager. @@ -111,6 +113,12 @@ class ExecutionManager { /// Reset the execution context virtual void resetExecutionContext() = 0; + /// @brief Initialize the state of the given qudits to the provided + /// state vector. + virtual void initializeState(const std::vector &targets, + const void *state, + simulation_precision precision) = 0; + /// Apply the quantum instruction with the given name, on the provided target /// qudits. Supports input of control qudits and rotational parameters. 
Can /// also optionally take a spin_op as input to affect a general Pauli diff --git a/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp b/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp index 1e6bfd9e51..bdc8980603 100644 --- a/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp +++ b/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp @@ -60,6 +60,15 @@ class DefaultExecutionManager : public cudaq::BasicExecutionManager { simulator()->allocateQubits(qudits.size()); } + void initializeState(const std::vector &targets, + const void *state, + cudaq::simulation_precision precision) override { + // Here we have qubits in requestedAllocations + // want to allocate and set state + simulator()->allocateQubits(requestedAllocations.size(), state, precision); + requestedAllocations.clear(); + } + void deallocateQudit(const cudaq::QuditInfo &q) override { // Before trying to deallocate, make sure the qudit hasn't diff --git a/runtime/cudaq/qis/managers/photonics/PhotonicsExecutionManager.cpp b/runtime/cudaq/qis/managers/photonics/PhotonicsExecutionManager.cpp index b537004df4..f623ab0c3a 100644 --- a/runtime/cudaq/qis/managers/photonics/PhotonicsExecutionManager.cpp +++ b/runtime/cudaq/qis/managers/photonics/PhotonicsExecutionManager.cpp @@ -55,6 +55,12 @@ class PhotonicsExecutionManager : public cudaq::BasicExecutionManager { allocateQudit(q); } + void initializeState(const std::vector &targets, + const void *state, + simulation_precision precision) override { + throw std::runtime_error("initializeState not implemented."); + } + /// @brief Qudit deallocation method void deallocateQudit(const cudaq::QuditInfo &q) override {} diff --git a/runtime/cudaq/qis/qarray.h b/runtime/cudaq/qis/qarray.h index babd7df6da..99d75ac0d8 100644 --- a/runtime/cudaq/qis/qarray.h +++ b/runtime/cudaq/qis/qarray.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qview.h" -#include "host_config.h" namespace cudaq { diff --git a/runtime/cudaq/qis/qreg.h b/runtime/cudaq/qis/qreg.h index f58ea46c93..5899a6c314 100644 --- a/runtime/cudaq/qis/qreg.h +++ b/runtime/cudaq/qis/qreg.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qspan.h" -#include "host_config.h" #if defined(__clang__) #pragma clang diagnostic push diff --git a/runtime/cudaq/qis/qspan.h b/runtime/cudaq/qis/qspan.h index 10e7175f8b..573cea66d8 100644 --- a/runtime/cudaq/qis/qspan.h +++ b/runtime/cudaq/qis/qspan.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qudit.h" -#include "host_config.h" #if CUDAQ_USE_STD20 #include #include diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h index f79c4935c9..d6b9fe8610 100644 --- a/runtime/cudaq/qis/qubit_qis.h +++ b/runtime/cudaq/qis/qubit_qis.h @@ -9,13 +9,13 @@ #pragma once #include "common/MeasureCounts.h" +#include "cudaq/host_config.h" #include "cudaq/qis/modifiers.h" #include "cudaq/qis/pauli_word.h" #include "cudaq/qis/qarray.h" #include "cudaq/qis/qreg.h" #include "cudaq/qis/qvector.h" #include "cudaq/spin_op.h" -#include "host_config.h" #include #include diff --git a/runtime/cudaq/qis/qudit.h b/runtime/cudaq/qis/qudit.h index d750aa2a4d..527e55f570 100644 --- a/runtime/cudaq/qis/qudit.h +++ b/runtime/cudaq/qis/qudit.h @@ -10,7 +10,15 @@ #include "execution_manager.h" +using namespace std::complex_literals; + namespace cudaq { +using complex = std::complex; + +namespace ket { +inline static const std::vector zero{1. + 0i, 0. 
+ 0i}; +inline static const std::vector one{0. + 0i, 1. + 0i}; +} // namespace ket /// The qudit models a general d-level quantum system. /// This type is templated on the number of levels d. @@ -28,6 +36,30 @@ class qudit { public: /// Construct a qudit, will allocated a new unique index qudit() : idx(getExecutionManager()->getAvailableIndex(n_levels())) {} + qudit(const std::vector &state) : qudit() { + if (state.size() != Levels) + throw std::runtime_error( + "Invalid number of state vector elements for qudit allocation (" + + std::to_string(state.size()) + ")."); + + auto norm = + std::inner_product( + state.begin(), state.end(), state.begin(), + simulation_scalar{0., 0.}, [](auto a, auto b) { return a + b; }, + [](auto a, auto b) { return std::conj(a) * b; }) + .real(); + if (std::fabs(1.0 - norm) > 1e-4) + throw std::runtime_error("Invalid vector norm for qudit allocation."); + + // Perform the initialization + auto precision = std::is_same_v + ? simulation_precision::fp32 + : simulation_precision::fp64; + getExecutionManager()->initializeState({QuditInfo(n_levels(), idx)}, + state.data(), precision); + } + qudit(const std::initializer_list &list) + : qudit({list.begin(), list.end()}) {} // Qudits cannot be copied qudit(const qudit &q) = delete; diff --git a/runtime/cudaq/qis/qvector.h b/runtime/cudaq/qis/qvector.h index 4c3ac0f4c5..1e3638b098 100644 --- a/runtime/cudaq/qis/qvector.h +++ b/runtime/cudaq/qis/qvector.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qview.h" -#include "host_config.h" namespace cudaq { @@ -30,6 +30,43 @@ class qvector { /// @brief Construct a `qvector` with `size` qudits in the |0> state. qvector(std::size_t size) : qudits(size) {} + qvector(const std::vector &vector) + : qudits(std::log2(vector.size())) { + if (Levels == 2) { + auto numElements = std::log2(vector.size()); + if (std::floor(numElements) != numElements) + throw std::runtime_error( + "Invalid state vector passed to qvector initialization - number of " + "elements must be power of 2."); + } + + auto norm = + std::inner_product( + vector.begin(), vector.end(), vector.begin(), + simulation_scalar{0., 0.}, [](auto a, auto b) { return a + b; }, + [](auto a, auto b) { return std::conj(a) * b; }) + .real(); + if (std::fabs(1.0 - norm) > 1e-4) + throw std::runtime_error("Invalid vector norm for qudit allocation."); + + std::vector targets; + for (auto &q : qudits) + targets.emplace_back(QuditInfo{Levels, q.id()}); + + auto precision = std::is_same_v + ? simulation_precision::fp32 + : simulation_precision::fp64; + getExecutionManager()->initializeState(targets, vector.data(), precision); + } + + // FIXME do we need float versions? 
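// Usage sketch for the state-initialized constructors above and below,
// mirroring the from_state.cpp test added later in this change; it assumes
// an FP64 simulator, so simulation_scalar is std::complex<double>:
__qpu__ void prepare() {
  // Single qubit initialized to |1> via the new ket constants.
  cudaq::qudit<2> q(cudaq::ket::one);
  // Two qubits initialized to (|00> + |11>)/sqrt(2).
  cudaq::qvector<2> v({cudaq::complex{M_SQRT1_2, 0.}, {0., 0.}, {0., 0.},
                       {M_SQRT1_2, 0.}});
}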
+ qvector(const std::vector &vector) + : qvector(std::vector{vector.begin(), vector.end()}) {} + qvector(const std::initializer_list &list) + : qvector(std::vector{list.begin(), list.end()}) {} + qvector(const std::initializer_list &list) + : qvector(std::vector{list.begin(), list.end()}) {} + /// @cond /// Nullary constructor /// meant to be used with `kernel_builder>` diff --git a/runtime/cudaq/qis/qview.h b/runtime/cudaq/qis/qview.h index e7b519881a..1b690a6118 100644 --- a/runtime/cudaq/qis/qview.h +++ b/runtime/cudaq/qis/qview.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qudit.h" -#include "host_config.h" #if CUDAQ_USE_STD20 #include #include diff --git a/runtime/nvqir/CircuitSimulator.h b/runtime/nvqir/CircuitSimulator.h index 13d33384d6..10e2babb0d 100644 --- a/runtime/nvqir/CircuitSimulator.h +++ b/runtime/nvqir/CircuitSimulator.h @@ -14,7 +14,7 @@ #include "common/MeasureCounts.h" #include "common/NoiseModel.h" #include "common/Timing.h" - +#include "cudaq/host_config.h" #include #include #include @@ -191,7 +191,10 @@ class CircuitSimulator { virtual std::size_t allocateQubit() = 0; /// @brief Allocate `count` qubits. - virtual std::vector allocateQubits(const std::size_t count) = 0; + virtual std::vector + allocateQubits(std::size_t count, const void *state = nullptr, + cudaq::simulation_precision precision = + cudaq::simulation_precision::fp32) = 0; /// @brief Deallocate the qubit with give unique index virtual void deallocate(const std::size_t qubitIdx) = 0; @@ -594,7 +597,11 @@ class CircuitSimulatorBase : public CircuitSimulator { } /// @brief Add the given number of qubits to the state. - virtual void addQubitsToState(std::size_t count) { + virtual void addQubitsToState(std::size_t count, + const void *state = nullptr) { + if (state != nullptr) + throw std::runtime_error("State initialization must be handled by " + "subclasses, override addQubitsToState."); for (std::size_t i = 0; i < count; i++) addQubitToState(); } @@ -806,8 +813,26 @@ class CircuitSimulatorBase : public CircuitSimulator { } /// @brief Allocate `count` qubits. - std::vector allocateQubits(std::size_t count) override { - ScopedTraceWithContext("allocateQubits", count); + std::vector + allocateQubits(std::size_t count, const void *state = nullptr, + cudaq::simulation_precision precision = + cudaq::simulation_precision::fp32) override { + // Make sure if someone gives us state data, that the precision + // is correct for this simulation. + if (state != nullptr) { + if constexpr (std::is_same_v) { + if (precision == cudaq::simulation_precision::fp64) + throw std::runtime_error( + "Invalid user-provided state data. Simulator " + "is FP32 but state data is FP64."); + } else { + if (precision == cudaq::simulation_precision::fp32) + throw std::runtime_error( + "Invalid user-provided state data. 
Simulator " + "is FP64 but state data is FP32."); + } + } + std::vector qubits; for (std::size_t i = 0; i < count; i++) qubits.emplace_back(tracker.getNextIndex()); @@ -832,7 +857,7 @@ class CircuitSimulatorBase : public CircuitSimulator { stateDimension = calculateStateDim(nQubitsAllocated); // Tell the subtype to allocate more qubits - addQubitsToState(count); + addQubitsToState(count, state); // May be that the state grows enough that we // want to handle observation via sampling diff --git a/runtime/nvqir/NVQIR.cpp b/runtime/nvqir/NVQIR.cpp index 152c35fcdd..6ccadee59a 100644 --- a/runtime/nvqir/NVQIR.cpp +++ b/runtime/nvqir/NVQIR.cpp @@ -184,6 +184,25 @@ Array *__quantum__rt__qubit_allocate_array(uint64_t size) { return vectorSizetToArray(qubitIdxs); } +Array *__quantum__rt__qubit_allocate_array_with_state_fp64( + uint64_t size, std::complex *data) { + ScopedTraceWithContext("NVQIR::qubit_allocate_array_with_data_fp64", size); + __quantum__rt__initialize(0, nullptr); + auto qubitIdxs = nvqir::getCircuitSimulatorInternal()->allocateQubits( + size, data, cudaq::simulation_precision::fp64); + return vectorSizetToArray(qubitIdxs); +} + +Array * +__quantum__rt__qubit_allocate_array_with_state_fp32(uint64_t size, + std::complex *data) { + ScopedTraceWithContext("NVQIR::qubit_allocate_array_with_data_fp32", size); + __quantum__rt__initialize(0, nullptr); + auto qubitIdxs = nvqir::getCircuitSimulatorInternal()->allocateQubits( + size, data, cudaq::simulation_precision::fp32); + return vectorSizetToArray(qubitIdxs); +} + /// @brief Once done, release the QIR qubit array void __quantum__rt__qubit_release_array(Array *arr) { ScopedTraceWithContext("NVQIR::qubit_release_array", arr->size()); diff --git a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu index 040aa1dff1..143c154f4b 100644 --- a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu +++ b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu @@ -78,6 +78,73 @@ __global__ void setFirstNElements(T *sv, const T *__restrict__ sv2, int64_t N) { } } +// kronprod functions adapted from +// https://github.com/DmitryLyakh/TAL_SH/blob/3cefc2133a68b67c515f4b68a0ed9e3c66e4b4b2/tensor_algebra_gpu_nvidia.cu#L745 + +#define THRDS_ARRAY_PRODUCT 256 + +#pragma push +#pragma nv_diag_suppress 177 +__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, + cuDoubleComplex b) { + return cuCmul(a, b); +} +__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, + cuDoubleComplex b) { + return cuCadd(a, b); +} +__device__ __host__ cuFloatComplex operator*(cuFloatComplex a, + cuFloatComplex b) { + return cuCmulf(a, b); +} +__device__ __host__ cuFloatComplex operator+(cuFloatComplex a, + cuFloatComplex b) { + return cuCaddf(a, b); +} + +template +__global__ void kronprod(size_t tsize1, const T *arr1, size_t tsize2, + const T *arr2, T *arr0) { + __shared__ T lbuf[THRDS_ARRAY_PRODUCT + 1], rbuf[THRDS_ARRAY_PRODUCT]; + size_t _ib, _in, _jb, _jn, _tx, _jc, _ja; + + _tx = (size_t)threadIdx.x; + for (_jb = blockIdx.y * THRDS_ARRAY_PRODUCT; _jb < tsize2; + _jb += gridDim.y * THRDS_ARRAY_PRODUCT) { + if (_jb + THRDS_ARRAY_PRODUCT > tsize2) { + _jn = tsize2 - _jb; + } else { + _jn = THRDS_ARRAY_PRODUCT; + } + + if (_tx < _jn) + rbuf[_tx] = arr2[_jb + _tx]; + + for (_ib = blockIdx.x * THRDS_ARRAY_PRODUCT; _ib < tsize1; + _ib += gridDim.x * THRDS_ARRAY_PRODUCT) { + if (_ib + THRDS_ARRAY_PRODUCT > tsize1) { + _in = tsize1 - _ib; + } else { + _in = THRDS_ARRAY_PRODUCT; + } + + if (_tx < 
_in) + lbuf[_tx] = arr1[_ib + _tx]; + + __syncthreads(); + for (_jc = 0; _jc < _jn; _jc++) { + if (_tx < _in) { + _ja = (_jb + _jc) * tsize1 + (_ib + _tx); + arr0[_ja] = arr0[_ja] + lbuf[_tx] * rbuf[_jc]; + } + } + __syncthreads(); + } + } + return; +} +#pragma pop + /// @brief The CuStateVecCircuitSimulator implements the CircuitSimulator /// base class to provide a simulator that delegates to the NVIDIA CuStateVec /// GPU-accelerated library. @@ -196,40 +263,101 @@ protected: controls32.size())); } + /// @brief Nice utility function to have to print the state vector contents on + /// GPU. + void printStateFromGPU(const std::string &name, void *ptr, std::size_t size) { + std::vector> tmp(size); + cudaMemcpy(tmp.data(), ptr, size * sizeof(std::complex), + cudaMemcpyDeviceToHost); + for (auto &r : tmp) + printf("%s: (%.12lf, %.12lf)\n", name.c_str(), r.real(), r.imag()); + printf("\n"); + } + /// @brief Increase the state size by the given number of qubits. - void addQubitsToState(std::size_t count) override { - ScopedTraceWithContext("CuStateVecCircuitSimulator::addQubitsToState", count); + void addQubitsToState(std::size_t count, const void *stateIn) override { + ScopedTraceWithContext("CuStateVecCircuitSimulator::addQubitsToState", + count); if (count == 0) return; + // Cast the state, at this point an error would + // have been thrown if it is not of the right floating point type + std::complex *state = + reinterpret_cast *>( + const_cast(stateIn)); + int dev; HANDLE_CUDA_ERROR(cudaGetDevice(&dev)); cudaq::info("GPU {} Allocating new qubit array of size {}.", dev, count); + constexpr int32_t threads_per_block = 256; + uint32_t n_blocks = + (stateDimension + threads_per_block - 1) / threads_per_block; + + // Check if this is the first time to allocate, if so + // the allocation is much easier if (!deviceStateVector) { + // Create the memory and the handle HANDLE_CUDA_ERROR(cudaMalloc((void **)&deviceStateVector, stateDimension * sizeof(CudaDataType))); - constexpr int32_t threads_per_block = 256; - uint32_t n_blocks = - (stateDimension + threads_per_block - 1) / threads_per_block; - initializeDeviceStateVector<<>>( - reinterpret_cast(deviceStateVector), stateDimension); HANDLE_ERROR(custatevecCreate(&handle)); + + // If no state provided, initialize to the zero state + if (state == nullptr) { + initializeDeviceStateVector<<>>( + reinterpret_cast(deviceStateVector), + stateDimension); + return; + } + + // User state provided... + + // FIXME handle case where pointer is a device pointer + + // First allocation, so just set the user provided data here + HANDLE_CUDA_ERROR(cudaMemcpy(deviceStateVector, state, + stateDimension * sizeof(CudaDataType), + cudaMemcpyHostToDevice)); + return; + } + + // State already exists, need to allocate new state and compute + // kronecker product with existing state + + // Allocate new vector to place the kron prod result + void *newDeviceStateVector; + HANDLE_CUDA_ERROR(cudaMalloc((void **)&newDeviceStateVector, + stateDimension * sizeof(CudaDataType))); + + // Place the state data on device. Could be that + // we just need the zero state, or the user could have provided one + void *otherState; + HANDLE_CUDA_ERROR(cudaMalloc((void **)&otherState, + (1UL << count) * sizeof(CudaDataType))); + if (state == nullptr) { + initializeDeviceStateVector<<>>( + reinterpret_cast(otherState), (1UL << count)); } else { - // Allocate new state.. 
- void *newDeviceStateVector; - HANDLE_CUDA_ERROR(cudaMalloc((void **)&newDeviceStateVector, - stateDimension * sizeof(CudaDataType))); - constexpr int32_t threads_per_block = 256; - uint32_t n_blocks = - (stateDimension + threads_per_block - 1) / threads_per_block; - setFirstNElements<<>>( - reinterpret_cast(newDeviceStateVector), - reinterpret_cast(deviceStateVector), - previousStateDimension); - HANDLE_CUDA_ERROR(cudaFree(deviceStateVector)); - deviceStateVector = newDeviceStateVector; + + // FIXME Handle case where data is already on GPU + HANDLE_CUDA_ERROR(cudaMemcpy(otherState, state, + (1UL << count) * sizeof(CudaDataType), + cudaMemcpyHostToDevice)); } + + // Compute the kronecker product + kronprod<<>>( + previousStateDimension, + reinterpret_cast(deviceStateVector), (1UL << count), + reinterpret_cast(otherState), + reinterpret_cast(newDeviceStateVector)); + HANDLE_CUDA_ERROR(cudaGetLastError()); + + // Free the old vectors we don't need anymore. + HANDLE_CUDA_ERROR(cudaFree(deviceStateVector)); + HANDLE_CUDA_ERROR(cudaFree(otherState)); + deviceStateVector = newDeviceStateVector; } /// @brief Increase the state size by one qubit. @@ -343,9 +471,7 @@ public: } /// @brief Device synchronization - void synchronize() override { - HANDLE_CUDA_ERROR(cudaDeviceSynchronize()); - } + void synchronize() override { HANDLE_CUDA_ERROR(cudaDeviceSynchronize()); } /// @brief Measure operation /// @param qubitIdx @@ -638,9 +764,10 @@ public: return cudaq::State{{stateDimension}, {}}; std::vector> tmp(stateDimension); - HANDLE_CUDA_ERROR(cudaMemcpy(tmp.data(), deviceStateVector, - stateDimension * sizeof(std::complex), - cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpy(tmp.data(), deviceStateVector, + stateDimension * sizeof(std::complex), + cudaMemcpyDeviceToHost)); if constexpr (std::is_same_v) { std::vector> data; diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt index 23f10564af..b279895cd0 100644 --- a/runtime/nvqir/cutensornet/CMakeLists.txt +++ b/runtime/nvqir/cutensornet/CMakeLists.txt @@ -1,3 +1,11 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + # Find CUDA Toolkit for CUDA libs, e.g., cudart. 
find_package(CUDAToolkit REQUIRED) @@ -69,7 +77,7 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3") target_include_directories(nvqir-${LIBRARY_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/runtime/common ${CMAKE_SOURCE_DIR}/runtime/nvqir ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR}) target_link_libraries(nvqir-${LIBRARY_NAME} PRIVATE fmt::fmt-header-only cudaq cudaq-common ${CUTENSORNET_LIB} ${CUTENSOR_LIB} CUDA::cudart) install(TARGETS nvqir-${LIBRARY_NAME} DESTINATION lib) - file (WRITE ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config "NVQIR_SIMULATION_BACKEND=${LIBRARY_NAME}\nGPU_REQUIREMENTS=\"true\"\n") + file (WRITE ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config "NVQIR_SIMULATION_BACKEND=${LIBRARY_NAME}\nGPU_REQUIREMENTS=\"true\"\nPREPROCESSOR_DEFINES=\"\${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64\"\n") install(FILES ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config DESTINATION targets) endmacro() diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp index edd31034c2..934b94a991 100644 --- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp +++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp @@ -5,6 +5,7 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ + #include "simulator_cutensornet.h" #include "cudaq.h" #include "cutensornet.h" @@ -228,6 +229,7 @@ static nvqir::CutensornetExecutor *getPluginInstance() { cudaq::info("Successfully loaded the cutensornet plugin."); return fcn(); } + /// @brief Evaluate the expectation value of a given observable cudaq::observe_result SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) { @@ -271,7 +273,8 @@ cudaq::State SimulatorTensorNetBase::getStateData() { } nvqir::CircuitSimulator *SimulatorTensorNetBase::clone() { return nullptr; } -void SimulatorTensorNetBase::addQubitsToState(std::size_t count) { + +void SimulatorTensorNetBase::addQubitsToState(std::size_t count, const void *) { LOG_API_TIME(); if (!m_state) m_state = std::make_unique(count, m_cutnHandle); diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.h b/runtime/nvqir/cutensornet/simulator_cutensornet.h index f939534b67..60bce7df0a 100644 --- a/runtime/nvqir/cutensornet/simulator_cutensornet.h +++ b/runtime/nvqir/cutensornet/simulator_cutensornet.h @@ -55,7 +55,8 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase { virtual cudaq::observe_result observe(const cudaq::spin_op &op) override; /// @brief Add qubits to the underlying quantum state - virtual void addQubitsToState(std::size_t count) override; + virtual void addQubitsToState(std::size_t count, + const void *state = nullptr) override; /// @brief Return the state vector data virtual cudaq::State getStateData() override; diff --git a/runtime/nvqir/qpp/QppCircuitSimulator.cpp b/runtime/nvqir/qpp/QppCircuitSimulator.cpp index abe30c1db8..0116fa1cbd 100644 --- a/runtime/nvqir/qpp/QppCircuitSimulator.cpp +++ b/runtime/nvqir/qpp/QppCircuitSimulator.cpp @@ -88,24 +88,35 @@ class QppCircuitSimulator : public nvqir::CircuitSimulatorBase { /// @brief Override the default sized allocation of qubits /// here to be a bit more efficient than the default implementation - void addQubitsToState(std::size_t count) override { + void addQubitsToState(std::size_t count, + const void *stateDataIn = nullptr) override { if 
(count == 0) return; + auto *stateData = reinterpret_cast *>( + const_cast(stateDataIn)); + if (state.size() == 0) { // If this is the first time, allocate the state - state = qpp::ket::Zero(stateDimension); - state(0) = 1.0; + if (stateData == nullptr) { + state = qpp::ket::Zero(stateDimension); + state(0) = 1.0; + } else + state = qpp::ket::Map(stateData, stateDimension); return; } // If we are resizing an existing state, allocate // a zero state on the n new qubits, and Kron-prod // that with the existing state. - qpp::ket zero_state = qpp::ket::Zero((1UL << count)); - zero_state(0) = 1.0; - state = qpp::kron(zero_state, state); - + if (stateData == nullptr) { + qpp::ket zero_state = qpp::ket::Zero((1UL << count)); + zero_state(0) = 1.0; + state = qpp::kron(zero_state, state); + } else { + // A state over `count` qubits has 2^count amplitudes. + qpp::ket initState = qpp::ket::Map(stateData, 1UL << count); + state = qpp::kron(initState, state); + } return; } diff --git a/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp b/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp index ae4ad1b453..82a01f26a4 100644 --- a/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp +++ b/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp @@ -68,10 +68,14 @@ class QppNoiseCircuitSimulator : public nvqir::QppCircuitSimulator { /// @brief Grow the density matrix by one qubit. void addQubitToState() override { addQubitsToState(1); } - void addQubitsToState(std::size_t count) override { + void addQubitsToState(std::size_t count, + const void *data = nullptr) override { if (count == 0) return; + if (data != nullptr) + throw std::runtime_error("init state not implemented for dm sim"); + if (state.size() == 0) { // If this is the first time, allocate the state state = qpp::cmat::Zero(stateDimension, stateDimension); diff --git a/runtime/nvqir/qpp/density-matrix-cpu.config b/runtime/nvqir/qpp/density-matrix-cpu.config index 56d43d0801..2c74e3f321 100644 --- a/runtime/nvqir/qpp/density-matrix-cpu.config +++ b/runtime/nvqir/qpp/density-matrix-cpu.config @@ -7,4 +7,5 @@ # ============================================================================ # NVQIR_SIMULATION_BACKEND="dm" -TARGET_DESCRIPTION="The Density Matrix CPU Target provides a simulated QPU via OpenMP-enabled, CPU-only density matrix emulation." \ No newline at end of file +PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64" +TARGET_DESCRIPTION="The Density Matrix CPU Target provides a simulated QPU via OpenMP-enabled, CPU-only density matrix emulation." diff --git a/runtime/nvqir/qpp/qpp-cpu.config b/runtime/nvqir/qpp/qpp-cpu.config index b87951ac1d..d73401fb66 100644 --- a/runtime/nvqir/qpp/qpp-cpu.config +++ b/runtime/nvqir/qpp/qpp-cpu.config @@ -7,4 +7,5 @@ # ============================================================================ # NVQIR_SIMULATION_BACKEND="qpp" -TARGET_DESCRIPTION="QPP-based CPU-only backend target" \ No newline at end of file +PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64" +TARGET_DESCRIPTION="QPP-based CPU-only backend target" diff --git a/targettests/execution/from_state.cpp b/targettests/execution/from_state.cpp new file mode 100644 index 0000000000..388f38dcc4 --- /dev/null +++ b/targettests/execution/from_state.cpp @@ -0,0 +1,28 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution.
* + ******************************************************************************/ + +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s + +#include + +__qpu__ void test(std::vector inState) { + cudaq::qvector q = inState; +} + +// CHECK: size 2 + +int main() { + std::vector vec{M_SQRT1_2, 0., 0., M_SQRT1_2}; + auto counts = cudaq::sample(test, vec); + counts.dump(); + + printf("size %zu\n", counts.size()); + + +} \ No newline at end of file diff --git a/test/AST-Quake/qalloc_initialization.cpp b/test/AST-Quake/qalloc_initialization.cpp new file mode 100644 index 0000000000..b96f5b518d --- /dev/null +++ b/test/AST-Quake/qalloc_initialization.cpp @@ -0,0 +1,408 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: cudaq-quake -D CUDAQ_SIMULATION_SCALAR_FP64 %cpp_std %s | cudaq-opt | FileCheck %s +// RUN: cudaq-quake -D CUDAQ_SIMULATION_SCALAR_FP64 %cpp_std %s | cudaq-opt | cudaq-translate --convert-to=qir | FileCheck --check-prefix=QIR %s +// clang-format on + +// Test various flavors of qubits declared with initial state information. + +#include + +struct Vanilla { + std::vector operator()() __qpu__ { + cudaq::qvector v = {0., 1., 1., 0.}; + h(v); + return mz(v); + } +}; + +// clang-format off +// CHECK-LABEL: func.func @__nvqpp__mlirgen__Vanilla() -> !cc.stdvec attributes {"cudaq-entrypoint", "cudaq-kernel"} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK-DAG: %[[VAL_3:.*]] = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr> +// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2> +// CHECK: %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_3]] : (!quake.veq<2>, !cc.ptr>) -> !quake.veq +// clang-format on + +struct Cherry { + std::vector operator()() __qpu__ { + using namespace std::complex_literals; + cudaq::qvector v = std::initializer_list>{ + {0.0, 1.0}, {0.6, 0.4}, {1.0, 0.0}, {0.0, 0.0}}; + h(v); + return mz(v); + } +}; + +// clang-format off +// CHECK-LABEL: func.func @__nvqpp__mlirgen__Cherry() -> !cc.stdvec attributes {"cudaq-entrypoint", "cudaq-kernel"} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 4.000000e-01 : f64 +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 6.000000e-01 : f64 +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f64 +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1.000000e+00 : f64 +// CHECK: %[[VAL_7:.*]] = complex.create %[[VAL_5]], %[[VAL_6]] : complex +// CHECK: %[[VAL_8:.*]] = complex.create %[[VAL_4]], %[[VAL_3]] : complex +// CHECK: %[[VAL_9:.*]] = complex.create %[[VAL_6]], %[[VAL_5]] : complex +// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.array x 4> +// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr x 4>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr> +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr x 4>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_8]], %[[VAL_12]] : !cc.ptr> +// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_10]][2] : (!cc.ptr x 4>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_9]], %[[VAL_13]] : !cc.ptr> +// CHECK: %[[VAL_14:.*]] = quake.alloca 
diff --git a/test/AST-Quake/qalloc_initialization.cpp b/test/AST-Quake/qalloc_initialization.cpp
new file mode 100644
index 0000000000..b96f5b518d
--- /dev/null
+++ b/test/AST-Quake/qalloc_initialization.cpp
@@ -0,0 +1,408 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// clang-format off
+// RUN: cudaq-quake -D CUDAQ_SIMULATION_SCALAR_FP64 %cpp_std %s | cudaq-opt | FileCheck %s
+// RUN: cudaq-quake -D CUDAQ_SIMULATION_SCALAR_FP64 %cpp_std %s | cudaq-opt | cudaq-translate --convert-to=qir | FileCheck --check-prefix=QIR %s
+// clang-format on
+
+// Test various flavors of qubits declared with initial state information.
+
+#include <cudaq.h>
+
+struct Vanilla {
+  std::vector<bool> operator()() __qpu__ {
+    cudaq::qvector v = {0., 1., 1., 0.};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__Vanilla() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[VAL_3:.*]] = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
+// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2>
+// CHECK: %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_3]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+// clang-format on
+
+struct Cherry {
+  std::vector<bool> operator()() __qpu__ {
+    using namespace std::complex_literals;
+    cudaq::qvector v = std::initializer_list<std::complex<double>>{
+        {0.0, 1.0}, {0.6, 0.4}, {1.0, 0.0}, {0.0, 0.0}};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__Cherry() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 4.000000e-01 : f64
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 6.000000e-01 : f64
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK: %[[VAL_7:.*]] = complex.create %[[VAL_5]], %[[VAL_6]] : complex<f64>
+// CHECK: %[[VAL_8:.*]] = complex.create %[[VAL_4]], %[[VAL_3]] : complex<f64>
+// CHECK: %[[VAL_9:.*]] = complex.create %[[VAL_6]], %[[VAL_5]] : complex<f64>
+// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_8]], %[[VAL_12]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_10]][2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_9]], %[[VAL_13]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_14:.*]] = quake.alloca !quake.veq<2>
+// CHECK: %[[VAL_15:.*]] = quake.init_state %[[VAL_14]], %[[VAL_10]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f64> x 4>>) -> !quake.veq<2>
+// clang-format on
+
+struct MooseTracks {
+  std::vector<bool> operator()() __qpu__ {
+    using namespace std::complex_literals;
+    cudaq::qvector v = {
+        std::complex{0.0, 1.0}, std::complex{0.75, 0.25},
+        std::complex{1.0, 0.0}, std::complex{0.0, 0.0}};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__MooseTracks() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2.500000e-01 : f64
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 7.500000e-01 : f64
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK: %[[VAL_7:.*]] = complex.create %[[VAL_5]], %[[VAL_6]] : complex<f64>
+// CHECK: %[[VAL_8:.*]] = complex.create %[[VAL_4]], %[[VAL_3]] : complex<f64>
+// CHECK: %[[VAL_9:.*]] = complex.create %[[VAL_6]], %[[VAL_5]] : complex<f64>
+// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_8]], %[[VAL_12]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_10]][2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_9]], %[[VAL_13]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_14:.*]] = quake.alloca !quake.veq<2>
+// CHECK: %[[VAL_15:.*]] = quake.init_state %[[VAL_14]], %[[VAL_10]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f64> x 4>>) -> !quake.veq<2>
+// clang-format on
+
+struct RockyRoad {
+  std::vector<bool> operator()() __qpu__ {
+    using namespace std::complex_literals;
+    cudaq::qvector v = {0.0 + 1.0i, std::complex{0.8, 0.2}, 1.0 + 0.0i,
+                        std::complex{0.0, 0.0}};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__RockyRoad() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f{{[1280]+}}
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 2.000000e-01 : f64
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8.000000e-01 : f64
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1.000000e+00 : f{{[1280]+}}
+// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG: %[[VAL_9:.*]] = cc.alloca f64
+// CHECK: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr<f64>
+// CHECK: %[[VAL_10:.*]] = call @_ZNSt8literals16complex_literalsli1iEe(%[[VAL_7]]) : (f{{[1280]+}}) -> complex<f64>
+// CHECK: %[[VAL_11:.*]] = cc.alloca complex<f64>
+// CHECK: cc.store %[[VAL_10]], %[[VAL_11]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_12:.*]] = call @_ZStplIdESt7complexIT_ERKS1_RKS2_(%[[VAL_9]], %[[VAL_11]]) : (!cc.ptr<f64>, !cc.ptr<complex<f64>>) -> complex<f64>
+// CHECK: %[[VAL_13:.*]] = complex.create %[[VAL_6]], %[[VAL_5]] : complex<f64>
+// CHECK: %[[VAL_14:.*]] = cc.alloca f64
+// CHECK: cc.store %[[VAL_4]], %[[VAL_14]] : !cc.ptr<f64>
+// CHECK: %[[VAL_15:.*]] = call @_ZNSt8literals16complex_literalsli1iEe(%[[VAL_3]]) : (f{{[1280]+}}) -> complex<f64>
+// CHECK: %[[VAL_16:.*]] = cc.alloca complex<f64>
+// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_17:.*]] = call @_ZStplIdESt7complexIT_ERKS1_RKS2_(%[[VAL_14]], %[[VAL_16]]) : (!cc.ptr<f64>, !cc.ptr<complex<f64>>) -> complex<f64>
+// CHECK: %[[VAL_18:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][0] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_12]], %[[VAL_19]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_18]][1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_13]], %[[VAL_20]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_18]][2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_17]], %[[VAL_21]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_22:.*]] = quake.alloca !quake.veq<2>
+// CHECK: %[[VAL_23:.*]] = quake.init_state %[[VAL_22]], %[[VAL_18]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f64> x 4>>) -> !quake.veq<2>
+// clang-format on
+
+std::vector<double> getTwoTimesRank();
+
+struct Pistachio {
+  bool operator()() __qpu__ {
+    cudaq::qvector v{getTwoTimesRank()};
+    h(v);
+    return mz(v[0]);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__Pistachio() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK: %[[VAL_2:.*]] = call @_Z15getTwoTimesRankv() : () -> !cc.stdvec<f64>
+// CHECK: %[[VAL_30:.*]] = cc.stdvec_size %[[VAL_2]] : (!cc.stdvec<f64>) -> i64
+// CHECK: %[[VAL_3:.*]] = math.cttz %[[VAL_30]] : i64
+// CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_2]] : (!cc.stdvec<f64>) -> !cc.ptr<f64>
+// CHECK: %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
+// CHECK: %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_4]] : (!quake.veq<?>, !cc.ptr<f64>) -> !quake.veq<?>
+// clang-format on
+
+struct ChocolateMint {
+  bool operator()() __qpu__ {
+    cudaq::qvector v = getTwoTimesRank();
+    h(v);
+    return mz(v[0]);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__ChocolateMint() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK: %[[VAL_2:.*]] = call @_Z15getTwoTimesRankv() : () -> !cc.stdvec<f64>
+// CHECK: %[[VAL_30:.*]] = cc.stdvec_size %[[VAL_2]] : (!cc.stdvec<f64>) -> i64
+// CHECK: %[[VAL_3:.*]] = math.cttz %[[VAL_30]] : i64
+// CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_2]] : (!cc.stdvec<f64>) -> !cc.ptr<f64>
+// CHECK: %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
+// CHECK: %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_4]] : (!quake.veq<?>, !cc.ptr<f64>) -> !quake.veq<?>
+// clang-format on
+
+std::vector<std::complex<double>> getComplexInit();
+
+struct Neapolitan {
+  std::vector<bool> operator()() __qpu__ {
+    cudaq::qvector v{getComplexInit()};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__Neapolitan() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK: %[[VAL_3:.*]] = call @_Z14getComplexInitv() : () -> !cc.stdvec<complex<f64>>
+// CHECK: %[[VAL_30:.*]] = cc.stdvec_size %[[VAL_3]] : (!cc.stdvec<complex<f64>>) -> i64
+// CHECK: %[[VAL_4:.*]] = math.cttz %[[VAL_30]] : i64
+// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec<complex<f64>>) -> !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_6:.*]] = quake.alloca !quake.veq<?>[%[[VAL_4]] : i64]
+// CHECK: %[[VAL_7:.*]] = quake.init_state %[[VAL_6]], %[[VAL_5]] : (!quake.veq<?>, !cc.ptr<complex<f64>>) -> !quake.veq<?>
+// clang-format on
+
+struct ButterPecan {
+  std::vector<bool> operator()() __qpu__ {
+    cudaq::qvector v = getComplexInit();
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__ButterPecan() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK: %[[VAL_3:.*]] = call @_Z14getComplexInitv() : () -> !cc.stdvec<complex<f64>>
+// CHECK: %[[VAL_30:.*]] = cc.stdvec_size %[[VAL_3]] : (!cc.stdvec<complex<f64>>) -> i64
+// CHECK: %[[VAL_4:.*]] = math.cttz %[[VAL_30]] : i64
+// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec<complex<f64>>) -> !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_6:.*]] = quake.alloca !quake.veq<?>[%[[VAL_4]] : i64]
+// CHECK: %[[VAL_7:.*]] = quake.init_state %[[VAL_6]], %[[VAL_5]] : (!quake.veq<?>, !cc.ptr<complex<f64>>) -> !quake.veq<?>
+// clang-format on
+
+__qpu__ auto Strawberry() {
+  cudaq::qubit q = {0., 1.};
+  return mz(q);
+}
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_Strawberry._Z10Strawberryv() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK: %[[VAL_2:.*]] = complex.create %[[VAL_1]], %[[VAL_1]] : complex<f64>
+// CHECK: %[[VAL_3:.*]] = complex.create %[[VAL_0]], %[[VAL_1]] : complex<f64>
+// CHECK: %[[VAL_4:.*]] = cc.alloca !cc.array<complex<f64> x 2>
+// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]][0] : (!cc.ptr<!cc.array<complex<f64> x 2>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_2]], %[[VAL_5]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr<!cc.array<complex<f64> x 2>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_3]], %[[VAL_6]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_7:.*]] = quake.alloca !quake.veq<1>
+// CHECK: %[[VAL_8:.*]] = quake.init_state %[[VAL_7]], %[[VAL_4]] : (!quake.veq<1>, !cc.ptr<!cc.array<complex<f64> x 2>>) -> !quake.veq<1>
+// CHECK: %[[VAL_9:.*]] = quake.extract_ref %[[VAL_8]][0] : (!quake.veq<1>) -> !quake.ref
+// CHECK: %[[VAL_10:.*]] = quake.mz %[[VAL_9]] : (!quake.ref) -> !quake.measure
+// CHECK: %[[VAL_11:.*]] = quake.discriminate %[[VAL_10]] : (!quake.measure) -> i1
+// CHECK: return %[[VAL_11]] : i1
+// CHECK: }
+// clang-format on
+
+#if 0
+// The ket syntax is not yet provided in the headers.
+__qpu__ auto GoldRibbon() {
+  cudaq::qubit q = cudaq::ket::one;
+  return mz(q);
+}
+#endif
+
+__qpu__ bool Peppermint() {
+  cudaq::qubit q = {M_SQRT1_2, M_SQRT1_2};
+  return mz(q);
+}
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_Peppermint._Z10Peppermintv() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0.70710678118654757 : f64
+// CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK: %[[VAL_2:.*]] = complex.create %[[VAL_0]], %[[VAL_1]] : complex<f64>
+// CHECK: %[[VAL_3:.*]] = complex.create %[[VAL_0]], %[[VAL_1]] : complex<f64>
+// CHECK: %[[VAL_4:.*]] = cc.alloca !cc.array<complex<f64> x 2>
+// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]][0] : (!cc.ptr<!cc.array<complex<f64> x 2>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_2]], %[[VAL_5]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr<!cc.array<complex<f64> x 2>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_3]], %[[VAL_6]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_7:.*]] = quake.alloca !quake.veq<1>
+// CHECK: %[[VAL_8:.*]] = quake.init_state %[[VAL_7]], %[[VAL_4]] : (!quake.veq<1>, !cc.ptr<!cc.array<complex<f64> x 2>>) -> !quake.veq<1>
+// CHECK: %[[VAL_9:.*]] = quake.extract_ref %[[VAL_8]][0] : (!quake.veq<1>) -> !quake.ref
+// CHECK: %[[VAL_10:.*]] = quake.mz %[[VAL_9]] : (!quake.ref) -> !quake.measure
+// CHECK: %[[VAL_11:.*]] = quake.discriminate %[[VAL_10]] : (!quake.measure) -> i1
+// CHECK: return %[[VAL_11]] : i1
+// CHECK: }
+// clang-format on
+
+//===----------------------------------------------------------------------===//
+//
+// QIR checks
+//
+//===----------------------------------------------------------------------===//
+
+// clang-format off
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__Vanilla() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call %[[VAL_1:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 2, i8* nonnull bitcast ([4 x double]* @__nvqpp__rodata_init_0 to i8*))
+// QIR: %[[VAL_2:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_1]]* %[[VAL_0]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__Cherry() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = alloca [4 x { double, double }], align 8
+// QIR: %[[VAL_1:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 0, i32 0
+// QIR: store double 0.000000e+00, double* %[[VAL_1]], align 8
+// QIR: %[[VAL_2:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 0, i32 1
+// QIR: store double 1.000000e+00, double* %[[VAL_2]], align 8
+// QIR: %[[VAL_3:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 1, i32 0
+// QIR: store double 6.000000e-01, double* %[[VAL_3]], align 8
+// QIR: %[[VAL_4:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 1, i32 1
+// QIR: store double 4.000000e-01, double* %[[VAL_4]], align 8
+// QIR: %[[VAL_5:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 2, i32 0
+// QIR: store double 1.000000e+00, double* %[[VAL_5]], align 8
+// QIR: %[[VAL_6:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 2, i32 1
+// QIR: %[[VAL_7:.*]] = bitcast [4 x { double, double }]* %[[VAL_0]] to i8*
+// QIR: call void @llvm.memset
+// QIR: %[[VAL_8:.*]] = call %[[VAL_9:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 2, i8* nonnull %[[VAL_7]])
+// QIR: %[[VAL_10:.*]] = call i64 @__quantum__rt__array_get_size_1d(%[[VAL_9]]* %[[VAL_8]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__MooseTracks() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = alloca [4 x { double, double }], align 8
+// QIR: %[[VAL_1:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 0, i32 0
+// QIR: store double 0.000000e+00, double* %[[VAL_1]], align 8
+// QIR: %[[VAL_2:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 0, i32 1
+// QIR: store double 1.000000e+00, double* %[[VAL_2]], align 8
+// QIR: %[[VAL_3:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 1, i32 0
+// QIR: store double 7.500000e-01, double* %[[VAL_3]], align 8
+// QIR: %[[VAL_4:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 1, i32 1
+// QIR: store double 2.500000e-01, double* %[[VAL_4]], align 8
+// QIR: %[[VAL_5:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 2, i32 0
+// QIR: store double 1.000000e+00, double* %[[VAL_5]], align 8
+// QIR: %[[VAL_6:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 2, i32 1
+// QIR: %[[VAL_7:.*]] = bitcast [4 x { double, double }]* %[[VAL_0]] to i8*
+// QIR: call void @llvm.memset
+// QIR: %[[VAL_8:.*]] = call %[[VAL_9:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 2, i8* nonnull %[[VAL_7]])
+// QIR: %[[VAL_10:.*]] = call i64 @__quantum__rt__array_get_size_1d(%[[VAL_9]]* %[[VAL_8]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__RockyRoad() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = alloca double, align 8
+// QIR: store double 0.000000e+00, double* %[[VAL_0]], align 8
+// QIR: %[[VAL_1:.*]] = tail call { double, double } @_ZNSt8literals16complex_literalsli1iEe(
+// QIR: %[[VAL_2:.*]] = alloca { double, double }, align 8
+// QIR: %[[VAL_3:.*]] = extractvalue { double, double } %[[VAL_1]], 0
+// QIR: %[[VAL_4:.*]] = getelementptr inbounds { double, double }, { double, double }* %[[VAL_2]], i64 0, i32 0
+// QIR: store double %[[VAL_3]], double* %[[VAL_4]], align 8
+// QIR: %[[VAL_5:.*]] = extractvalue { double, double } %[[VAL_1]], 1
+// QIR: %[[VAL_6:.*]] = getelementptr inbounds { double, double }, { double, double }* %[[VAL_2]], i64 0, i32 1
+// QIR: store double %[[VAL_5]], double* %[[VAL_6]], align 8
+// QIR: %[[VAL_7:.*]] = call { double, double } @_ZStplIdESt7complexIT_ERKS1_RKS2_(double* nonnull %[[VAL_0]], { double, double }* nonnull %[[VAL_2]])
+// QIR: %[[VAL_8:.*]] = alloca double, align 8
+// QIR: store double 1.000000e+00, double* %[[VAL_8]], align 8
+// QIR: %[[VAL_9:.*]] = call { double, double } @_ZNSt8literals16complex_literalsli1iEe(
+// QIR: %[[VAL_10:.*]] = alloca { double, double }, align 8
+// QIR: %[[VAL_11:.*]] = extractvalue { double, double } %[[VAL_9]], 0
+// QIR: %[[VAL_12:.*]] = getelementptr inbounds { double, double }, { double, double }* %[[VAL_10]], i64 0, i32 0
+// QIR: store double %[[VAL_11]], double* %[[VAL_12]], align 8
+// QIR: %[[VAL_13:.*]] = extractvalue { double, double } %[[VAL_9]], 1
+// QIR: %[[VAL_14:.*]] = getelementptr inbounds { double, double }, { double, double }* %[[VAL_10]], i64 0, i32 1
+// QIR: store double %[[VAL_13]], double* %[[VAL_14]], align 8
+// QIR: %[[VAL_15:.*]] = call { double, double } @_ZStplIdESt7complexIT_ERKS1_RKS2_(double* nonnull %[[VAL_8]], { double, double }* nonnull %[[VAL_10]])
+// QIR: %[[VAL_16:.*]] = alloca [4 x { double, double }], align 8
+// QIR: %[[VAL_17:.*]] = extractvalue { double, double } %[[VAL_7]], 0
+// QIR: %[[VAL_18:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 0, i32 0
+// QIR: store double %[[VAL_17]], double* %[[VAL_18]], align 8
+// QIR: %[[VAL_19:.*]] = extractvalue { double, double } %[[VAL_7]], 1
+// QIR: %[[VAL_20:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 0, i32 1
+// QIR: store double %[[VAL_19]], double* %[[VAL_20]], align 8
+// QIR: %[[VAL_21:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 1, i32 0
+// QIR: store double 8.000000e-01, double* %[[VAL_21]], align 8
+// QIR: %[[VAL_22:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 1, i32 1
+// QIR: store double 2.000000e-01, double* %[[VAL_22]], align 8
+// QIR: %[[VAL_23:.*]] = extractvalue { double, double } %[[VAL_15]], 0
+// QIR: %[[VAL_24:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 2, i32 0
+// QIR: store double %[[VAL_23]], double* %[[VAL_24]], align 8
+// QIR: %[[VAL_25:.*]] = extractvalue { double, double } %[[VAL_15]], 1
+// QIR: %[[VAL_26:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 2, i32 1
+// QIR: store double %[[VAL_25]], double* %[[VAL_26]], align 8
+// QIR: %[[VAL_27:.*]] = bitcast [4 x { double, double }]* %[[VAL_16]] to i8*
+// QIR: %[[VAL_28:.*]] = call %[[VAL_29:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 2, i8* nonnull %[[VAL_27]])
+// QIR: %[[VAL_30:.*]] = call i64 @__quantum__rt__array_get_size_1d(%[[VAL_29]]* %[[VAL_28]])
+// QIR: }
+
+// QIR-LABEL: define i1 @__nvqpp__mlirgen__Pistachio() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call { double*, i64 } @_Z15getTwoTimesRankv()
+// QIR: %[[VAL_I:.*]] = extractvalue { double*, i64 } %[[VAL_0]], 1
+// QIR: %[[VAL_1:.*]] = tail call i64 @llvm.cttz.i64(i64 %[[VAL_I]], i1 false)
+// QIR: %[[VAL_2:.*]] = extractvalue { double*, i64 } %[[VAL_0]], 0
+// QIR: %[[VAL_3:.*]] = bitcast double* %[[VAL_2]] to i8*
+// QIR: %[[VAL_4:.*]] = tail call %[[VAL_5:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 %[[VAL_1]], i8* %[[VAL_3]])
+// QIR: %[[VAL_6:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_5]]* %[[VAL_4]])
+// QIR: }
+
+// QIR-LABEL: define i1 @__nvqpp__mlirgen__ChocolateMint() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call { double*, i64 } @_Z15getTwoTimesRankv()
+// QIR: %[[VAL_I:.*]] = extractvalue { double*, i64 } %[[VAL_0]], 1
+// QIR: %[[VAL_1:.*]] = tail call i64 @llvm.cttz.i64(i64 %[[VAL_I]], i1 false)
+// QIR: %[[VAL_2:.*]] = extractvalue { double*, i64 } %[[VAL_0]], 0
+// QIR: %[[VAL_3:.*]] = bitcast double* %[[VAL_2]] to i8*
+// QIR: %[[VAL_4:.*]] = tail call %[[VAL_5:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 %[[VAL_1]], i8* %[[VAL_3]])
+// QIR: %[[VAL_6:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_5]]* %[[VAL_4]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__Neapolitan() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call { { double, double }*, i64 } @_Z14getComplexInitv()
+// QIR: %[[VAL_I:.*]] = extractvalue { { double, double }*, i64 } %[[VAL_0]], 1
+// QIR: %[[VAL_1:.*]] = tail call i64 @llvm.cttz.i64(i64 %[[VAL_I]], i1 false)
+// QIR: %[[VAL_2:.*]] = extractvalue { { double, double }*, i64 } %[[VAL_0]], 0
+// QIR: %[[VAL_3:.*]] = bitcast { double, double }* %[[VAL_2]] to i8*
+// QIR: %[[VAL_4:.*]] = tail call %[[VAL_5:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 %[[VAL_1]], i8* %[[VAL_3]])
+// QIR: %[[VAL_6:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_5]]* %[[VAL_4]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__ButterPecan() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call { { double, double }*, i64 } @_Z14getComplexInitv()
+// QIR: %[[VAL_I:.*]] = extractvalue { { double, double }*, i64 } %[[VAL_0]], 1
+// QIR: %[[VAL_1:.*]] = tail call i64 @llvm.cttz.i64(i64 %[[VAL_I]], i1 false)
+// QIR: %[[VAL_2:.*]] = extractvalue { { double, double }*, i64 } %[[VAL_0]], 0
+// QIR: %[[VAL_3:.*]] = bitcast { double, double }* %[[VAL_2]] to i8*
+// QIR: %[[VAL_4:.*]] = tail call %[[VAL_5:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 %[[VAL_1]], i8* %[[VAL_3]])
+// QIR: %[[VAL_6:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_5]]* %[[VAL_4]])
+// QIR: }
+
diff --git a/test/AST-Quake/type_alias.cpp b/test/AST-Quake/type_alias.cpp
index 8d5f421b0c..306b63f09d 100644
--- a/test/AST-Quake/type_alias.cpp
+++ b/test/AST-Quake/type_alias.cpp
@@ -54,11 +54,11 @@ __qpu__ big kernel3(big arg) {
 }
 
 // CHECK-LABEL: func.func @__nvqpp__mlirgen__function_kernel3._Z7kernel3e(
-// CHECK-SAME: %[[VAL_0:.*]]: f128{{.*}}) -> f128
-// CHECK: %[[VAL_1:.*]] = cc.alloca f128
-// CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<f128>
+// CHECK-SAME: %[[VAL_0:.*]]: f[[TY:[1280]+]]{{.*}}) -> f[[TY]]
+// CHECK: %[[VAL_1:.*]] = cc.alloca f[[TY]]
+// CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<f[[TY]]>
 // CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref
 // CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> ()
-// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr<f128>
-// CHECK: return %[[VAL_3]] : f128
+// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr<f[[TY]]>
+// CHECK: return %[[VAL_3]] : f[[TY]]
 // CHECK: }
diff --git a/test/Quake/add_dealloc.qke b/test/Quake/add_dealloc-0.qke
similarity index 100%
rename from test/Quake/add_dealloc.qke
rename to test/Quake/add_dealloc-0.qke
diff --git a/test/Quake/add_dealloc-1.qke b/test/Quake/add_dealloc-1.qke
new file mode 100644
index 0000000000..0348d1c7f3
--- /dev/null
+++ b/test/Quake/add_dealloc-1.qke
@@ -0,0 +1,45 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under  //
+// the terms of the Apache License 2.0 which accompanies this distribution.  //
+// ========================================================================== //
+
+// RUN: cudaq-opt --add-dealloc %s | FileCheck %s
+
+module {
+  func.func @__nvqpp__mlirgen__FromState() {
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %0 = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
+    %1 = quake.alloca !quake.veq<4>
+    %2 = quake.init_state %1, %0 : (!quake.veq<4>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<4>
+    %3 = cc.loop while ((%arg0 = %c0) -> (index)) {
+      %4 = arith.cmpi slt, %arg0, %c4 : index
+      cc.condition %4(%arg0 : index)
+    } do {
+    ^bb0(%arg0: index):
+      %4 = quake.extract_ref %2[%arg0] : (!quake.veq<4>, index) -> !quake.ref
+      quake.h %4 : (!quake.ref) -> ()
+      cc.continue %arg0 : index
+    } step {
+    ^bb0(%arg0: index):
+      %4 = arith.addi %arg0, %c1 : index
+      cc.continue %4 : index
+    } {invariant}
+    return
+  }
+  cc.global constant @__nvqpp__rodata_init_0 (dense<[1.000000e+00, 0.000000e+00, 5.000000e-01, 5.000000e-01]> : tensor<4xf64>) : !cc.array<f64 x 4>
+}
+
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__FromState() {
+// CHECK: %[[VAL_3:.*]] = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
+// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<4>
+// CHECK: %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_3]] : (!quake.veq<4>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<4>
+// CHECK: quake.dealloc %[[VAL_5]] : !quake.veq<4>
+// CHECK: return
+// CHECK: }
+// CHECK: cc.global constant @__nvqpp__rodata_init_0 (dense<[1.000000e+00, 0.000000e+00, 5.000000e-01, 5.000000e-01]> : tensor<4xf64>) : !cc.array<f64 x 4>
+
diff --git a/test/Transforms/mapping_non_unitaries.qke b/test/Transforms/mapping_non_unitaries.qke
index 1ff3117e05..d6f15b970b 100644
--- a/test/Transforms/mapping_non_unitaries.qke
+++ b/test/Transforms/mapping_non_unitaries.qke
@@ -14,14 +14,14 @@
 // RUN: cudaq-opt --qubit-mapping=device=grid\(3,3\) %s | CircuitCheck --up-to-mapping %s
 // RUN: cudaq-opt --qubit-mapping=device=grid\(1,5\) %s | CircuitCheck --up-to-mapping %s
 // RUN: cudaq-opt --qubit-mapping=device=grid\(5,1\) %s | CircuitCheck --up-to-mapping %s
-// RUN: cudaq-opt --qubit-mapping=device=path %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=path\(5\) %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=ring\(5\) %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=star\(5,2\) %s | FileCheck --check-prefix STAR52 %s
-// RUN: cudaq-opt --qubit-mapping=device=star\(5,0\) %s | FileCheck --check-prefix STAR50 %s
-// RUN: cudaq-opt --qubit-mapping=device=grid\(3,3\) %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=grid\(1,5\) %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=grid\(5,1\) %s | FileCheck --check-prefix CHECK %s
+// RUN: cudaq-opt --qubit-mapping=device=path %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=path\(5\) %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=ring\(5\) %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=star\(5,2\) %s | FileCheck --check-prefix=STAR52 %s
+// RUN: cudaq-opt --qubit-mapping=device=star\(5,0\) %s | FileCheck --check-prefix=STAR50 %s
+// RUN: cudaq-opt --qubit-mapping=device=grid\(3,3\) %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=grid\(1,5\) %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=grid\(5,1\) %s | FileCheck %s
 
 func.func @test_measurement() {
   %0 = quake.null_wire
diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in
index 480ee1f66e..fb33f8510e 100644
--- a/tools/nvqpp/nvq++.in
+++ b/tools/nvqpp/nvq++.in
@@ -637,8 +637,12 @@ fi
 LLC=${LLVMBIN}llc${llvm_suffix}
 
 if ${LIBRARY_MODE}; then
+  PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_LIBRARY_MODE"
   ENABLE_KERNEL_EXECUTION=false
 fi
+if [[ "${TARGET_CONFIG}" != *"CUDAQ_SIMULATION_SCALAR_"* ]]; then
+  PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64"
+fi
 
 RUN_OPT=false
 OPT_PASSES=
@@ -697,7 +701,7 @@ for i in ${SRCS}; do
     # If LIBRARY_MODE explicitly requested, then
     # simply compile with the classical compiler.
     if ${LIBRARY_MODE}; then
-      run ${CMAKE_FALLBACK_HOST_CXX} ${CLANG_VERBOSE} ${CLANG_RESOURCE_DIR} ${COMPILER_FLAGS} -DCUDAQ_LIBRARY_MODE ${PREPROCESSOR_DEFINES} ${INCLUDES} ${ARGS} -o ${file}.o -c $i
+      run ${CMAKE_FALLBACK_HOST_CXX} ${CLANG_VERBOSE} ${CLANG_RESOURCE_DIR} ${COMPILER_FLAGS} ${PREPROCESSOR_DEFINES} ${INCLUDES} ${ARGS} -o ${file}.o -c $i
       OBJS="${OBJS} ${file}.o"
       # Go to the next iteration, maybe there
       # will be cudaq kernels there
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index c609f8926d..0f62ae4a97 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -72,15 +72,18 @@ macro (create_tests_with_backend NVQIR_BACKEND EXTRA_BACKEND_TESTER)
     cudaq-builder
     gtest_main)
   set(TEST_LABELS "")
+  if (${NVQIR_BACKEND} STREQUAL "qpp")
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_SIMULATION_SCALAR_FP64)
+  endif()
   if (${NVQIR_BACKEND} STREQUAL "dm")
-    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_DM)
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_DM -DCUDAQ_SIMULATION_SCALAR_FP64)
   endif()
   if (${NVQIR_BACKEND} STREQUAL "tensornet")
-    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET)
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET -DCUDAQ_SIMULATION_SCALAR_FP64)
     set(TEST_LABELS "gpu_required")
   endif()
   if (${NVQIR_BACKEND} STREQUAL "tensornet-mps")
-    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET)
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET -DCUDAQ_SIMULATION_SCALAR_FP64)
     set(TEST_LABELS "gpu_required")
   endif()
   if (${NVQIR_BACKEND} STREQUAL "custatevec-fp32")
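The driver, config, and CMake changes above all thread the same `CUDAQ_SIMULATION_SCALAR_FP64` define through to host compilation, with nvq++ falling back to FP64 whenever the target config names no scalar type. A sketch of how a header might consume the define to select the simulation amplitude type; this mirrors the intent of the change, not the literal CUDA-Q headers:

#include <complex>

// Hypothetical consumer of the build-time define: pick the amplitude type
// the simulator expects. FP64 is the fallback nvq++ injects when a target
// config does not specify a scalar type.
#if defined(CUDAQ_SIMULATION_SCALAR_FP32)
using simulation_scalar = std::complex<float>;
#else // CUDAQ_SIMULATION_SCALAR_FP64 (the default)
using simulation_scalar = std::complex<double>;
#endif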
diff --git a/unittests/integration/builder_tester.cpp b/unittests/integration/builder_tester.cpp
index 0b7a554fb6..a5a19b7631 100644
--- a/unittests/integration/builder_tester.cpp
+++ b/unittests/integration/builder_tester.cpp
@@ -1203,7 +1203,73 @@ CUDAQ_TEST(BuilderTester, checkControlledRotations) {
   }
 }
 
-#ifndef CUDAQ_BACKEND_DM
+#if !defined(CUDAQ_BACKEND_DM) && !defined(CUDAQ_BACKEND_TENSORNET)
+
+CUDAQ_TEST(BuilderTester, checkFromStateVector) {
+  std::vector vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+  {
+    auto kernel = cudaq::make_kernel();
+    auto qubits = kernel.qalloc(vec);
+    std::cout << kernel << "\n";
+    auto counts = cudaq::sample(kernel);
+    counts.dump();
+    EXPECT_EQ(counts.size(), 2);
+    std::size_t counter = 0;
+    for (auto &[k, v] : counts) {
+      counter += v;
+      EXPECT_TRUE(k == "00" || k == "11");
+    }
+    EXPECT_EQ(counter, 1000);
+  }
+
+  {
+    auto [kernel, initState] = cudaq::make_kernel<std::vector<double>>();
+    auto qubits = kernel.qalloc(initState);
+    std::cout << kernel << "\n";
+    auto counts = cudaq::sample(kernel, vec);
+    counts.dump();
+    EXPECT_EQ(counts.size(), 2);
+    std::size_t counter = 0;
+    for (auto &[k, v] : counts) {
+      counter += v;
+      EXPECT_TRUE(k == "00" || k == "11");
+    }
+    EXPECT_EQ(counter, 1000);
+  }
+
+  {
+    // 2 qubit 11 state
+    std::vector vec{0., 0., 0., 1.};
+    auto [kernel, initState] = cudaq::make_kernel<std::vector<double>>();
+    auto qubits = kernel.qalloc(initState);
+    // induce the need for a kron prod between
+    // [0,0,0,1] and [1, 0, 0, 0]
+    auto anotherOne = kernel.qalloc(2);
+    std::cout << kernel << "\n";
+    auto counts = cudaq::sample(kernel, vec);
+    counts.dump();
+    EXPECT_EQ(counts.size(), 1);
+    EXPECT_EQ(counts.count("1100"), 1000);
+  }
+
+  {
+    // 2 qubit 11 state
+    std::vector vec{0., 0., 0., 1.};
+    auto [kernel, initState] = cudaq::make_kernel<std::vector<double>>();
+    auto qubits = kernel.qalloc(initState);
+    // induce the need for a kron prod between
+    // [0,0,0,1] and [1, 0]
+    auto anotherOne = kernel.qalloc();
+    std::cout << kernel << "\n";
+    auto counts = cudaq::sample(kernel, vec);
+    counts.dump();
+    EXPECT_EQ(counts.size(), 1);
+    EXPECT_EQ(counts.count("110"), 1000);
+  }
+}
 
 CUDAQ_TEST(BuilderTester, checkCanProgressivelyBuild) {
   auto kernel = cudaq::make_kernel();
diff --git a/unittests/qudit/simple_qudit/SimpleQuditExecutionManager.cpp b/unittests/qudit/simple_qudit/SimpleQuditExecutionManager.cpp
index e92f21b740..d4ff6b74c7 100644
--- a/unittests/qudit/simple_qudit/SimpleQuditExecutionManager.cpp
+++ b/unittests/qudit/simple_qudit/SimpleQuditExecutionManager.cpp
@@ -117,7 +117,11 @@ class SimpleQuditExecutionManager : public cudaq::BasicExecutionManager {
   cudaq::SpinMeasureResult measure(cudaq::spin_op &op) override {
     return cudaq::SpinMeasureResult();
   }
-
+  void initializeState(const std::vector<cudaq::QuditInfo> &targets,
+                       const void *state,
+                       cudaq::simulation_precision precision) override {
+    throw std::runtime_error("initializeState not implemented.");
+  }
   void resetQudit(const cudaq::QuditInfo &id) override {}
 };
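The qudit execution manager only stubs out the new `initializeState` hook, throwing on use. A backend that did support it would have to honor the `precision` argument before touching the raw buffer. A hedged, self-contained sketch of that dispatch; `loadAmplitudes` and the local `precision_t` enum are hypothetical stand-ins (the latter mirrors `cudaq::simulation_precision`), not CUDA-Q APIs:

#include <complex>
#include <cstddef>

// Stand-in for cudaq::simulation_precision.
enum class precision_t { fp32, fp64 };

// Hypothetical backend helper that copies amplitudes into simulator state.
template <typename T>
void loadAmplitudes(const std::complex<T> *data, std::size_t n);

void initializeStateSketch(std::size_t numTargets, const void *state,
                           precision_t precision) {
  // Assuming two-level targets, the caller hands over 2^n amplitudes for
  // n targets; the precision flag says how wide each component is, so the
  // raw pointer must be reinterpreted accordingly before reading.
  std::size_t numAmplitudes = 1UL << numTargets;
  if (precision == precision_t::fp32)
    loadAmplitudes(reinterpret_cast<const std::complex<float> *>(state),
                   numAmplitudes);
  else
    loadAmplitudes(reinterpret_cast<const std::complex<double> *>(state),
                   numAmplitudes);
}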