diff --git a/docs/sphinx/examples/cpp/other/builder/builder.cpp b/docs/sphinx/examples/cpp/other/builder/builder.cpp
index 8c9a109864..79afbc0bbd 100644
--- a/docs/sphinx/examples/cpp/other/builder/builder.cpp
+++ b/docs/sphinx/examples/cpp/other/builder/builder.cpp
@@ -17,6 +17,14 @@
 #include
 #include
 
+static bool results_are_close(const cudaq::sample_result &f1,
+                              const cudaq::sample_result &f2) {
+  // Stub for a fancy comparison.
+  f1.dump();
+  f2.dump();
+  return true;
+}
+
 // This example demonstrates various uses for the `cudaq::builder`
 // type. This type enables one to dynamically construct callable
 // CUDA Quantum kernels via just-in-time compilation. The typical workflow
@@ -114,6 +122,37 @@ int main() {
     counts.dump();
   }
 
+  {
+    // In a simulated environment, it is sometimes useful to be able to
+    // specify an initial state vector. The length of the initial state
+    // vector must be 2 to the power `n`, where `n` is the number of qubits.
+
+    // In this example, we create a kernel builder `sim_builder` that captures
+    // the variable `init_state` by reference.
+    auto sim_builder = cudaq::make_kernel();
+    std::vector<std::complex<double>> init_state;
+    auto q = sim_builder.qalloc(init_state);
+    // Build the quantum circuit template here.
+    sim_builder.mz(q);
+
+    // Now we are ready to instantiate the kernel and invoke it. So we can set
+    // the `init_state` to a vector with 2 complex values (1 qubit) and
+    // get the results.
+    init_state = {{0.0, 1.0}, {1.0, 0.0}};
+    auto counts0 = cudaq::sample(sim_builder);
+
+    // Now suppose we have a different initial state with 4 complex values (2
+    // qubits). Let's rerun the kernel with the new `init_state`.
+    init_state = {{1.0, 0.0}, {0.0, 1.0}, {0.0, 1.0}, {1.0, 0.0}};
+    auto counts1 = cudaq::sample(sim_builder);
+
+    // Finally, in this wholly contrived example, we test the results to make
+    // sure they are "close".
+    if (results_are_close(counts0, counts1)) {
+      printf("The two initial states generated results that are \"close\".\n");
+    }
+  }
+
   {
     // Let's do a final sampling task. Let's
diff --git a/include/cudaq/Frontend/nvqpp/ASTBridge.h b/include/cudaq/Frontend/nvqpp/ASTBridge.h
index a65c3fed95..51a3d8e2e8 100644
--- a/include/cudaq/Frontend/nvqpp/ASTBridge.h
+++ b/include/cudaq/Frontend/nvqpp/ASTBridge.h
@@ -249,13 +249,8 @@ class QuakeBridgeVisitor
   bool TraverseConditionalOperator(clang::ConditionalOperator *x,
                                    DataRecursionQueue *q = nullptr);
   bool VisitReturnStmt(clang::ReturnStmt *x);
-  bool VisitCXXFunctionalCastExpr(clang::CXXFunctionalCastExpr *x) {
-    return true;
-  }
   bool TraverseInitListExpr(clang::InitListExpr *x,
                             DataRecursionQueue *q = nullptr);
-  bool TraverseCXXTemporaryObjectExpr(clang::CXXTemporaryObjectExpr *x,
-                                      DataRecursionQueue *q = nullptr);
 
   // These misc. statements are not (yet) handled by lowering.
   bool TraverseAsmStmt(clang::AsmStmt *x, DataRecursionQueue *q = nullptr);
@@ -287,16 +282,58 @@ class QuakeBridgeVisitor
   bool TraverseCXXConstructExpr(clang::CXXConstructExpr *x,
                                 DataRecursionQueue *q = nullptr);
   bool VisitCXXConstructExpr(clang::CXXConstructExpr *x);
-  bool VisitCXXTemporaryObjectExpr(clang::CXXTemporaryObjectExpr *x);
   bool VisitCXXOperatorCallExpr(clang::CXXOperatorCallExpr *x);
   bool WalkUpFromCXXOperatorCallExpr(clang::CXXOperatorCallExpr *x);
   bool TraverseDeclRefExpr(clang::DeclRefExpr *x,
                            DataRecursionQueue *q = nullptr);
   bool VisitDeclRefExpr(clang::DeclRefExpr *x);
   bool VisitFloatingLiteral(clang::FloatingLiteral *x);
+
+  // Cast operations.
+  bool TraverseCastExpr(clang::CastExpr *x, DataRecursionQueue *q = nullptr);
+  bool VisitCastExpr(clang::CastExpr *x);
+
   bool TraverseImplicitCastExpr(clang::ImplicitCastExpr *x,
-                                DataRecursionQueue *q = nullptr);
-  bool VisitImplicitCastExpr(clang::ImplicitCastExpr *x);
+                                DataRecursionQueue *q = nullptr) {
+    return TraverseCastExpr(x, q);
+  }
+  bool TraverseExplicitCastExpr(clang::ExplicitCastExpr *x,
+                                DataRecursionQueue *q = nullptr) {
+    return TraverseCastExpr(x, q);
+  }
+  bool TraverseCStyleCastExpr(clang::CStyleCastExpr *x,
+                              DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXFunctionalCastExpr(clang::CXXFunctionalCastExpr *x,
+                                     DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXAddrspaceCastExpr(clang::CXXAddrspaceCastExpr *x,
+                                    DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXConstCastExpr(clang::CXXConstCastExpr *x,
+                                DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXDynamicCastExpr(clang::CXXDynamicCastExpr *x,
+                                  DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXReinterpretCastExpr(clang::CXXReinterpretCastExpr *x,
+                                      DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseCXXStaticCastExpr(clang::CXXStaticCastExpr *x,
+                                 DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+  bool TraverseBuiltinBitCastExpr(clang::BuiltinBitCastExpr *x,
+                                  DataRecursionQueue *q = nullptr) {
+    return TraverseExplicitCastExpr(x, q);
+  }
+
   bool VisitInitListExpr(clang::InitListExpr *x);
   bool VisitIntegerLiteral(clang::IntegerLiteral *x);
   bool VisitCXXBoolLiteralExpr(clang::CXXBoolLiteralExpr *x);
@@ -579,7 +616,6 @@ class QuakeBridgeVisitor
   llvm::DenseMap records;
 
   // State Flags
-  bool skipCompoundScope : 1 = false;
   bool isEntry : 1 = false;
 
   /// If there is a catastrophic error in the bridge (there is no rational way
@@ -589,6 +625,7 @@ class QuakeBridgeVisitor
   bool visitImplicitCode : 1 = false;
   bool inRecType : 1 = false;
   bool allowUnknownRecordType : 1 = false;
+  bool initializerIsGlobal : 1 = false;
 };
 
 } // namespace details
diff --git a/include/cudaq/Optimizer/Builder/Factory.h b/include/cudaq/Optimizer/Builder/Factory.h
index 9da117f169..8e07d9418f 100644
--- a/include/cudaq/Optimizer/Builder/Factory.h
+++ b/include/cudaq/Optimizer/Builder/Factory.h
@@ -159,6 +159,12 @@ inline mlir::Value createF64Constant(mlir::Location loc,
   return createFloatConstant(loc, builder, value, builder.getF64Type());
 }
 
+/// Return the integer value if \p v is an integer constant.
+std::optional<std::int64_t> maybeValueOfIntConstant(mlir::Value v);
+
+/// Return the floating point value if \p v is a floating-point constant.
+std::optional<double> maybeValueOfFloatConstant(mlir::Value v);
+
 //===----------------------------------------------------------------------===//
 
 inline mlir::Block *addEntryBlock(mlir::LLVM::GlobalOp initVar) {
diff --git a/include/cudaq/Optimizer/CodeGen/CMakeLists.txt b/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
index 123158afea..5c2c15f8d7 100644
--- a/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
+++ b/include/cudaq/Optimizer/CodeGen/CMakeLists.txt
@@ -6,6 +6,9 @@
 # the terms of the Apache License 2.0 which accompanies this distribution.  #
 # ============================================================================ #
 
+add_cudaq_dialect(CodeGen codegen)
+add_cudaq_dialect_doc(CodeGenDialect codegen)
+
 set(LLVM_TARGET_DEFINITIONS Passes.td)
 mlir_tablegen(Passes.h.inc -gen-pass-decls -name OptCodeGen)
 add_public_tablegen_target(OptCodeGenPassIncGen)
@@ -13,4 +16,3 @@ add_public_tablegen_target(OptCodeGenPassIncGen)
 set(LLVM_TARGET_DEFINITIONS Peephole.td)
 mlir_tablegen(Peephole.inc -gen-rewriters)
 add_public_tablegen_target(OptPeepholeIncGen)
-
diff --git a/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td b/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td
new file mode 100644
index 0000000000..6a0964fc9d
--- /dev/null
+++ b/include/cudaq/Optimizer/CodeGen/CodeGenDialect.td
@@ -0,0 +1,34 @@
+/********************************************************** -*- tablegen -*- ***
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#ifndef CUDAQ_OPTIMIZER_CODEGEN_DIALECT
+#define CUDAQ_OPTIMIZER_CODEGEN_DIALECT
+
+include "mlir/Interfaces/SideEffectInterfaces.td"
+
+//===----------------------------------------------------------------------===//
+// Dialect definition.
+//===----------------------------------------------------------------------===//
+
+def CodeGenDialect : Dialect {
+  let name = "codegen";
+  let summary = "Code generation helpers";
+  let description = [{
+    Do not use this dialect outside of code generation.
+  }];
+
+  let cppNamespace = "cudaq::codegen";
+  let useDefaultTypePrinterParser = 1;
+  let useFoldAPI = kEmitFoldAdaptorFolder;
+
+  let extraClassDeclaration = [{
+    void registerTypes(); // register at least a bogo type.
+  }];
+}
+
+#endif
diff --git a/include/cudaq/Optimizer/CodeGen/CodeGenOps.td b/include/cudaq/Optimizer/CodeGen/CodeGenOps.td
new file mode 100644
index 0000000000..29f0ccbefc
--- /dev/null
+++ b/include/cudaq/Optimizer/CodeGen/CodeGenOps.td
@@ -0,0 +1,47 @@
+/********************************************************** -*- tablegen -*- ***
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#ifndef CUDAQ_OPTIMIZER_CODEGEN_OPS
+#define CUDAQ_OPTIMIZER_CODEGEN_OPS
+
+include "cudaq/Optimizer/CodeGen/CodeGenDialect.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "cudaq/Optimizer/Dialect/Common/Traits.td"
+include "cudaq/Optimizer/Dialect/CC/CCTypes.td"
+include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.td"
+
+//===----------------------------------------------------------------------===//
+// The codegen quake dialect is a transitory set of operations used exclusively
+// during codegen. The ops defined here make the process of converting the
+// Quake code to another target dialect (e.g. LLVM-IR) easier. They should not
+// be used outside of the codegen passes.
+//===----------------------------------------------------------------------===//
+
+class CGQOp<string mnemonic, list<Trait> traits = []>
+    : Op<CodeGenDialect, mnemonic, traits>;
+
+def cgq_RAIIOp : CGQOp<"qmem_raii", [MemoryEffects<[MemAlloc, MemWrite]>]> {
+  let summary = "Combine allocation and initialization of a set of qubits.";
+  let description = [{
+    Used only in QIR code generation.
+  }];
+
+  let arguments = (ins
+    cc_PointerType:$initState,
+    TypeAttr:$allocType,
+    Optional<AnySignlessInteger>:$allocSize
+  );
+  let results = (outs VeqType);
+
+  let assemblyFormat = [{
+    $initState `(` $allocType ( `[` $allocSize^ `]` )? `)` `:`
+    functional-type(operands, results) attr-dict
+  }];
+}
+
+#endif
diff --git a/include/cudaq/Optimizer/CodeGen/CodeGenTypes.td b/include/cudaq/Optimizer/CodeGen/CodeGenTypes.td
new file mode 100644
index 0000000000..e268494f34
--- /dev/null
+++ b/include/cudaq/Optimizer/CodeGen/CodeGenTypes.td
@@ -0,0 +1,34 @@
+/********************************************************** -*- tablegen -*- ***
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#ifndef CUDAQ_OPTIMIZER_CODEGEN_TYPES
+#define CUDAQ_OPTIMIZER_CODEGEN_TYPES
+
+include "cudaq/Optimizer/CodeGen/CodeGenDialect.td"
+include "mlir/Interfaces/DataLayoutInterfaces.td"
+include "mlir/IR/AttrTypeBase.td"
+
+//===----------------------------------------------------------------------===//
+// BaseType
+//===----------------------------------------------------------------------===//
+
+class CodeGenType<string name, string typeMnemonic, list<Trait> traits = [],
+                  string base = "mlir::Type">
+    : TypeDef<CodeGenDialect, name, traits, base> {
+  let mnemonic = typeMnemonic;
+}
+
+// There are no codegen dialect types, but the dialect tablegen generates type
+// boilerplate for the dialect anyway.
+
+def codegen_DoNotUseType : CodeGenType<"DoNotUse", "dummy"> {
+  let summary = "";
+  let description = [{ }];
+}
+
+#endif // CUDAQ_OPTIMIZER_CODEGEN_TYPES
diff --git a/include/cudaq/Optimizer/CodeGen/Passes.h b/include/cudaq/Optimizer/CodeGen/Passes.h
index 7496965ab2..61c297175c 100644
--- a/include/cudaq/Optimizer/CodeGen/Passes.h
+++ b/include/cudaq/Optimizer/CodeGen/Passes.h
@@ -52,6 +52,9 @@ std::unique_ptr createRemoveMeasurementsPass();
 /// Register target pipelines.
 void registerTargetPipelines();
 
+/// Register CodeGenDialect with the provided DialectRegistry.
+void registerCodeGenDialect(mlir::DialectRegistry &registry);
+
 // declarative passes
 #define GEN_PASS_DECL
 #define GEN_PASS_REGISTRATION
diff --git a/include/cudaq/Optimizer/CodeGen/Passes.td b/include/cudaq/Optimizer/CodeGen/Passes.td
index 22930e542b..fa450745b1 100644
--- a/include/cudaq/Optimizer/CodeGen/Passes.td
+++ b/include/cudaq/Optimizer/CodeGen/Passes.td
@@ -19,7 +19,9 @@ def QuakeToQIR : Pass<"quake-to-qir", "mlir::ModuleOp"> {
     QIR qubits.
   }];
 
-  let dependentDialects = ["mlir::LLVM::LLVMDialect"];
+  let dependentDialects = [
+    "cudaq::codegen::CodeGenDialect", "mlir::LLVM::LLVMDialect"
+  ];
   let constructor = "cudaq::opt::createConvertToQIRPass()";
 }
 
diff --git a/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h b/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
index 43b6cf35da..52595c14b2 100644
--- a/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
+++ b/include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
@@ -43,6 +43,10 @@ constexpr static const char QIRArrayGetElementPtr1d[] =
     "__quantum__rt__array_get_element_ptr_1d";
 constexpr static const char QIRArrayQubitAllocateArray[] =
     "__quantum__rt__qubit_allocate_array";
+constexpr static const char QIRArrayQubitAllocateArrayWithStateFP64[] =
+    "__quantum__rt__qubit_allocate_array_with_state_fp64";
+constexpr static const char QIRArrayQubitAllocateArrayWithStateFP32[] =
+    "__quantum__rt__qubit_allocate_array_with_state_fp32";
 constexpr static const char QIRQubitAllocate[] =
     "__quantum__rt__qubit_allocate";
 constexpr static const char QIRArrayQubitReleaseArray[] =
diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
index 6d77169329..dec070384c 100644
--- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td
+++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td
@@ -794,6 +794,35 @@ def cc_AddressOfOp : CCOp<"address_of", [Pure,
   }];
 }
 
+def cc_GlobalOp : CCOp<"global", [IsolatedFromAbove, Symbol]> {
+  let summary = "Create a global constant or variable";
+  let description = [{
+    A GlobalOp is used to create a global variable or constant that can be
+    referenced by symbol using the AddressOfOp. The type of this op is always
+    implicitly a cc::PointerType.
+
+    For example, this op may be used to define arrays of doubles, which may in
+    turn be used as initial state vectors for quantum memory (VeqType).
+  }];
+
+  let arguments = (ins
+    TypeAttr:$global_type,
+    StrAttr:$sym_name,
+    OptionalAttr<AnyAttr>:$value,
+    UnitAttr:$constant,
+    UnitAttr:$external
+  );
+
+  let hasCustomAssemblyFormat = 1;
+
+  let extraClassDeclaration = [{
+    cudaq::cc::PointerType getType() {
+      auto globalTy = getGlobalType();
+      return cudaq::cc::PointerType::get(globalTy);
+    }
+  }];
+}
+
 def cc_ExtractValueOp : CCOp<"extract_value", [Pure]> {
   let summary = "Extract a value from an aggregate value.";
   let description = [{
diff --git a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
index b1e8ea39eb..802be8d2e4 100644
--- a/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
+++ b/include/cudaq/Optimizer/Dialect/Quake/QuakeOps.td
@@ -71,8 +71,12 @@ def quake_AllocaOp : QuakeOp<"alloca", [MemoryEffects<[MemAlloc, MemWrite]>]> {
     See DeallocOp.
   }];
 
-  let arguments = (ins Optional<AnySignlessInteger>:$size);
-  let results = (outs AnyRefType:$ref_or_vec);
+  let arguments = (ins
+    Optional<AnySignlessInteger>:$size
+  );
+  let results = (outs
+    AnyRefType:$ref_or_vec
+  );
 
   let builders = [
     OpBuilder<(ins ), [{
@@ -92,6 +96,16 @@ def quake_AllocaOp : QuakeOp<"alloca", [MemoryEffects<[MemAlloc, MemWrite]>]> {
 
   let hasCanonicalizer = 1;
   let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    bool hasInitializedState() {
+      auto *self = getOperation();
+      return self->hasOneUse() &&
+             mlir::isa<quake::InitializeStateOp>(*self->getUsers().begin());
+    }
+
+    quake::InitializeStateOp getInitializedState();
+  }];
 }
 
 def quake_ConcatOp : QuakeOp<"concat", [Pure]> {
@@ -281,6 +295,31 @@ def quake_ExtractRefOp : QuakeOp<"extract_ref", [Pure]> {
   }];
 }
 
+def quake_InitializeStateOp : QuakeOp<"init_state", [MemoryEffects<[MemAlloc,
+    MemWrite]>]> {
+  let summary = "Initialize the quantum state to a specific complex vector.";
+  let description = [{
+    Given a !cc.ptr pointing to a complex data array of size 2**N, where N is
+    the number of qubits in the targets operand, initialize the state of those
+    target qubits to the provided state vector. This operation returns a new
+    quake.veq instance. There should be no other uses of the input veq value,
+    \em{targets}, that was allocated. This supports RAII (resource acquisition
+    is initialization) semantics on the qubits in the vector.
+  }];
+
+  let arguments = (ins
+    VeqType:$targets,
+    cc_PointerType:$state
+  );
+  let results = (outs VeqType);
+
+  let assemblyFormat = [{
+    $targets `,` $state `:` functional-type(operands, results) attr-dict
+  }];
+
+  let hasVerifier = 1;
+}
+
 def quake_RelaxSizeOp : QuakeOp<"relax_size", [Pure]> {
   let summary = "Relax the constant size on a !veq to be unknown.";
   let description = [{
diff --git a/lib/Frontend/nvqpp/ConvertDecl.cpp b/lib/Frontend/nvqpp/ConvertDecl.cpp
index d099ea1ecc..c456e42ba2 100644
--- a/lib/Frontend/nvqpp/ConvertDecl.cpp
+++ b/lib/Frontend/nvqpp/ConvertDecl.cpp
@@ -178,9 +178,9 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) {
   if (name.equals("basic_string"))
     return pushType(cc::CharspanType::get(ctx));
   if (name.equals("vector")) {
-    auto *cts = cast<clang::ClassTemplateSpecializationDecl>(x);
+    auto *cts = dyn_cast<clang::ClassTemplateSpecializationDecl>(x);
     // Traverse template argument 0 to get the vector's element type.
-    if (!TraverseType(cts->getTemplateArgs()[0].getAsType()))
+    if (!cts || !TraverseType(cts->getTemplateArgs()[0].getAsType()))
       return false;
     return pushType(cc::StdvecType::get(ctx, popType()));
   }
@@ -191,6 +191,22 @@ bool QuakeBridgeVisitor::interceptRecordDecl(clang::RecordDecl *x) {
   }
   if (name.equals("_Bit_type"))
     return pushType(builder.getI64Type());
+  if (name.equals("complex")) {
+    auto *cts = dyn_cast<clang::ClassTemplateSpecializationDecl>(x);
+    // Traverse template argument 0 to get the complex's element type.
+    if (!cts || !TraverseType(cts->getTemplateArgs()[0].getAsType()))
+      return false;
+    auto memTy = popType();
+    return pushType(ComplexType::get(memTy));
+  }
+  if (name.equals("initializer_list")) {
+    auto *cts = dyn_cast<clang::ClassTemplateSpecializationDecl>(x);
+    // Traverse template argument 0, the initializer list's element type.
+    if (!cts || !TraverseType(cts->getTemplateArgs()[0].getAsType()))
+      return false;
+    auto memTy = popType();
+    return pushType(cc::ArrayType::get(memTy));
+  }
   if (name.equals("function")) {
     auto *cts = cast<clang::ClassTemplateSpecializationDecl>(x);
     // Traverse template argument 0 to get the function's signature.
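
For orientation, the qubit-count rule used by the qvector lowering in ConvertExpr.cpp below can be stated directly: a state vector of length 2**n initializes n qubits, and any other length is rejected. A hedged, self-contained C++20 sketch of that arithmetic (the helper name is hypothetical; only the power-of-2 diagnostic text comes from this diff):

    #include <bit>
    #include <cassert>
    #include <complex>
    #include <cstddef>
    #include <vector>

    // Illustrative only: mirrors the checks the bridge performs when lowering
    // cudaq::qvector(std::vector<std::complex<double>>).
    std::size_t qubitsForState(const std::vector<std::complex<double>> &state) {
      const std::size_t len = state.size();
      // The same condition the bridge reports via reportClangError below.
      assert(std::has_single_bit(len) &&
             "state vector must be a power of 2 in length");
      // 2**n amplitudes -> n qubits, i.e. n = countr_zero(len) for a power of 2.
      return static_cast<std::size_t>(std::countr_zero(len));
    }

    // Usage sketch: qubitsForState({{1.0, 0.0}, {0.0, 0.0}}) == 1 (one qubit).
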
diff --git a/lib/Frontend/nvqpp/ConvertExpr.cpp b/lib/Frontend/nvqpp/ConvertExpr.cpp
index 09d534ae4b..ddc05456c3 100644
--- a/lib/Frontend/nvqpp/ConvertExpr.cpp
+++ b/lib/Frontend/nvqpp/ConvertExpr.cpp
@@ -53,8 +53,7 @@ static clang::NamedDecl *getNamedDecl(clang::Expr *expr) {
 static std::pair<SmallVector<Value>, SmallVector<Value>>
 maybeUnpackOperands(OpBuilder &builder, Location loc, ValueRange operands,
                     bool isControl = false) {
-  // If this is not a controlled op, then we just keep all operands
-  // as targets.
+  // If this is not a controlled op, then we just keep all operands as targets.
   if (!isControl)
     return std::make_pair(operands, SmallVector<Value>{});
 
@@ -599,30 +598,30 @@ Value QuakeBridgeVisitor::integerCoercion(Location loc,
   TODO_loc(loc, "Integer conversion but not integer types");
 }
 
-bool QuakeBridgeVisitor::TraverseImplicitCastExpr(clang::ImplicitCastExpr *x,
-                                                  DataRecursionQueue *) {
-  // RecursiveASTVisitor is tuned for dumping surface syntax so doesn't visit
-  // the type. Override so that the casted to type is visited and pushed on the
-  // stack.
+bool QuakeBridgeVisitor::TraverseCastExpr(clang::CastExpr *x,
+                                          DataRecursionQueue *) {
+  // RecursiveASTVisitor is tuned for dumping surface syntax so doesn't
+  // necessarily visit the type. Override so that the casted-to type is visited
+  // and pushed on the stack.
   [[maybe_unused]] auto typeStackDepth = typeStack.size();
   LLVM_DEBUG(llvm::dbgs() << "%% "; x->dump());
   if (!TraverseType(x->getType()))
     return false;
   assert(typeStack.size() == typeStackDepth + 1 && "must push a type");
-  auto result = Base::TraverseImplicitCastExpr(x);
+  for (auto *sub : getStmtChildren(x))
+    if (!TraverseStmt(sub))
+      return false;
+  bool result = WalkUpFromCastExpr(x);
   if (result) {
     assert(typeStack.size() == typeStackDepth && "must be original depth");
   }
   return result;
 }
 
-bool QuakeBridgeVisitor::VisitImplicitCastExpr(clang::ImplicitCastExpr *x) {
+bool QuakeBridgeVisitor::VisitCastExpr(clang::CastExpr *x) {
   // The type to cast the expression into is pushed during the traversal of the
-  // ImplicitCastExpr in non-error cases.
+  // cast expression in non-error cases.
   auto castToTy = popType();
-  if (x->getCastKind() == clang::CastKind::CK_FunctionToPointerDecay)
-    return true; // NOP
-
   auto loc = toLocation(x);
   auto intToIntCast = [&](Location locSub, Value mlirVal) {
     clang::QualType srcTy = x->getSubExpr()->getType();
@@ -654,9 +653,10 @@ bool QuakeBridgeVisitor::VisitImplicitCastExpr(clang::ImplicitCastExpr *x) {
     assert(result && "integer conversion failed");
     return result;
   }
+  case clang::CastKind::CK_FunctionToPointerDecay:
   case clang::CastKind::CK_ArrayToPointerDecay:
-    return true;
   case clang::CastKind::CK_NoOp:
+  case clang::CastKind::CK_ToVoid:
     return true;
   case clang::CastKind::CK_FloatingToIntegral: {
     auto qualTy = x->getType();
@@ -720,6 +720,9 @@ bool QuakeBridgeVisitor::VisitImplicitCastExpr(clang::ImplicitCastExpr *x) {
       if (cxxExpr->getNumArgs() == 1)
         return true;
     }
+    if (isa(castToTy) && isa(peekValue().getType())) {
+      return true;
+    }
     if (auto funcTy = peelPointerFromFunction(castToTy))
       if (auto fromTy = dyn_cast(peekValue().getType())) {
         auto inputs = funcTy.getInputs();
@@ -1082,8 +1085,8 @@ bool QuakeBridgeVisitor::TraverseLambdaExpr(clang::LambdaExpr *x,
 bool QuakeBridgeVisitor::TraverseMemberExpr(clang::MemberExpr *x,
                                             DataRecursionQueue *) {
   if (auto *methodDecl = dyn_cast<clang::CXXMethodDecl>(x->getMemberDecl())) {
-    // For function members, we want to push the type of the function, since the
-    // visit to CallExpr requires a type to have been pushed.
+    // For function members, we want to push the type of the function, since
+    // the visit to CallExpr requires a type to have been pushed.
     [[maybe_unused]] auto typeStackDepth = typeStack.size();
     if (!TraverseType(methodDecl->getType()))
       return false;
@@ -1426,7 +1429,6 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
       } else if (auto load = v.getDefiningOp<cudaq::cc::LoadOp>()) {
         processedArgs.push_back(load.getPtrvalue());
       } else {
-        v.dump();
         reportClangError(x, mangler, "could not determine string argument");
       }
     };
@@ -1576,8 +1578,8 @@ bool QuakeBridgeVisitor::VisitCallExpr(clang::CallExpr *x) {
 
     SymbolRefAttr calleeSymbol;
     auto *ctx = builder.getContext();
 
-    // Expand the negations inline around the quake.apply. This will result
-    // in less duplication of code than threading the negated sense of the
+    // Expand the negations inline around the quake.apply. This will result in
+    // less duplication of code than threading the negated sense of the
     // control recursively through the callable.
     auto inlinedStartControlNegations = [&]() {
       if (!negations.empty()) {
@@ -2044,8 +2046,8 @@ bool QuakeBridgeVisitor::VisitCXXOperatorCallExpr(
     }
     if (typeName == "_Bit_reference" || typeName == "__bit_reference") {
       // For vector<bool>, on the kernel side this is represented as a sequence
-      // of byte-sized boolean values (true and false). On the host side, C++
-      // is likely going to pack the booleans as bits in words.
+      // of byte-sized boolean values (true and false). On the host side, C++ is
+      // likely going to pack the booleans as bits in words.
       auto indexVar = popValue();
       auto svec = popValue();
       assert(svec.getType().isa<cc::StdvecType>());
@@ -2136,22 +2138,6 @@ void QuakeBridgeVisitor::maybeAddCallOperationSignature(clang::Decl *x) {
   }
 }
 
-bool QuakeBridgeVisitor::TraverseCXXTemporaryObjectExpr(
-    clang::CXXTemporaryObjectExpr *x, DataRecursionQueue *) {
-  if (auto *ctor = x->getConstructor())
-    maybeAddCallOperationSignature(ctor);
-  if (!TraverseType(x->getType()))
-    return false;
-  return WalkUpFromCXXTemporaryObjectExpr(x);
-}
-
-bool QuakeBridgeVisitor::VisitCXXTemporaryObjectExpr(
-    clang::CXXTemporaryObjectExpr *x) {
-  // We probably want a distinctive op here instead of just leaving a type.
-  // Really this means allocating at least 1 byte and calling the default ctor.
-  return true;
-}
-
 bool QuakeBridgeVisitor::TraverseInitListExpr(clang::InitListExpr *x,
                                               DataRecursionQueue *) {
   if (x->isSyntacticForm()) {
@@ -2228,12 +2214,12 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) {
     return pushValue(last[0]);
   }
 
-  // These initializer expressions are not quantum references. In this case,
-  // we allocate some memory for a variable and store the init list elements
-  // there. Add the array size value
+  // These initializer expressions are not quantum references. In this case, we
+  // allocate some memory for a variable and store the init list elements there.
   auto structTy = dyn_cast<cc::StructType>(initListTy);
   std::int32_t structMems = structTy ? structTy.getMembers().size() : 0;
   std::int32_t numEles = structMems ? size / structMems : size;
+  // Generate the array size value.
   Value arrSize = builder.create<arith::ConstantIntOp>(loc, numEles, 64);
 
   // Allocate the required memory chunk.
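
The hunk that follows teaches VisitInitListExpr to fold fully constant initializer lists into module-level constants instead of stack allocations. A hedged sketch of the intended effect, with the IR spelling approximated from the cc.global printer defined later in this diff (the kernel fragment is hypothetical):

    // Hypothetical kernel fragment:
    //   cudaq::qvector q({0.5, 0.5, 0.5, 0.5});
    //
    // Before: a cc.alloca of !cc.array<f64 x 4> plus one store per element.
    // After (roughly), at module scope:
    //   cc.global constant @__nvqpp__rodata_init_0 (dense<[0.5, 0.5, 0.5, 0.5]>
    //       : tensor<4xf64>) : !cc.array<f64 x 4>
    // and inside the kernel body:
    //   %0 = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
    //
    // The rewrite is guarded twice: `initializerIsGlobal` must have been set by
    // TraverseCXXConstructExpr, and every element must fold to a constant via
    // factory::maybeValueOfFloatConstant.
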
@@ -2242,6 +2228,40 @@ bool QuakeBridgeVisitor::VisitInitListExpr(clang::InitListExpr *x) {
       return arrTy.getElementType();
     return initListTy;
   }();
+
+  bool lastValuesAreConstant = [&]() {
+    for (Value v : last)
+      if (!opt::factory::maybeValueOfFloatConstant(v))
+        return false;
+    return true;
+  }();
+
+  if (initializerIsGlobal && lastValuesAreConstant) {
+    static unsigned counter = 0;
+    auto *ctx = builder.getContext();
+    auto globalTy = cc::ArrayType::get(ctx, eleTy, size);
+    SmallVector<Attribute> values;
+    auto f64Ty = builder.getF64Type();
+    for (Value v : last) {
+      auto fp = opt::factory::maybeValueOfFloatConstant(v);
+      values.push_back(FloatAttr::get(f64Ty, *fp));
+    }
+    // NB: Unfortunately, the LLVM-IR dialect doesn't lower DenseF64ArrayAttr
+    // to LLVM IR without throwing errors.
+    auto tensorTy = RankedTensorType::get(size, eleTy);
+    auto f64Attr = DenseElementsAttr::get(tensorTy, values);
+    // Create a unique name.
+    std::string name = "__nvqpp__rodata_init_" + std::to_string(counter++);
+    {
+      OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPointToEnd(module.getBody());
+      builder.create<cc::GlobalOp>(loc, globalTy, name, f64Attr,
+                                   /*constant=*/true);
+    }
+    auto ptrTy = cc::PointerType::get(globalTy);
+    auto globalInit = builder.create<cc::AddressOfOp>(loc, ptrTy, name);
+    return pushValue(globalInit);
+  }
   Value alloca = (numEles > 1)
                      ? builder.create<cc::AllocaOp>(loc, eleTy, arrSize)
                      : builder.create<cc::AllocaOp>(loc, eleTy);
@@ -2283,12 +2303,25 @@ bool QuakeBridgeVisitor::TraverseCXXConstructExpr(clang::CXXConstructExpr *x,
   if (x->isElidable())
     return true;
 
   [[maybe_unused]] auto typeStackDepth = typeStack.size();
+  bool saveInitializerIsGlobal = initializerIsGlobal;
   if (x->getConstructor()) {
     if (!TraverseType(x->getType()))
       return false;
     assert(typeStack.size() == typeStackDepth + 1);
+    if (x->isStdInitListInitialization() && isa<cc::ArrayType>(peekType()))
+      initializerIsGlobal = true;
   }
+  auto *ctor = x->getConstructor();
+  // FIXME: this implicit code visit setting is a hack to only visit a default
+  // argument value when constructing a complex value. We should always be able
+  // to visit default arguments, but we currently trip over default allocators,
+  // etc.
+  bool saveVisitImplicitCode = visitImplicitCode;
+  if (isInClassInNamespace(ctor, "complex", "std"))
+    visitImplicitCode = true;
   auto result = Base::TraverseCXXConstructExpr(x);
+  visitImplicitCode = saveVisitImplicitCode;
+  initializerIsGlobal = saveInitializerIsGlobal;
   assert(typeStack.size() == typeStackDepth || raisedError);
   return result;
 }
@@ -2301,7 +2334,8 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
   }
   // The ctor type is the class for which the ctor is a member.
   auto ctorTy = popType();
 
-  auto ctorName = ctor->getNameAsString();
+  // FIXME: not every constructor has a name.
+  std::string ctorName = ctor->getNameAsString();
   if (isInNamespace(ctor, "cudaq")) {
     if (x->getNumArgs() == 0) {
       if (ctorName == "qudit") {
@@ -2323,17 +2357,81 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
         return pushValue(builder.create<quake::AllocaOp>(loc, veq1Ty));
       }
     } else if (x->getNumArgs() == 1) {
-      if (ctorName == "qreg" || ctorName == "qvector") {
+      if (ctorName == "qreg") {
         // This is a cudaq::qreg(std::size_t).
         auto sizeVal = popValue();
         assert(isa<IntegerType>(sizeVal.getType()));
         return pushValue(builder.create<quake::AllocaOp>(
             loc, quake::VeqType::getUnsized(builder.getContext()), sizeVal));
       }
+      if (ctorName == "qudit") {
+        auto initials = popValue();
+        bool ok = false;
+        if (auto ptrTy = dyn_cast<cc::PointerType>(initials.getType()))
+          if (auto arrTy = dyn_cast<cc::ArrayType>(ptrTy.getElementType()))
+            ok = isa<ComplexType>(arrTy.getElementType());
+        if (!ok) {
+          // Invalid initializer ignored, but emit an error.
+          reportClangError(x, mangler, "invalid qudit initial value");
+          return pushValue(builder.create<quake::AllocaOp>(loc));
+        }
+        auto *ctx = builder.getContext();
+        auto veqTy = quake::VeqType::get(ctx, 1);
+        auto alloc = builder.create<quake::AllocaOp>(loc, veqTy);
+        auto init = builder.create<quake::InitializeStateOp>(loc, veqTy, alloc,
+                                                             initials);
+        return pushValue(builder.create<quake::ExtractRefOp>(loc, init, 0));
+      }
+      if (ctorName == "qvector") {
+        auto initials = popValue();
+        auto *ctx = builder.getContext();
+        if (isa<IntegerType>(initials.getType())) {
+          // This is the cudaq::qvector(std::size_t) ctor.
+          return pushValue(builder.create<quake::AllocaOp>(
+              loc, quake::VeqType::getUnsized(ctx), initials));
+        }
+        // Otherwise, it is the cudaq::qvector(std::vector<complex>) ctor.
+        Value numQubits;
+        Type initialsTy = initials.getType();
+        if (auto ptrTy = dyn_cast<cc::PointerType>(initialsTy)) {
+          if (auto arrTy = dyn_cast<cc::ArrayType>(ptrTy.getElementType())) {
+            if (arrTy.isUnknownSize()) {
+              if (auto allocOp = initials.getDefiningOp<cc::AllocaOp>())
+                if (auto size = allocOp.getSeqSize())
+                  numQubits =
+                      builder.create<math::CountTrailingZerosOp>(loc, size);
+            } else {
+              std::size_t arraySize = arrTy.getSize();
+              if (!std::has_single_bit(arraySize)) {
+                reportClangError(x, mangler,
+                                 "state vector must be a power of 2 in length");
+              }
+              numQubits = builder.create<arith::ConstantIntOp>(
+                  loc, std::countr_zero(arraySize), 64);
+            }
+          }
+        } else if (auto stdvecTy = dyn_cast<cc::StdvecType>(initialsTy)) {
+          Value vecLen = builder.create<cc::StdvecSizeOp>(
+              loc, builder.getI64Type(), initials);
+          numQubits = builder.create<math::CountTrailingZerosOp>(loc, vecLen);
+          auto ptrTy = cc::PointerType::get(stdvecTy.getElementType());
+          initials = builder.create<cc::StdvecDataOp>(loc, ptrTy, initials);
+        }
+        if (!numQubits) {
+          reportClangError(
+              x, mangler,
+              "internal error: could not determine the number of qubits");
+          return false;
+        }
+        auto veqTy = quake::VeqType::getUnsized(ctx);
+        auto alloc = builder.create<quake::AllocaOp>(loc, veqTy, numQubits);
+        return pushValue(builder.create<quake::InitializeStateOp>(
+            loc, veqTy, alloc, initials));
+      }
       if ((ctorName == "qspan" || ctorName == "qview") &&
           isa<quake::VeqType>(peekValue().getType())) {
-        // One of the qspan ctors, which effectively just makes a copy. Here
-        // we omit making a copy and just forward the veq argument.
+        // One of the qspan ctors, which effectively just makes a copy. Here we
+        // omit making a copy and just forward the veq argument.
         assert(isa<quake::VeqType>(ctorTy));
         return true;
       }
@@ -2352,6 +2450,12 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
     }();
     if (isVectorOfQubitRefs)
       return true;
+    if (ctorName == "complex") {
+      Value imag = popValue();
+      Value real = popValue();
+      return pushValue(builder.create<complex::CreateOp>(
+          loc, ComplexType::get(real.getType()), real, imag));
+    }
     if (ctorName == "function") {
       // Are we converting a lambda expr to a std::function?
       auto backVal = peekValue();
@@ -2408,15 +2512,14 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
       return true;
     }
 
-    // We check for vector constructors with 2 args, the first
-    // could be an initializer_list or an integer, while the
-    // second is the allocator
+    // We check for vector constructors with 2 args, the first could be an
+    // initializer_list or an integer, while the second is the allocator.
     if (ctorName == "vector") {
       if (x->getNumArgs() == 2) {
-        // This is a std::vector constructor, first we'll check if it
-        // is constructed from a constant initializer list, in that case
-        // we'll have a AllocaOp at the top of the stack that allocates a
-        // ptr>, where C is constant / known
+        // This is a std::vector constructor; first we'll check if it is
+        // constructed from a constant initializer list. In that case we'll
+        // have an AllocaOp at the top of the stack that allocates a
+        // ptr<array<T x C>>, where C is constant / known.
         auto desugared = x->getArg(0)->getType().getCanonicalType();
         if (auto recordType =
                 dyn_cast<clang::RecordType>(desugared.getTypePtr()))
                            allocation, definingOp.getSeqSize()));
 
-        // Next check if its created from a size integer
-        // Let's do a check on the first argument, make sure that when
-        // we peel off all the typedefs that it is an integer
+        // Next check if it's created from a size integer. Let's do a check on
+        // the first argument, make sure that when we peel off all the typedefs
+        // that it is an integer.
         if (auto builtInType =
                 dyn_cast<clang::BuiltinType>(desugared.getTypePtr()))
           if (builtInType->isInteger() &&
            return ctorTy;
          }();
 
-          // create stdvec init op without a buffer.
-          // Allocate the required memory chunk
+          // Create stdvec init op without a buffer. Allocate the required
+          // memory chunk.
           Value alloca = builder.create<cc::AllocaOp>(loc, eleTy, arrSize);
 
           // Create the stdvec_init op
           return pushValue(builder.create<cc::StdvecInitOp>(
               loc, cc::StdvecType::get(eleTy), alloca, arrSize));
         }
       }
-      // Disallow any default vector construction bc we don't
-      // want any .push_back
+      // Disallow any default vector construction bc we don't want any
+      // .push_back.
       if (ctor->isDefaultConstructor())
         reportClangError(ctor, mangler,
                          "Default std::vector constructor within quantum "
@@ -2468,15 +2571,15 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
 
     auto *parent = ctor->getParent();
     if (ctor->isCopyConstructor() && parent->isLambda()) {
-      // Copy-ctor on a lambda. For now on the QPU device side, we do not
-      // make a copy of a lambda. Any capture data will be marshalled at
-      // runtime and passed as ordinary arguments via lambda lifting.
+      // Copy-ctor on a lambda. For now on the QPU device side, we do not make a
+      // copy of a lambda. Any capture data will be marshalled at runtime and
+      // passed as ordinary arguments via lambda lifting.
       return true;
     }
 
     if (ctor->isCopyOrMoveConstructor() && parent->isPOD()) {
-      // Copy or move constructor on a POD struct. The value stack should
-      // contain the object to load the value from.
+      // Copy or move constructor on a POD struct. The value stack should contain
+      // the object to load the value from.
       auto fromStruct = popValue();
       assert(isa<cc::StructType>(ctorTy) && "POD must be a struct type");
       return pushValue(builder.create<cc::LoadOp>(loc, fromStruct));
@@ -2494,11 +2597,10 @@ bool QuakeBridgeVisitor::VisitCXXConstructExpr(clang::CXXConstructExpr *x) {
   //    one byte.
   // 2) Allocate a new object.
   // 3) Call the constructor passing the address of the allocation as `this`.
-  auto mem = builder.create<cc::AllocaOp>(loc, ctorTy);
-  // FIXME: Using Ctor_Complete for mangled name generation blindly here.
-  // Is there a programmatic way of determining which enum to use from the
-  // AST?
+
+  // FIXME: Using Ctor_Complete for mangled name generation blindly here. Is
+  // there a programmatic way of determining which enum to use from the AST?
   auto mangledName =
       cxxMangledDeclName(clang::GlobalDecl{ctor, clang::Ctor_Complete});
   auto funcTy =
diff --git a/lib/Frontend/nvqpp/ConvertType.cpp b/lib/Frontend/nvqpp/ConvertType.cpp
index 7db5bd6398..46488590a7 100644
--- a/lib/Frontend/nvqpp/ConvertType.cpp
+++ b/lib/Frontend/nvqpp/ConvertType.cpp
@@ -11,12 +11,16 @@
 #include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
 #include "cudaq/Todo.h"
+#include "clang/Basic/TargetInfo.h"
+#include "llvm/TargetParser/Triple.h"
 
 #define DEBUG_TYPE "lower-ast-type"
 
 using namespace mlir;
 
-static bool isArithmeticType(Type t) { return isa<IntegerType, FloatType>(t); }
+static bool isArithmeticType(Type t) {
+  return isa<IntegerType, FloatType, ComplexType>(t);
+}
 
 /// Is \p t a quantum reference type. In the bridge, quantum types are always
 /// reference types.
@@ -321,9 +325,15 @@ Type QuakeBridgeVisitor::builtinTypeToType(const clang::BuiltinType *t) {
     return builder.getF32Type();
   case BuiltinType::Double:
     return builder.getF64Type();
-  case BuiltinType::LongDouble:
-    return astContext->getTypeSize(t) == 64 ? builder.getF64Type()
-                                            : builder.getF128Type();
+  case BuiltinType::LongDouble: {
+    auto bitWidth = astContext->getTargetInfo().getLongDoubleWidth();
+    if (bitWidth == 64)
+      return builder.getF64Type();
+    llvm::Triple triple(astContext->getTargetInfo().getTargetOpts().Triple);
+    if (triple.isX86())
+      return builder.getF80Type();
+    return builder.getF128Type();
+  }
   case BuiltinType::Float128:
   case BuiltinType::Ibm128: /* double double format -> {double, double} */
     return builder.getF128Type();
diff --git a/lib/Optimizer/Builder/Factory.cpp b/lib/Optimizer/Builder/Factory.cpp
index f599113781..7980537f51 100644
--- a/lib/Optimizer/Builder/Factory.cpp
+++ b/lib/Optimizer/Builder/Factory.cpp
@@ -11,6 +11,7 @@
 #include "cudaq/Optimizer/Dialect/CC/CCOps.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeOps.h"
 #include "llvm/TargetParser/Triple.h"
+#include "mlir/IR/Matchers.h"
 
 using namespace mlir;
 
@@ -151,6 +152,20 @@ func::FuncOp factory::createFunction(StringRef name, ArrayRef retTypes,
   return func;
 }
 
+std::optional<std::int64_t> factory::maybeValueOfIntConstant(Value v) {
+  APInt cst;
+  if (matchPattern(v, m_ConstantInt(&cst)))
+    return {cst.getZExtValue()};
+  return std::nullopt;
+}
+
+std::optional<double> factory::maybeValueOfFloatConstant(Value v) {
+  APFloat cst(0.0);
+  if (matchPattern(v, m_ConstantFloat(&cst)))
+    return {cst.convertToDouble()};
+  return std::nullopt;
+}
+
 void factory::createGlobalCtorCall(ModuleOp mod, FlatSymbolRefAttr ctor) {
   auto *ctx = mod.getContext();
   auto loc = mod.getLoc();
diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index f07c0396c7..60bf6619c0 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -92,6 +92,23 @@ static constexpr IntrinsicCode intrinsicTable[] = {
     return %9 : !cc.struct<{!cc.ptr<i8>, i64}>
   })#"},
 
+    {"__nvqpp_getStateVectorData_fp32", {}, R"#(
+  func.func private @__nvqpp_getStateVectorData_fp32(%p : i64, %o : i64) -> !cc.ptr<complex<f32>>)#"},
+    {"__nvqpp_getStateVectorData_fp64", {}, R"#(
+  func.func private @__nvqpp_getStateVectorData_fp64(%p : i64, %o : i64) -> !cc.ptr<complex<f64>>)#"},
+
+    {"__nvqpp_getStateVectorLength_fp32",
+     {},
+     R"#(
+  func.func private @__nvqpp_getStateVectorLength_fp32(%p : i64, %o : i64) -> i64)#"},
+    {"__nvqpp_getStateVectorLength_fp64",
+     {},
+     R"#(
+  func.func private @__nvqpp_getStateVectorLength_fp64(%p : i64, %o : i64) -> i64)#"},
+
     // __nvqpp_initializer_list_to_vector_bool
     {cudaq::stdvecBoolCtorFromInitList,
      {},
diff --git a/lib/Optimizer/CodeGen/CMakeLists.txt b/lib/Optimizer/CodeGen/CMakeLists.txt
index 701881cece..5c44565903 100644
--- a/lib/Optimizer/CodeGen/CMakeLists.txt
+++ b/lib/Optimizer/CodeGen/CMakeLists.txt
@@ -11,6 +11,9 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 endif()
 
 add_cudaq_library(OptCodeGen
+  CodeGenDialect.cpp
+  CodeGenOps.cpp
+  CodeGenTypes.cpp
   LowerToQIRProfile.cpp
   LowerToQIR.cpp
   Passes.cpp
@@ -19,6 +22,9 @@ add_cudaq_library(OptCodeGen
 
   DEPENDS
     CCDialect
+    CodeGenDialectIncGen
+    CodeGenOpsIncGen
+    CodeGenTypesIncGen
     OptCodeGenPassIncGen
     OptPeepholeIncGen
     OptTransformsPassIncGen
diff --git a/lib/Optimizer/CodeGen/CodeGenDialect.cpp b/lib/Optimizer/CodeGen/CodeGenDialect.cpp
new file mode 100644
index 0000000000..1689e1eaf3
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenDialect.cpp
@@ -0,0 +1,27 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "CodeGenDialect.h"
+#include "CodeGenOps.h"
+#include "mlir/IR/DialectImplementation.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+
+void cudaq::codegen::CodeGenDialect::initialize() {
+  registerTypes();
+  addOperations<
+#define GET_OP_LIST
+#include "cudaq/Optimizer/CodeGen/CodeGenOps.cpp.inc"
+      >();
+}
diff --git a/runtime/host_config.h b/lib/Optimizer/CodeGen/CodeGenDialect.h
similarity index 60%
rename from runtime/host_config.h
rename to lib/Optimizer/CodeGen/CodeGenDialect.h
index b930c834ad..a61d3db6bf 100644
--- a/runtime/host_config.h
+++ b/lib/Optimizer/CodeGen/CodeGenDialect.h
@@ -1,5 +1,5 @@
 /****************************************************************-*- C++ -*-****
- * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                  *
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
  * All rights reserved.                                                        *
 *                                                                            *
 * This source code and the accompanying materials are made available under  *
 * the terms of the Apache License 2.0 which accompanies this distribution.  *
 ******************************************************************************/
@@ -8,5 +8,10 @@
 
 #pragma once
 
-#define CUDAQ_USE_STD20 (__cplusplus >= 202002L)
-#define CUDAQ_APPLE_CLANG (defined(__apple_build_version__))
+#include "mlir/IR/Dialect.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#include "cudaq/Optimizer/CodeGen/CodeGenDialect.h.inc"
diff --git a/lib/Optimizer/CodeGen/CodeGenOps.cpp b/lib/Optimizer/CodeGen/CodeGenOps.cpp
new file mode 100644
index 0000000000..502657ecf1
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenOps.cpp
@@ -0,0 +1,22 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "CodeGenOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "cudaq/Optimizer/CodeGen/CodeGenOps.cpp.inc"
diff --git a/lib/Optimizer/CodeGen/CodeGenOps.h b/lib/Optimizer/CodeGen/CodeGenOps.h
new file mode 100644
index 0000000000..63d1a8154f
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenOps.h
@@ -0,0 +1,23 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
+#include "cudaq/Optimizer/Dialect/Common/Traits.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/OpImplementation.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "cudaq/Optimizer/CodeGen/CodeGenOps.h.inc"
diff --git a/lib/Optimizer/CodeGen/CodeGenTypes.cpp b/lib/Optimizer/CodeGen/CodeGenTypes.cpp
new file mode 100644
index 0000000000..94bc9e71b6
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenTypes.cpp
@@ -0,0 +1,24 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "CodeGenTypes.h"
+#include "CodeGenDialect.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#define GET_TYPEDEF_CLASSES
+#include "cudaq/Optimizer/CodeGen/CodeGenTypes.cpp.inc"
+
+void cudaq::codegen::CodeGenDialect::registerTypes() {
+  addTypes<DoNotUseType>();
+}
diff --git a/lib/Optimizer/CodeGen/CodeGenTypes.h b/lib/Optimizer/CodeGen/CodeGenTypes.h
new file mode 100644
index 0000000000..ab6f49d033
--- /dev/null
+++ b/lib/Optimizer/CodeGen/CodeGenTypes.h
@@ -0,0 +1,20 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
+
+//===----------------------------------------------------------------------===//
+// Generated logic
+//===----------------------------------------------------------------------===//
+
+#define GET_TYPEDEF_CLASSES
+#include "cudaq/Optimizer/CodeGen/CodeGenTypes.h.inc"
diff --git a/lib/Optimizer/CodeGen/LowerToQIR.cpp b/lib/Optimizer/CodeGen/LowerToQIR.cpp
index 2f7e70e88e..02270ca40c 100644
--- a/lib/Optimizer/CodeGen/LowerToQIR.cpp
+++ b/lib/Optimizer/CodeGen/LowerToQIR.cpp
@@ -16,6 +16,7 @@
 #pragma GCC diagnostic pop
 #endif
 
+#include "CodeGenOps.h"
 #include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/CodeGen/Peephole.h"
@@ -39,6 +40,9 @@
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "lower-to-qir"
 
 using namespace mlir;
 
@@ -81,6 +85,12 @@ class AllocaOpRewrite : public ConvertOpToLLVMPattern {
       return success();
     }
 
+    if (alloca.hasInitializedState()) {
+      // If the allocation has an initialized state, the sub-graph should
+      // already have been folded. Seeing it here is an error.
+      return alloca.emitOpError("initialize state must be folded");
+    }
+
     // Create a QIR call to allocate the qubits.
     StringRef qir_qubit_array_allocate = cudaq::opt::QIRArrayQubitAllocateArray;
    auto array_qbit_type = cudaq::opt::getArrayType(context);
@@ -111,6 +121,78 @@ class AllocaOpRewrite : public ConvertOpToLLVMPattern {
   }
 };
 
+// Lower quake.init_state to a QIR function to allocate the
+// qubits with the provided state vector.
+class QmemRAIIOpRewrite
+    : public ConvertOpToLLVMPattern<cudaq::codegen::RAIIOp> {
+public:
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(cudaq::codegen::RAIIOp raii, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = raii->getLoc();
+    auto parentModule = raii->getParentOfType<ModuleOp>();
+    auto array_qbit_type = cudaq::opt::getArrayType(rewriter.getContext());
+
+    // Get the CC pointer for the state.
+    auto ccState = adaptor.getInitState();
+
+    // Inspect the element type of the complex data; we need to know whether
+    // it is f32 or f64.
+    Type eleTy = rewriter.getF64Type();
+    if (auto llvmPtrTy = dyn_cast<LLVM::LLVMPointerType>(ccState.getType())) {
+      auto ptrEleTy = llvmPtrTy.getElementType();
+      if (auto llvmStructTy = dyn_cast<LLVM::LLVMStructType>(ptrEleTy))
+        if (llvmStructTy.getBody().size() == 2 &&
+            llvmStructTy.getBody()[0] == llvmStructTy.getBody()[1] &&
+            llvmStructTy.getBody()[0].isF32())
+          eleTy = rewriter.getF32Type();
+    }
+
+    if (!isa<FloatType>(eleTy))
+      return raii.emitOpError("invalid type on initialize state operation, "
+                              "must be complex floating point.");
+
+    // Get the size of the qubit register.
+    Type allocTy = adaptor.getAllocType();
+    auto allocSize = adaptor.getAllocSize();
+    Value sizeOperand;
+    auto i64Ty = rewriter.getI64Type();
+    if (allocSize) {
+      sizeOperand = allocSize;
+      auto sizeTy = cast<IntegerType>(sizeOperand.getType());
+      if (sizeTy.getWidth() < 64)
+        sizeOperand = rewriter.create<LLVM::ZExtOp>(loc, i64Ty, sizeOperand);
+      else if (sizeTy.getWidth() > 64)
+        sizeOperand = rewriter.create<LLVM::TruncOp>(loc, i64Ty, sizeOperand);
+    } else {
+      auto type = cast<quake::VeqType>(allocTy);
+      auto constantSize = type.getSize();
+      sizeOperand =
+          rewriter.create<arith::ConstantIntOp>(loc, constantSize, 64);
+    }
+
+    // Create the QIR allocation-with-initializer function.
+    auto *ctx = rewriter.getContext();
+    auto ptrTy = cudaq::opt::factory::getPointerType(ctx);
+    StringRef functionName =
+        eleTy.isF64() ? cudaq::opt::QIRArrayQubitAllocateArrayWithStateFP64
+                      : cudaq::opt::QIRArrayQubitAllocateArrayWithStateFP32;
+    FlatSymbolRefAttr raiiSymbolRef =
+        cudaq::opt::factory::createLLVMFunctionSymbol(
+            functionName, array_qbit_type, {i64Ty, ptrTy}, parentModule);
+
+    // Call the allocation function.
+    Value castedInitState =
+        rewriter.create<LLVM::BitcastOp>(loc, ptrTy, ccState);
+    rewriter.replaceOpWithNewOp<LLVM::CallOp>(
+        raii, array_qbit_type, raiiSymbolRef,
+        ArrayRef<Value>{sizeOperand, castedInitState});
+    return success();
+  }
+};
+
 /// Lower Quake Dealloc Ops to QIR function calls.
 class DeallocOpRewrite : public ConvertOpToLLVMPattern<quake::DeallocOp> {
 public:
@@ -139,9 +221,9 @@ class DeallocOpRewrite : public ConvertOpToLLVMPattern {
         cudaq::opt::factory::createLLVMFunctionSymbol(
             qirQuantumDeallocateFunc, retType, {operandType}, parentModule);
 
-    rewriter.replaceOpWithNewOp<LLVM::CallOp>(
-        dealloc, ArrayRef<Type>({}), deallocSymbolRef,
-        adaptor.getOperands().front());
+    rewriter.replaceOpWithNewOp<LLVM::CallOp>(dealloc, ArrayRef<Type>({}),
                                              deallocSymbolRef,
                                              adaptor.getOperands().front());
     return success();
   }
 };
@@ -976,6 +1058,22 @@ class GetVeqSizeOpRewrite : public OpConversionPattern {
 // Conversion patterns for CC dialect ops.
 //===----------------------------------------------------------------------===//
 
+class AddressOfOpPattern
+    : public ConvertOpToLLVMPattern<cudaq::cc::AddressOfOp> {
+public:
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  // One-to-one conversion to llvm.addressof op.
+  LogicalResult
+  matchAndRewrite(cudaq::cc::AddressOfOp addr, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Type type = getTypeConverter()->convertType(addr.getType());
+    auto name = addr.getGlobalName();
+    rewriter.replaceOpWithNewOp<LLVM::AddressOfOp>(addr, type, name);
+    return success();
+  }
+};
+
 class AllocaOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::AllocaOp> {
 public:
   using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
@@ -1264,6 +1362,30 @@ class FuncToPtrOpPattern
   }
 };
 
+class GlobalOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::GlobalOp> {
+public:
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  // Replace the cc.global with an llvm.global, updating the types, etc.
+  LogicalResult
+  matchAndRewrite(cudaq::cc::GlobalOp global, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto loc = global.getLoc();
+    auto ptrTy = cast<cudaq::cc::PointerType>(global.getType());
+    auto eleTy = ptrTy.getElementType();
+    Type type = getTypeConverter()->convertType(eleTy);
+    auto name = global.getSymName();
+    bool isReadOnly = global.getConstant();
+    Attribute initializer = global.getValue().value_or(Attribute{});
+    auto linkage =
+        global.getExternal() ? LLVM::Linkage::Linkonce : LLVM::Linkage::Private;
+    rewriter.create<LLVM::GlobalOp>(loc, type, isReadOnly, linkage, name,
+                                    initializer, /*alignment=*/0);
+    rewriter.eraseOp(global);
+    return success();
+  }
+};
+
 class InsertValueOpPattern
     : public ConvertOpToLLVMPattern<cudaq::cc::InsertValueOp> {
 public:
@@ -1563,8 +1685,8 @@ class ReturnBitRewrite : public OpConversionPattern {
   }
 };
 
-/// In case we still have a RelaxSizeOp, we can just remove it,
-/// since QIR works on Array * for all sized veqs.
+/// In case we still have a RelaxSizeOp, we can just remove it, since QIR works
+/// on `Array*` for all sized veqs.
 class RemoveRelaxSizeRewrite : public OpConversionPattern<quake::RelaxSizeOp> {
 public:
   using OpConversionPattern::OpConversionPattern;
@@ -1577,6 +1699,24 @@ class RemoveRelaxSizeRewrite : public OpConversionPattern {
   }
 };
 
+class CodeGenRAIIPattern : public OpRewritePattern<quake::InitializeStateOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::InitializeStateOp init,
+                                PatternRewriter &rewriter) const override {
+    Value mem = init.getTargets();
+    auto alloc = mem.getDefiningOp<quake::AllocaOp>();
+    if (!alloc)
+      return init.emitOpError("init_state must have alloca as input");
+    rewriter.replaceOpWithNewOp<cudaq::codegen::RAIIOp>(
+        init, init.getType(), init.getState(), alloc.getType(),
+        alloc.getSize());
+    rewriter.eraseOp(alloc);
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // Code generation: converts the Quake IR to QIR.
 //===----------------------------------------------------------------------===//
@@ -1650,7 +1790,20 @@ class QuakeToQIRRewrite : public cudaq::opt::QuakeToQIRBase {
     return ok ? success() : failure();
   }
 
+  /// Greedy pass to match subgraphs in the IR and replace them with codegen
+  /// ops. This step makes converting a DAG of nodes in the conversion step
+  /// simpler.
+  void fuseSubgraphPatterns() {
+    auto *ctx = &getContext();
+    RewritePatternSet patterns(ctx);
+    patterns.insert<CodeGenRAIIPattern>(ctx);
+    if (failed(applyPatternsAndFoldGreedily(getModule(), std::move(patterns))))
+      signalPassFailure();
+  }
+
   void runOnOperation() override final {
+    fuseSubgraphPatterns();
+
     auto *context = &getContext();
 
     // Ad hoc deal with ConstantArrayOp transformation.
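
Taken together, the patterns above implement a two-step lowering for state-initialized qubit allocations. A hedged sketch of the flow (IR spelling approximate, derived from the assemblyFormat declarations earlier in this diff):

    // Step 1: the greedy pre-pass (CodeGenRAIIPattern) fuses the two-op DAG
    //   %v = quake.alloca !quake.veq<2>
    //   %w = quake.init_state %v, %p : (!quake.veq<2>, !cc.ptr<...>) -> !quake.veq<2>
    // into the transitory codegen op
    //   %w = codegen.qmem_raii %p (!quake.veq<2>) : (!cc.ptr<...>) -> !quake.veq<2>
    //
    // Step 2: dialect conversion (QmemRAIIOpRewrite) lowers that op 1:1 to a
    // call to __quantum__rt__qubit_allocate_array_with_state_fp64 (or _fp32,
    // chosen by inspecting the complex element type). This is why
    // AllocaOpRewrite now treats a surviving initialized alloca as an error:
    // the fusion should already have consumed it.
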
@@ -1680,14 +1833,15 @@ class QuakeToQIRRewrite : public cudaq::opt::QuakeToQIRBase {
     patterns.insert(context);
+
     patterns.insert<
-        AllocaOpRewrite, AllocaOpPattern, CallableClosureOpPattern,
-        CallableFuncOpPattern, CallCallableOpPattern, CastOpPattern,
-        ComputePtrOpPattern, ConcatOpRewrite, DeallocOpRewrite,
+        AddressOfOpPattern, AllocaOpRewrite, AllocaOpPattern,
+        CallableClosureOpPattern, CallableFuncOpPattern, CallCallableOpPattern,
+        CastOpPattern, ComputePtrOpPattern, ConcatOpRewrite, DeallocOpRewrite,
         CreateStringLiteralOpPattern, DiscriminateOpPattern,
         ExtractQubitOpRewrite, ExtractValueOpPattern, FuncToPtrOpPattern,
-        InsertValueOpPattern, InstantiateCallableOpPattern, LoadOpPattern,
-        ExpPauliRewrite, OneTargetRewrite<quake::HOp>,
+        GlobalOpPattern, InsertValueOpPattern, InstantiateCallableOpPattern,
+        LoadOpPattern, ExpPauliRewrite, OneTargetRewrite<quake::HOp>,
         OneTargetRewrite<quake::SOp>, OneTargetRewrite<quake::TOp>,
         OneTargetRewrite<quake::XOp>, OneTargetRewrite<quake::YOp>,
         OneTargetRewrite<quake::ZOp>, OneTargetOneParamRewrite<quake::R1Op>,
         OneTargetOneParamRewrite<quake::RxOp>,
         OneTargetOneParamRewrite<quake::RyOp>,
         OneTargetOneParamRewrite<quake::RzOp>,
         OneTargetOneParamRewrite<quake::PhasedRxOp>,
         OneTargetTwoParamRewrite<quake::U2Op>,
-        OneTargetTwoParamRewrite<quake::U3Op>, PoisonOpPattern, ResetRewrite,
-        StdvecDataOpPattern, StdvecInitOpPattern, StdvecSizeOpPattern,
-        StoreOpPattern, SubveqOpRewrite, TwoTargetRewrite<quake::SwapOp>,
-        UndefOpPattern>(typeConverter);
+        OneTargetTwoParamRewrite<quake::U3Op>, PoisonOpPattern,
+        QmemRAIIOpRewrite, ResetRewrite, StdvecDataOpPattern,
+        StdvecInitOpPattern, StdvecSizeOpPattern, StoreOpPattern,
+        SubveqOpRewrite, TwoTargetRewrite<quake::SwapOp>, UndefOpPattern>(
+        typeConverter);
     patterns.insert<MeasureRewrite<quake::MzOp>>(typeConverter,
                                                  measureCounter);
 
     target.addLegalDialect<LLVM::LLVMDialect>();
     target.addLegalOp<ModuleOp>();
-    if (failed(applyFullConversion(getModule(), target, std::move(patterns))))
+    if (failed(applyFullConversion(getModule(), target, std::move(patterns)))) {
+      LLVM_DEBUG(getModule().dump());
       signalPassFailure();
+    }
   }
 };
diff --git a/lib/Optimizer/CodeGen/PassDetails.h b/lib/Optimizer/CodeGen/PassDetails.h
index 48ae6c5301..719c03b391 100644
--- a/lib/Optimizer/CodeGen/PassDetails.h
+++ b/lib/Optimizer/CodeGen/PassDetails.h
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "CodeGenDialect.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
diff --git a/lib/Optimizer/CodeGen/Passes.cpp b/lib/Optimizer/CodeGen/Passes.cpp
index 40e38f7d38..f11a0579eb 100644
--- a/lib/Optimizer/CodeGen/Passes.cpp
+++ b/lib/Optimizer/CodeGen/Passes.cpp
@@ -71,3 +71,7 @@ void cudaq::opt::registerTargetPipelines() {
                                      "Convert kernels to IonQ gate set.",
                                      addIonQPipeline);
 }
+
+void cudaq::opt::registerCodeGenDialect(DialectRegistry &registry) {
+  registry.insert<cudaq::codegen::CodeGenDialect>();
+}
diff --git a/lib/Optimizer/Dialect/CC/CCOps.cpp b/lib/Optimizer/Dialect/CC/CCOps.cpp
index 858925884b..e2b4a57acf 100644
--- a/lib/Optimizer/Dialect/CC/CCOps.cpp
+++ b/lib/Optimizer/Dialect/CC/CCOps.cpp
@@ -40,8 +40,7 @@ cudaq::cc::AddressOfOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
       getParentOfType<ModuleOp>(getOperation()), getGlobalNameAttr());
   // TODO: add globals?
-  auto function = dyn_cast_or_null<func::FuncOp>(op);
-  if (!function)
+  if (!isa_and_nonnull<func::FuncOp>(op))
     return emitOpError("must reference a global defined by 'func.func'");
   return success();
 }
@@ -408,7 +407,7 @@ OpFoldResult cudaq::cc::GetConstantElementOp::fold(FoldAdaptor adaptor) {
   auto conArr = getConstantArray().getDefiningOp<cudaq::cc::ConstantArrayOp>();
   if (!conArr)
     return nullptr;
-  cudaq::cc::ArrayType arrTy = conArr.getType();
+  cc::ArrayType arrTy = conArr.getType();
   if (arrTy.isUnknownSize())
     return nullptr;
   auto eleTy = arrTy.getElementType();
@@ -420,18 +419,75 @@ OpFoldResult cudaq::cc::GetConstantElementOp::fold(FoldAdaptor adaptor) {
   if (auto fltTy = dyn_cast<FloatType>(eleTy)) {
     auto floatConstVal =
         cast<FloatAttr>(conArr.getConstantValues()[offset]).getValue();
-    Value val =
-        builder.create<arith::ConstantFloatOp>(loc, floatConstVal, fltTy);
-    return val;
+    return builder.create<arith::ConstantFloatOp>(loc, floatConstVal, fltTy)
+        .getResult();
   }
   auto intConstVal =
       cast<IntegerAttr>(conArr.getConstantValues()[offset]).getInt();
   auto intTy = cast<IntegerType>(eleTy);
-  Value val = builder.create<arith::ConstantIntOp>(loc, intConstVal, intTy);
-  return val;
+  return builder.create<arith::ConstantIntOp>(loc, intConstVal, intTy)
+      .getResult();
   }
-  Value val = builder.create(loc, eleTy);
-  return val;
+  return builder.create(loc, eleTy).getResult();
+}
+
+//===----------------------------------------------------------------------===//
+// GlobalOp
+//===----------------------------------------------------------------------===//
+
+ParseResult cudaq::cc::GlobalOp::parse(OpAsmParser &parser,
+                                       OperationState &result) {
+  // Check for the `extern` optional keyword first.
+  if (succeeded(parser.parseOptionalKeyword("extern")))
+    result.addAttribute(getExternalAttrName(result.name),
+                        parser.getBuilder().getUnitAttr());
+
+  // Check for the `constant` optional keyword second.
+  if (succeeded(parser.parseOptionalKeyword("constant")))
+    result.addAttribute(getConstantAttrName(result.name),
+                        parser.getBuilder().getUnitAttr());
+
+  // Parse the rest of the global.
+  //   @<symbol> ( <initializer-attribute> )? : <type>
+  StringAttr name;
+  if (parser.parseSymbolName(name, getSymNameAttrName(result.name),
+                             result.attributes))
+    return failure();
+  if (succeeded(parser.parseOptionalLParen())) {
+    Attribute value;
+    if (parser.parseAttribute(value, getValueAttrName(result.name),
+                              result.attributes) ||
+        parser.parseRParen())
+      return failure();
+  }
+  SmallVector<Type> types;
+  if (parser.parseOptionalColonTypeList(types) ||
+      parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+  if (types.size() != 1)
+    return parser.emitError(parser.getNameLoc(), "expected exactly one type");
+  result.addAttribute(getGlobalTypeAttrName(result.name),
+                      TypeAttr::get(types[0]));
+  return success();
+}
+
+void cudaq::cc::GlobalOp::print(OpAsmPrinter &p) {
+  p << ' ';
+  if (getExternal())
+    p << "extern ";
+  if (getConstant())
+    p << "constant ";
+  p.printSymbolName(getSymName());
+  if (auto value = getValue()) {
+    p << " (";
+    p.printAttribute(*value);
+    p << ")";
+  }
+  p << " : " << getGlobalType();
+  p.printOptionalAttrDictWithKeyword(
+      (*this)->getAttrs(),
+      {getSymNameAttrName(), getValueAttrName(), getGlobalTypeAttrName(),
+       getConstantAttrName(), getExternalAttrName()});
 }
 
 //===----------------------------------------------------------------------===//
@@ -557,8 +613,8 @@ LogicalResult cudaq::cc::LoopOp::verify() {
     return emitOpError("size of init args and outputs must be equal");
   if (getWhileRegion().front().getArguments().size() != initArgsSize)
     return emitOpError("size of init args and while region args must be equal");
-  if (auto condOp = dyn_cast<cudaq::cc::ConditionOp>(
-          getWhileRegion().front().getTerminator())) {
+  if (auto condOp =
+          dyn_cast<ConditionOp>(getWhileRegion().front().getTerminator())) {
     if (condOp.getResults().size() != initArgsSize)
       return emitOpError("size of init args and condition op must be equal");
   } else {
@@ -570,8 +626,8 @@ LogicalResult cudaq::cc::LoopOp::verify() {
   if (getStepRegion().front().getArguments().size() != initArgsSize)
     return emitOpError(
         "size of init args and step region args must be equal");
-  if (auto contOp = dyn_cast<cudaq::cc::ContinueOp>(
-          getStepRegion().front().getTerminator())) {
+  if (auto contOp =
+          dyn_cast<ContinueOp>(getStepRegion().front().getTerminator())) {
     if (contOp.getOperands().size() != initArgsSize)
       return emitOpError("size of init args and continue op must be equal");
   } else {
@@ -1168,7 +1224,7 @@ ParseResult cudaq::cc::CreateLambdaOp::parse(OpAsmParser &parser,
 LogicalResult cudaq::cc::CallCallableOp::verify() {
   FunctionType funcTy;
   auto ty = getCallee().getType();
-  if (auto lambdaTy = dyn_cast<cudaq::cc::CallableType>(ty))
+  if (auto lambdaTy = dyn_cast<cc::CallableType>(ty))
     funcTy = lambdaTy.getSignature();
   else if (auto fTy = dyn_cast<FunctionType>(ty))
     funcTy = fTy;
@@ -1246,7 +1302,7 @@ struct ReplaceInFunc : public OpRewritePattern {
 void cudaq::cc::ReturnOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
-  patterns.add<ReplaceInFunc<cudaq::cc::ReturnOp, func::ReturnOp>>(context);
+  patterns.add<ReplaceInFunc<ReturnOp, func::ReturnOp>>(context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1316,8 +1372,7 @@ struct ReplaceInLoop : public OpRewritePattern {
 void cudaq::cc::UnwindBreakOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
-  patterns.add<ReplaceInLoop<cudaq::cc::UnwindBreakOp, cudaq::cc::BreakOp>>(
-      context);
+  patterns.add<ReplaceInLoop<UnwindBreakOp, BreakOp>>(context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1346,9 +1401,7 @@ LogicalResult cudaq::cc::UnwindContinueOp::verify() {
 void cudaq::cc::UnwindContinueOp::getCanonicalizationPatterns(
     RewritePatternSet &patterns, MLIRContext *context) {
-  patterns
-      .add<ReplaceInLoop<cudaq::cc::UnwindContinueOp, cudaq::cc::ContinueOp>>(
-          context);
+  patterns.add<ReplaceInLoop<UnwindContinueOp, ContinueOp>>(context);
 }
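+// For reference, the assembly forms accepted by GlobalOp::parse and emitted
+// by GlobalOp::print above look roughly like the following (an illustrative
+// sketch; the symbol names, value attribute, and types are made up):
+//   cc.global extern @my.state : f64
+//   cc.global constant @my.data (dense<[0.5, 0.5]> : tensor<2xf64>) : !cc.array<f64 x 2>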
 //===----------------------------------------------------------------------===//
diff --git a/lib/Optimizer/Dialect/CC/CCTypes.cpp b/lib/Optimizer/Dialect/CC/CCTypes.cpp
index fa53604c1d..5d52c8baeb 100644
--- a/lib/Optimizer/Dialect/CC/CCTypes.cpp
+++ b/lib/Optimizer/Dialect/CC/CCTypes.cpp
@@ -150,35 +150,35 @@ void cc::ArrayType::print(AsmPrinter &printer) const {
 
 //===----------------------------------------------------------------------===//
 
-namespace cudaq {
+namespace cudaq::cc {
 
 Type cc::SpanLikeType::getElementType() const {
   return llvm::TypeSwitch(*this).Case(
      [](auto type) { return type.getElementType(); });
 }
 
-bool cc::isDynamicType(Type ty) {
-  if (isa<cc::SpanLikeType>(ty))
+bool isDynamicType(Type ty) {
+  if (isa<SpanLikeType>(ty))
     return true;
-  if (auto strTy = dyn_cast<cc::StructType>(ty)) {
+  if (auto strTy = dyn_cast<StructType>(ty)) {
     for (auto memTy : strTy.getMembers())
       if (isDynamicType(memTy))
         return true;
     return false;
   }
-  if (auto arrTy = dyn_cast<cc::ArrayType>(ty))
+  if (auto arrTy = dyn_cast<ArrayType>(ty))
     return arrTy.isUnknownSize() || isDynamicType(arrTy.getElementType());
   // Note: this isn't considering quake, builtin, etc. types.
   return false;
 }
 
-cc::CallableType cc::CallableType::getNoSignature(MLIRContext *ctx) {
+CallableType CallableType::getNoSignature(MLIRContext *ctx) {
   return CallableType::get(ctx, FunctionType::get(ctx, {}, {}));
 }
 
-void cc::CCDialect::registerTypes() {
+void CCDialect::registerTypes() {
   addTypes();
 }
 
-} // namespace cudaq
+} // namespace cudaq::cc
diff --git a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
index abea24d1d0..3a939f57af 100644
--- a/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
+++ b/lib/Optimizer/Dialect/Quake/QuakeOps.cpp
@@ -94,7 +94,7 @@ bool quake::isSupportedMappingOperation(Operation *op) {
   return isa(op);
 }
 
-mlir::ValueRange quake::getQuantumTypesFromRange(mlir::ValueRange range) {
+ValueRange quake::getQuantumTypesFromRange(ValueRange range) {
   // Skip over classical types at the beginning
   int numClassical = 0;
@@ -105,7 +105,7 @@ mlir::ValueRange quake::getQuantumTypesFromRange(mlir::ValueRange range) {
     break;
   }
 
-  mlir::ValueRange retVals = range.drop_front(numClassical);
+  ValueRange retVals = range.drop_front(numClassical);
 
   // Make sure all remaining operands are quantum
   for (auto operand : retVals)
@@ -115,17 +115,16 @@ mlir::ValueRange quake::getQuantumTypesFromRange(mlir::ValueRange range) {
   return retVals;
 }
 
-mlir::ValueRange quake::getQuantumResults(Operation *op) {
+ValueRange quake::getQuantumResults(Operation *op) {
   return getQuantumTypesFromRange(op->getResults());
 }
 
-mlir::ValueRange quake::getQuantumOperands(Operation *op) {
+ValueRange quake::getQuantumOperands(Operation *op) {
   return getQuantumTypesFromRange(op->getOperands());
 }
 
 LogicalResult quake::setQuantumOperands(Operation *op, ValueRange quantumVals) {
-  mlir::ValueRange quantumOperands =
-      getQuantumTypesFromRange(op->getOperands());
+  ValueRange quantumOperands = getQuantumTypesFromRange(op->getOperands());
   if (quantumOperands.size() != quantumVals.size())
     return failure();
@@ -183,6 +182,14 @@ LogicalResult quake::AllocaOp::verify() {
       }
     }
   }
+
+  // Check the uses. If any use is an InitializeStateOp, then it must be the
+  // only use.
+  Operation *self = getOperation();
+  if (!self->getUsers().empty() && !self->hasOneUse())
+    for (auto *op : self->getUsers())
+      if (isa<quake::InitializeStateOp>(op))
+        return emitOpError("init_state must be the only use");
   return success();
 }
 
@@ -194,6 +201,15 @@ void quake::AllocaOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add(context);
 }
 
+quake::InitializeStateOp quake::AllocaOp::getInitializedState() {
+  auto *self = getOperation();
+  if (self->hasOneUse()) {
+    auto x = self->getUsers().begin();
+    return dyn_cast<quake::InitializeStateOp>(*x);
+  }
+  return {};
+}
+
 //===----------------------------------------------------------------------===//
 // Apply
 //===----------------------------------------------------------------------===//
@@ -480,6 +496,19 @@ LogicalResult quake::ExtractRefOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// InitializeStateOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult quake::InitializeStateOp::verify() {
+  auto veqTy = cast<quake::VeqType>(getTargets().getType());
+  if (veqTy.hasSpecifiedSize())
+    if (!std::has_single_bit(veqTy.getSize()))
+      return emitOpError("initialize state vector must be a power of 2, but "
+                         "is " + std::to_string(veqTy.getSize()) + " instead");
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // RelaxSizeOp
 //===----------------------------------------------------------------------===//
@@ -694,8 +723,8 @@ static LogicalResult getParameterAsDouble(Value parameter, double &result) {
   auto paramDefOp = parameter.getDefiningOp();
   if (!paramDefOp)
     return failure();
-  if (auto constOp = mlir::dyn_cast<arith::ConstantOp>(paramDefOp)) {
-    if (auto value = dyn_cast<FloatAttr>(constOp.getValue())) {
+  if (auto constOp = dyn_cast<arith::ConstantOp>(paramDefOp)) {
+    if (auto value = dyn_cast<FloatAttr>(constOp.getValue())) {
       result = value.getValueAsDouble();
       return success();
     }
diff --git a/lib/Optimizer/Transforms/AddDeallocs.cpp b/lib/Optimizer/Transforms/AddDeallocs.cpp
index d9c06ca05a..4fc41b83a8 100644
--- a/lib/Optimizer/Transforms/AddDeallocs.cpp
+++ b/lib/Optimizer/Transforms/AddDeallocs.cpp
@@ -49,7 +49,11 @@ struct DeallocationAnalysisInfo {
         deallocMap.insert(std::make_pair(&op, false));
       } else if (auto dealloc = dyn_cast<quake::DeallocOp>(op)) {
         auto val = dealloc.getReference();
-        Operation *alloc = cast<quake::AllocaOp>(val.getDefiningOp());
+        Operation *alloc = val.getDefiningOp();
+        if (!isa<quake::AllocaOp>(alloc)) {
+          auto initState = cast<quake::InitializeStateOp>(alloc);
+          alloc = initState.getTargets().getDefiningOp();
+        }
         if (deallocMap.count(alloc))
           deallocMap[alloc] = true;
         else
@@ -99,6 +103,8 @@ class DeallocationAnalysis {
         }
       } else if (auto dealloc = dyn_cast<quake::DeallocOp>(o)) {
         auto val = dealloc.getReference();
+        if (auto init = val.getDefiningOp<quake::InitializeStateOp>())
+          val = init.getTargets();
         if (auto alloc = val.getDefiningOp<quake::AllocaOp>()) {
           auto *op = alloc.getOperation();
           if (allocMap.count(op))
@@ -121,8 +127,16 @@ class DeallocationAnalysis {
 
 inline void generateDeallocsForSet(PatternRewriter &rewriter,
                                    llvm::DenseSet<Operation *> &allocSet) {
-  for (Operation *a : allocSet)
-    rewriter.create<quake::DeallocOp>(a->getLoc(), cast<quake::AllocaOp>(a));
+  for (Operation *a : allocSet) {
+    auto alloc = cast<quake::AllocaOp>(a);
+    Value v = alloc;
+    if (a->hasOneUse()) {
+      if (auto initState =
+              dyn_cast<quake::InitializeStateOp>(*a->getUsers().begin()))
+        v = initState;
+    }
+    rewriter.create<quake::DeallocOp>(a->getLoc(), v);
+  }
 }
 
 // The different rewrite cases involve the same work, but use different types.
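+// Illustrative effect of the analysis above (a sketch; op spellings assumed
+// from the quake dialect used in this PR): when an alloca's single use is an
+// init_state, the generated dealloc now refers to the init_state result
+// rather than the alloca itself:
+//   %q = quake.alloca !quake.veq<2>
+//   %v = quake.init_state %q, %data
+//   ...
+//   quake.dealloc %v   // previously: quake.dealloc %q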
diff --git a/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp b/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp
index 06cdc68161..abd04d6f58 100644
--- a/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp
+++ b/lib/Optimizer/Transforms/CombineQuantumAlloc.cpp
@@ -165,7 +165,7 @@ class CombineQuantumAllocationsPass
     for (auto &block : func.getRegion())
       for (auto &op : block) {
         if (auto alloc = dyn_cast_or_null<quake::AllocaOp>(&op)) {
-          if (alloc.getSize())
+          if (alloc.getSize() || alloc.hasInitializedState())
             return;
           analysis.allocations.push_back(alloc);
           auto size = allocationSize(alloc);
diff --git a/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp b/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp
index 59fe5ef137..d597bad483 100644
--- a/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp
+++ b/lib/Optimizer/Transforms/FactorQuantumAlloc.cpp
@@ -123,6 +123,8 @@ class FactorQuantumAllocationsPass
     ConversionTarget target(*ctx);
     target.addLegalDialect();
     target.addDynamicallyLegalOp<quake::DeallocOp>([](quake::DeallocOp d) {
+      if (d.getReference().getDefiningOp<quake::InitializeStateOp>())
+        return true;
       if (auto ty = dyn_cast<quake::VeqType>(d.getReference().getType()))
         return !ty.hasSpecifiedSize();
       return true;
@@ -147,6 +149,8 @@ class FactorQuantumAllocationsPass
                  allocations.end();
     });
     target.addDynamicallyLegalOp<quake::DeallocOp>([](quake::DeallocOp d) {
+      if (d.getReference().getDefiningOp<quake::InitializeStateOp>())
+        return true;
       if (auto ty = dyn_cast<quake::VeqType>(d.getReference().getType()))
         return !ty.hasSpecifiedSize();
       return true;
@@ -164,7 +168,8 @@ class FactorQuantumAllocationsPass
   LogicalResult runAnalysis(SmallVector<quake::AllocaOp> &allocations) {
     auto func = getOperation();
     func.walk([&](quake::AllocaOp alloc) {
-      if (!allocaOfVeq(alloc) || allocaOfUnspecifiedSize(alloc))
+      if (!allocaOfVeq(alloc) || allocaOfUnspecifiedSize(alloc) ||
+          alloc.hasInitializedState())
         return;
       bool usesAreConvertible = [&]() {
         for (auto *users : alloc->getUsers()) {
diff --git a/lib/Optimizer/Transforms/LowerUnwind.cpp b/lib/Optimizer/Transforms/LowerUnwind.cpp
index a9767e3834..2e3ba4f216 100644
--- a/lib/Optimizer/Transforms/LowerUnwind.cpp
+++ b/lib/Optimizer/Transforms/LowerUnwind.cpp
@@ -321,6 +321,16 @@ static bool anyPrimitiveAncestor(
   return false;
 }
 
+static Value adjustedDeallocArg(quake::AllocaOp alloc) {
+  if (auto init = alloc.getInitializedState())
+    return init.getResult();
+  return alloc.getResult();
+}
+
+static Value adjustedDeallocArg(Operation *op) {
+  return adjustedDeallocArg(cast<quake::AllocaOp>(op));
+}
+
 namespace {
 /// A scope op that contains an unwind op and is contained by a loop (for break
 /// or continue) or for return always, dictates that the unwind op must transfer
@@ -366,7 +376,7 @@ struct ScopeOpPattern : public OpRewritePattern {
       auto *contOp = pr.first;
       rewriter.setInsertionPoint(contOp);
       for (auto a : llvm::reverse(pr.second))
-        rewriter.create<quake::DeallocOp>(a.getLoc(), a.getResult());
+        rewriter.create<quake::DeallocOp>(a.getLoc(), adjustedDeallocArg(a));
       rewriter.replaceOpWithNewOp<cf::BranchOp>(contOp, nextBlock,
                                                 contOp->getOperands());
     }
@@ -380,7 +390,7 @@ struct ScopeOpPattern : public OpRewritePattern {
     if (Block *blk = blockInfo.continueBlock) {
       rewriter.setInsertionPointToEnd(blk);
       for (auto a : llvm::reverse(qallocas))
-        rewriter.create<quake::DeallocOp>(a->getLoc(), a->getResult(0));
+        rewriter.create<quake::DeallocOp>(a->getLoc(), adjustedDeallocArg(a));
       if (asPrimitive) {
         Block *landingPad = getLandingPad(infoMap, scope).continueBlock;
         rewriter.create<cf::BranchOp>(loc, landingPad, blk->getArguments());
@@ -393,7 +403,7 @@ struct ScopeOpPattern : public OpRewritePattern {
     if (Block *blk = blockInfo.breakBlock) {
       rewriter.setInsertionPointToEnd(blk);
       for (auto a : llvm::reverse(qallocas))
-        rewriter.create<quake::DeallocOp>(a->getLoc(), a->getResult(0));
+        rewriter.create<quake::DeallocOp>(a->getLoc(), adjustedDeallocArg(a));
       if (asPrimitive) {
         Block *landingPad = getLandingPad(infoMap, scope).breakBlock;
         rewriter.create<cf::BranchOp>(loc, landingPad, blk->getArguments());
@@ -406,7 +416,7 @@ struct ScopeOpPattern : public OpRewritePattern {
     if (Block *blk = blockInfo.returnBlock) {
       rewriter.setInsertionPointToEnd(blk);
       for (auto a : llvm::reverse(qallocas))
-        rewriter.create<quake::DeallocOp>(a->getLoc(), a->getResult(0));
+        rewriter.create<quake::DeallocOp>(a->getLoc(), adjustedDeallocArg(a));
       assert(asPrimitive);
       Block *landingPad = getLandingPad(infoMap, scope).returnBlock;
       rewriter.create<cf::BranchOp>(loc, landingPad, blk->getArguments());
@@ -458,7 +468,7 @@ struct FuncLikeOpPattern : public OpRewritePattern {
       auto *exitOp = pr.first;
       rewriter.setInsertionPoint(exitOp);
       for (auto a : llvm::reverse(pr.second))
-        rewriter.create<quake::DeallocOp>(a.getLoc(), a.getResult());
+        rewriter.create<quake::DeallocOp>(a.getLoc(), adjustedDeallocArg(a));
     }
 
     // Here, we handle the unwind return jumps.
@@ -477,7 +487,7 @@ struct FuncLikeOpPattern : public OpRewritePattern {
     if (Block *exitBlock = blockInfo.returnBlock) {
       rewriter.setInsertionPointToEnd(exitBlock);
       for (auto a : llvm::reverse(qallocas))
-        rewriter.create<quake::DeallocOp>(a->getLoc(), a->getResult(0));
+        rewriter.create<quake::DeallocOp>(a->getLoc(), adjustedDeallocArg(a));
       rewriter.create<func::ReturnOp>(func.getLoc(), exitBlock->getArguments());
       func.getBody().push_back(exitBlock);
     }
diff --git a/python/cudaq/__init__.py b/python/cudaq/__init__.py
index ee0f389a01..e6dc131886 100644
--- a/python/cudaq/__init__.py
+++ b/python/cudaq/__init__.py
@@ -6,7 +6,7 @@
 # the terms of the Apache License 2.0 which accompanies this distribution.    #
 # ============================================================================ #
-import sys, os
+import sys, os, numpy
 from ._packages import *
 from .kernel.kernel_decorator import kernel, PyKernelDecorator
 from .kernel.kernel_builder import make_kernel, QuakeValue, PyKernel
@@ -87,6 +87,26 @@ def synthesize(kernel, *args):
                                       kernelName=kernel.name)
 
 
+def simulation_dtype():
+    """
+    Return the data type for the current simulation backend,
+    either `numpy.complex128` or `numpy.complex64`.
+    """
+    target = get_target()
+    precision = target.get_precision()
+    if precision == cudaq_runtime.SimulationPrecision.fp64:
+        return numpy.complex128
+    return numpy.complex64
+
+
+def create_state(array_data):
+    """
+    Create a state array with the appropriate data type for the
+    current simulation backend target.
+    """
+    return numpy.array(array_data, dtype=simulation_dtype())
+
+
 def __clearKernelRegistries():
     global globalKernelRegistry, globalAstRegistry
     globalKernelRegistry.clear()
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index c07c737b56..5a74a35bb2 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -17,7 +17,7 @@
 from ..mlir.passmanager import *
 from ..mlir.dialects import quake, cc
 from ..mlir.dialects import builtin, func, arith, math, complex
-from ..mlir._mlir_libs._quakeDialects import cudaq_runtime, load_intrinsic
+from ..mlir._mlir_libs._quakeDialects import cudaq_runtime, load_intrinsic, register_all_dialects
 
 # This file implements the CUDA Quantum Python AST to MLIR conversion.
 # It provides a `PyASTBridge` class that implements the `ast.NodeVisitor` type
@@ -115,6 +115,7 @@ def __init__(self, **kwargs):
             self.loc = Location.unknown(context=self.ctx)
         else:
             self.ctx = Context()
+            register_all_dialects(self.ctx)
             quake.register_dialect(self.ctx)
             cc.register_dialect(self.ctx)
             cudaq_runtime.registerLLVMDialectTranslation(self.ctx)
@@ -245,11 +246,27 @@ def getFloatAttr(self, type, value):
     def getConstantFloat(self, value):
         """
         Create a constant float operation and return its MLIR result Value.
-        Takes as input the concrete float value. 
+        Takes as input the concrete float value.
         """
         ty = self.getFloatType()
         return arith.ConstantOp(ty, self.getFloatAttr(ty, value)).result
 
+    def getComplexType(self):
+        """
+        Return an MLIR complex type (double precision).
+        """
+        return ComplexType.get(self.getFloatType())
+
+    def getConstantComplex(self, value):
+        """
+        Create a constant complex operation and return its MLIR result Value.
+        Takes as input the concrete complex value.
+        """
+        ty = self.getComplexType()
+        return complex.CreateOp(ty,
+                                self.getConstantFloat(value.real),
+                                self.getConstantFloat(value.imag)).result
+
     def getConstantInt(self, value, width=64):
         """
         Create a constant integer operation and return its MLIR result Value.
@@ -884,7 +901,7 @@ def visit_Assign(self, node):
 
         self.visit(node.value)
         if len(self.valueStack) == 0:
-            self.emitFatalError("invalid assignement detected.", node)
+            self.emitFatalError("invalid assignment detected.", node)
 
         varNames = []
         varValues = []
@@ -930,7 +947,7 @@ def visit_Attribute(self, node):
         see from ubiquitous external modules like `numpy`.
         """
         if self.verbose:
-            print('[Visit Attribute]')
+            print(f'[Visit Attribute {node.attr}]')
 
         self.currentNode = node
 
         # Disallow list.append since we don't do dynamic memory allocation
@@ -2725,6 +2742,21 @@ def visit_Continue(self, node):
         else:
             cc.ContinueOp([])
 
+    def promote_operand_type(self, ty, operand):
+        if ComplexType.isinstance(ty):
+            if not ComplexType.isinstance(operand.type):
+                if IntegerType.isinstance(operand.type):
+                    operand = arith.SIToFPOp(self.getFloatType(), operand).result
+                operand = complex.CreateOp(
+                    ComplexType.get(self.getFloatType()), operand,
+                    self.getConstantFloat(0.0)).result
+
+        if F64Type.isinstance(ty):
+            if IntegerType.isinstance(operand.type):
+                operand = arith.SIToFPOp(ty, operand).result
+
+        return operand
+
     def visit_BinOp(self, node):
         """
         Visit binary operation nodes in the AST and map them to equivalents in the
@@ -2757,16 +2789,22 @@ def visit_BinOp(self, node):
             raise RuntimeError("Invalid type for Binary Op {} ({}, {})".format(
                 type(node.op), left, right))
 
-        # Basedon the op type and the leaf types, create the MLIR operator
+        # Type promotion for Add, Sub, Div, Mult operations
+        if isinstance(node.op, (ast.Add, ast.Sub, ast.Mult, ast.Div)):
+            right = self.promote_operand_type(left.type, right)
+            left = self.promote_operand_type(right.type, left)
+
+        # Based on the op type and the leaf types, create the MLIR operator
         if isinstance(node.op, ast.Add):
             if IntegerType.isinstance(left.type):
                 self.pushValue(arith.AddIOp(left, right).result)
                 return
             elif F64Type.isinstance(left.type):
-                if IntegerType.isinstance(right.type):
-                    right = arith.SIToFPOp(left.type, right).result
                 self.pushValue(arith.AddFOp(left, right).result)
                 return
+            elif ComplexType.isinstance(left.type):
+                self.pushValue(complex.AddOp(left, right).result)
+                return
             else:
                 self.emitFatalError("unhandled BinOp.Add types.", node)
 
@@ -2774,6 +2812,11 @@
             if IntegerType.isinstance(left.type):
                 self.pushValue(arith.SubIOp(left, right).result)
                 return
+            if F64Type.isinstance(left.type):
+                self.pushValue(arith.SubFOp(left, right).result)
+                return
+            if ComplexType.isinstance(left.type):
+                self.pushValue(complex.SubOp(left, right).result)
+                return
             else:
                 self.emitFatalError("unhandled BinOp.Sub types.", node)
         if isinstance(node.op, ast.FloorDiv):
@@ -2784,17 +2827,6 @@ def visit_BinOp(self, node):
                 self.emitFatalError("unhandled BinOp.FloorDiv types.", node)
         if isinstance(node.op, ast.Div):
             if ComplexType.isinstance(left.type):
-                if not ComplexType.isinstance(right.type):
-                    right = complex.CreateOp(
-                        ComplexType.get(self.getFloatType()), right,
-                        self.getConstantFloat(0.0)).result
-                self.pushValue(complex.DivOp(left, right).result)
-                return
-            if ComplexType.isinstance(right.type):
-                if not ComplexType.isinstance(left.type):
-                    left = complex.CreateOp(
-                        ComplexType.get(self.getFloatType()), left,
-                        self.getConstantFloat(0.0)).result
                 self.pushValue(complex.DivOp(left, right).result)
                 return
@@ -2828,26 +2860,16 @@ def visit_BinOp(self, node):
                 return
         if isinstance(node.op, ast.Mult):
             if ComplexType.isinstance(left.type):
-                if not ComplexType.isinstance(right.type):
-                    if IntegerType.isinstance(right.type):
-                        right = arith.SIToFPOp(self.getFloatType(),
-                                               right).result
-                    right = complex.CreateOp(left.type, right,
-                                             self.getConstantFloat(0.)).result
                 self.pushValue(complex.MulOp(left, right).result)
                 return
             if F64Type.isinstance(left.type):
-                if not F64Type.isinstance(right.type):
-                    right = arith.SIToFPOp(self.getFloatType(), right).result
+                self.pushValue(arith.MulFOp(left, right).result)
+                return
             if IntegerType.isinstance(left.type):
-                if not IntegerType.isinstance(right.type):
-                    right = arith.FPToSIOp(left.type, right).result
                 self.pushValue(arith.MulIOp(left, right).result)
                 return
-
-            self.pushValue(arith.MulFOp(left, right).result)
             return
         if isinstance(node.op, ast.Mod):
             if F64Type.isinstance(left.type):
@@ -2895,9 +2917,10 @@ def visit_Name(self, node):
 
         if node.id in self.capturedVars:
             # Only support a small subset of types here
+            complexType = type(1j)
             value = self.capturedVars[node.id]
             if isinstance(value, list) and isinstance(value[0],
-                                                      (int, bool, float)):
+                                                      (int, bool, float, complexType)):
                 elementValues = None
                 if isinstance(value[0], float):
                     elementValues = [self.getConstantFloat(el) for el in value]
@@ -2905,6 +2928,8 @@ def visit_Name(self, node):
                     elementValues = [self.getConstantInt(el) for el in value]
                 elif isinstance(value[0], bool):
                     elementValues = [self.getConstantInt(el, 1) for el in value]
+                elif isinstance(value[0], complexType):
+                    elementValues = [self.getConstantComplex(el) for el in value]
 
                 if elementValues != None:
                     self.dependentCaptureVars[node.id] = value
@@ -2922,6 +2947,8 @@ def visit_Name(self, node):
                     mlirValCreator = lambda: self.getConstantInt(value, 1)
                 elif isinstance(value, float):
                     mlirValCreator = lambda: self.getConstantFloat(value)
+                elif isinstance(value, complexType):
+                    mlirValCreator = lambda: self.getConstantComplex(value)
 
                 if mlirValCreator != None:
                     with InsertionPoint.at_block_begin(self.entry):
@@ -2935,8 +2962,12 @@ def visit_Name(self, node):
                     self.pushValue(stackSlot)
                     return
 
+            errorType = type(value).__name__
+            if isinstance(value, list):
+                errorType = f"{errorType}[{type(value[0]).__name__}]"
+
             self.emitFatalError(
-                f"Invalid type for variable ({node.id}) captured from parent scope (only int, bool, float, and list[int|bool|float] accepted, type was {type(value)}).",
+                f"Invalid type for variable ({node.id}) captured from parent scope (only int, bool, float, complex, and list[int|bool|float|complex] accepted, type was {errorType}).",
                 node)
 
         # Throw an exception for the case that the name is not
diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py
index 0b238065cb..30e0a266a2 100644
--- a/python/cudaq/kernel/kernel_builder.py
+++ b/python/cudaq/kernel/kernel_builder.py
@@ -7,10 +7,12 @@
 # ============================================================================ #
 
 from functools import partialmethod
+import hashlib
 import random
 import re
 import string
 import sys
+import numpy as np
 from typing import get_origin, List
 from .quake_value import QuakeValue
 from .kernel_decorator import PyKernelDecorator
@@ -22,8 +24,8 @@
 from ..mlir.passmanager import *
 from ..mlir.execution_engine import *
 from ..mlir.dialects import quake, cc
-from ..mlir.dialects import builtin, func, arith
-from ..mlir._mlir_libs._quakeDialects import cudaq_runtime
+from ..mlir.dialects import builtin, func, arith, math
+from ..mlir._mlir_libs._quakeDialects import cudaq_runtime, register_all_dialects
 
 ## [PYTHON_VERSION_FIX]
@@ -175,6 +177,14 @@ def __singleTargetSingleParameterControlOperation(self,
                                 context=self.ctx)
 
 
+def supportCommonCast(mlirType, otherTy, arg, FromType, ToType, PyType):
+    argEleTy = cc.StdvecType.getElementType(mlirType)
+    eleTy = cc.StdvecType.getElementType(otherTy)
+    if ToType.isinstance(eleTy) and FromType.isinstance(argEleTy):
+        return [PyType(i) for i in arg]
+    return None
+
+
 class PyKernel(object):
     """
     The :class:`Kernel` provides an API for dynamically constructing quantum
@@ -191,10 +201,12 @@ class PyKernel(object):
 
     def __init__(self, argTypeList):
         self.ctx = Context()
+        register_all_dialects(self.ctx)
         quake.register_dialect(self.ctx)
         cc.register_dialect(self.ctx)
         cudaq_runtime.registerLLVMDialectTranslation(self.ctx)
 
+        self.stateHashes = []
         self.metadata = {'conditionalOnMeasure': False}
         self.regCounter = 0
         self.loc = Location.unknown(context=self.ctx)
@@ -234,6 +246,13 @@ def __init__(self, argTypeList):
 
             self.insertPoint = InsertionPoint.at_block_begin(e)
 
+    def __del__(self):
+        """
+        When a kernel builder is deleted we need to clean up
+        any state data if there is any.
+        """
+        cudaq_runtime.deletePointersToStateData(self.stateHashes)
+
     def __processArgType(self, ty):
         """
         Process input argument type. Specifically, try to infer the
@@ -247,6 +266,8 @@ def __processArgType(self, ty):
             'int': int,
             'bool': bool,
             'float': float,
+            'complex': complex,
+            'numpy.complex64': np.complex64,
             'pauli_word': cudaq_runtime.pauli_word
         }
         # Infer the slice type
@@ -516,13 +537,13 @@ def __str__(self, canonicalize=True):
             return str(cloned)
         return str(self.module)
 
-    def qalloc(self, size=None):
+    def qalloc(self, initializer=None):
         """
-        Allocate a register of qubits of size `qubit_count` and return a
-        handle to them as a :class:`QuakeValue`.
+        Allocate a register of qubits and return a handle to them as a
+        :class:`QuakeValue`.
 
         Args:
-            qubit_count (Union[`int`,`QuakeValue`): The number of qubits to allocate.
+            initializer (Union[`int`, `QuakeValue`, `list[T]`, `np.ndarray`]): The number of qubits to allocate, or a concrete state to allocate and initialize the qubits with.
 
         Returns:
             :class:`QuakeValue`: A handle to the allocated qubits in the MLIR.
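+        # Illustrative usage of the initializer overloads documented above (a
+        # sketch, not part of the builder API docs; assumes a simulator target
+        # is active and uses the `simulation_dtype` helper added in this PR):
+        #
+        #   import numpy as np, cudaq
+        #   kernel = cudaq.make_kernel()
+        #   qubits = kernel.qalloc(np.array([.70710678, 0., 0., .70710678],
+        #                                   dtype=cudaq.simulation_dtype()))
+        #   kernel.mz(qubits)
+        #   counts = cudaq.sample(kernel)   # expect '00' and '11' outcomes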
@@ -533,18 +554,150 @@ def qalloc(self, size=None):
         ```
         """
         with self.insertPoint, self.loc:
-            if size == None:
+            # If the initializer is an integer, create `veq`
+            if isinstance(initializer, int):
+                veqTy = quake.VeqType.get(self.ctx, initializer)
+                return self.__createQuakeValue(quake.AllocaOp(veqTy).result)
+
+            if isinstance(initializer, list):
+                initializer = np.array(initializer, dtype=type(initializer[0]))
+
+            if isinstance(initializer, np.ndarray):
+                if len(initializer.shape) != 1:
+                    raise RuntimeError(
+                        "invalid initializer for qalloc (np.ndarray must be 1D, vector-like)"
+                    )
+
+                if initializer.dtype not in [
+                        complex, np.complex128, np.complex64
+                ]:
+                    raise RuntimeError(
+                        "qalloc state data must be of complex dtype.")
+
+                # Get the current simulation precision
+                currentTarget = cudaq_runtime.get_target()
+                simulationPrecision = currentTarget.get_precision()
+                if initializer.dtype in [np.complex128, complex]:
+                    if simulationPrecision == cudaq_runtime.SimulationPrecision.fp32:
+                        raise RuntimeError(
+                            "qalloc input state is complex128 but the simulator uses complex64 (fp32) precision."
+                        )
+
+                if initializer.dtype == np.complex64:
+                    if simulationPrecision == cudaq_runtime.SimulationPrecision.fp64:
+                        raise RuntimeError(
+                            "qalloc input state is complex64 but the simulator uses complex128 (fp64) precision."
+                        )
+
+                # Compute a unique hash string for the state data
+                hashValue = hashlib.sha1(initializer).hexdigest(
+                )[:10] + self.name.removeprefix('__nvqppBuilderKernel_')
+
+                # Get the size of the array
+                size = len(initializer)
+
+                floatType = F64Type.get(
+                    self.ctx
+                ) if simulationPrecision == cudaq_runtime.SimulationPrecision.fp64 else F32Type.get(
+                    self.ctx)
+                complexType = ComplexType.get(floatType)
+                ptrComplex = cc.PointerType.get(self.ctx, complexType)
+                i32Ty = self.getIntegerType(32)
+                globalTy = cc.StructType.get(self.ctx, [ptrComplex, i32Ty])
+                globalName = f'nvqpp.state.{hashValue}'
+                setStateName = f'nvqpp.set.state.{hashValue}'
+                with InsertionPoint.at_block_begin(self.module.body):
+                    cc.GlobalOp(TypeAttr.get(globalTy),
+                                globalName,
+                                external=True)
+                    setStateFunc = func.FuncOp(setStateName,
+                                               FunctionType.get(
+                                                   inputs=[ptrComplex],
+                                                   results=[]),
+                                               loc=self.loc)
+                    entry = setStateFunc.add_entry_block()
+                    kDynamicPtrIndex: int = -2147483648
+                    with InsertionPoint(entry):
+                        zero = self.getConstantInt(0)
+                        address = cc.AddressOfOp(
+                            cc.PointerType.get(self.ctx, globalTy),
+                            FlatSymbolRefAttr.get(globalName))
+                        ptr = cc.ComputePtrOp(
+                            cc.PointerType.get(self.ctx, ptrComplex), address,
+                            [zero, zero],
+                            DenseI32ArrayAttr.get(
+                                [kDynamicPtrIndex, kDynamicPtrIndex],
+                                context=self.ctx))
+                        cc.StoreOp(entry.arguments[0], ptr)
+                        func.ReturnOp([])
+
+                zero = self.getConstantInt(0)
+                numQubits = np.log2(size)
+                if not numQubits.is_integer():
+                    raise RuntimeError(
+                        "invalid input state size for qalloc (not a power of 2)"
+                    )
+
+                # Check that the state is normalized
+                norm = sum([np.conj(a) * a for a in initializer])
+                if np.abs(norm.imag) > 1e-4 or np.abs(1. - norm.real) > 1e-4:
+                    raise RuntimeError(
+                        "invalid input state for qalloc (not normalized)")
+
+                veqTy = quake.VeqType.get(self.ctx, int(numQubits))
+                qubits = quake.AllocaOp(veqTy).result
+                address = cc.AddressOfOp(cc.PointerType.get(self.ctx, globalTy),
+                                         FlatSymbolRefAttr.get(globalName))
+                ptr = cc.ComputePtrOp(
+                    cc.PointerType.get(self.ctx, ptrComplex), address,
+                    [zero, zero],
+                    DenseI32ArrayAttr.get([kDynamicPtrIndex, kDynamicPtrIndex],
+                                          context=self.ctx))
+                loaded = cc.LoadOp(ptr)
+                qubits = quake.InitializeStateOp(qubits.type, qubits,
+                                                 loaded).result
+
+                # Record the unique hash value
+                if hashValue not in self.stateHashes:
+                    self.stateHashes.append(hashValue)
+
+                # Store the pointer to the array data, tagged with the
+                # simulation precision validated above.
+                cudaq_runtime.storePointerToStateData(
+                    self.name, hashValue, initializer, simulationPrecision)
+
+                return self.__createQuakeValue(qubits)
+
+            # If the initializer is a QuakeValue, see if it is
+            # an integer or a `stdvec` type
+            if isinstance(initializer, QuakeValue):
+                veqTy = quake.VeqType.get(self.ctx)
+                if IntegerType.isinstance(initializer.mlirValue.type):
+                    # This is an integer size
+                    return self.__createQuakeValue(
+                        quake.AllocaOp(veqTy,
+                                       size=initializer.mlirValue).result)
+
+                if cc.StdvecType.isinstance(initializer.mlirValue.type):
+                    # This is a state to initialize to
+                    size = cc.StdvecSizeOp(self.getIntegerType(),
+                                           initializer.mlirValue).result
+                    numQubits = math.CountTrailingZerosOp(size).result
+                    qubits = quake.AllocaOp(veqTy, size=numQubits).result
+                    ptrTy = cc.PointerType.get(
+                        self.ctx,
+                        cc.StdvecType.getElementType(
+                            initializer.mlirValue.type))
+                    initials = cc.StdvecDataOp(ptrTy, initializer.mlirValue)
+                    quake.InitializeStateOp(veqTy, qubits, initials)
+                    return self.__createQuakeValue(qubits)
+
+            # If no initializer, create a single qubit
+            if initializer is None:
                 qubitTy = quake.RefType.get(self.ctx)
                 return self.__createQuakeValue(quake.AllocaOp(qubitTy).result)
-            else:
-                if isinstance(size, QuakeValue):
-                    veqTy = quake.VeqType.get(self.ctx)
-                    sizeVal = size.mlirValue
-                    return self.__createQuakeValue(
-                        quake.AllocaOp(veqTy, size=sizeVal).result)
-                else:
-                    veqTy = quake.VeqType.get(self.ctx, size)
-                    return self.__createQuakeValue(quake.AllocaOp(veqTy).result)
+
+            raise RuntimeError("invalid initializer argument for qalloc.")
 
     def __isPauliWordType(self, ty):
         """
@@ -1132,6 +1285,26 @@ def getListType(eleType: type):
                 continue
             listType = getListType(type(arg[0]))
             mlirType = mlirTypeFromPyType(argType, self.ctx)
+
+            if cc.StdvecType.isinstance(mlirType):
+                # Support passing `list[int]` to a `list[float]` argument
+                if cc.StdvecType.isinstance(self.mlirArgTypes[i]):
+                    maybeCasted = supportCommonCast(mlirType,
+                                                    self.mlirArgTypes[i], arg,
+                                                    IntegerType, F64Type, float)
+                    if maybeCasted is not None:
+                        processedArgs.append(maybeCasted)
+                        continue
+
+                    # Support passing `list[float]` to a `list[complex]` argument
+                    maybeCasted = supportCommonCast(mlirType,
+                                                    self.mlirArgTypes[i], arg,
+                                                    F64Type, ComplexType,
+                                                    complex)
+                    if maybeCasted is not None:
+                        processedArgs.append(maybeCasted)
+                        continue
+
             if mlirType != self.mlirArgTypes[
                     i] and listType != mlirTypeToPyType(self.mlirArgTypes[i]):
                 emitFatalError(
diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py
index 203e66eb5c..61dc288510 100644
--- a/python/cudaq/kernel/utils.py
+++ b/python/cudaq/kernel/utils.py
@@ -209,9 +209,12 @@ def mlirTypeFromPyType(argType, ctx, **kwargs):
             )
             return cc.StdvecType.get(ctx, mlirTypeFromPyType(float, ctx))
-        if isinstance(argInstance[0], complex):
+        if isinstance(argInstance[0], (complex, np.complex128)):
             return cc.StdvecType.get(ctx, mlirTypeFromPyType(complex, ctx))
 
+        if isinstance(argInstance[0], np.complex64):
+            return cc.StdvecType.get(ctx, ComplexType.get(F32Type.get(ctx)))
+
         if isinstance(argInstance[0], pauli_word):
             return cc.StdvecType.get(ctx, cc.CharspanType.get(ctx))
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index ea60a147ed..ca58f36236 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -29,6 +29,7 @@
 #include "mlir/Target/LLVMIR/Export.h"
 
 #include
+#include
 #include
 
 namespace py = pybind11;
@@ -37,6 +38,16 @@ using namespace mlir;
 
 namespace cudaq {
 static std::unique_ptr<JITExecutionCache> jitCache;
 
+struct PyStateVectorData {
+  void *data = nullptr;
+  simulation_precision precision = simulation_precision::fp32;
+  std::string kernelName;
+};
+using PyStateVectorStorage = std::map<std::string, PyStateVectorData>;
+
+static std::unique_ptr<PyStateVectorStorage> stateStorage =
+    std::make_unique<PyStateVectorStorage>();
+
 std::tuple
 jitAndCreateArgs(const std::string &name, MlirModule module,
                  cudaq::OpaqueArguments &runtimeArgs,
@@ -185,6 +196,28 @@ pyAltLaunchKernelBase(const std::string &name, MlirModule module,
 
   std::string properName = name;
 
+  // If we have any state vector data, we need to extract the function pointer
+  // to set that data, and then set it.
+  for (auto &[stateHash, svdata] : *stateStorage) {
+    if (svdata.kernelName != name)
+      continue;
+    auto setStateFPtr = jit->lookup("nvqpp.set.state." + stateHash);
+    if (!setStateFPtr)
+      throw std::runtime_error(
+          "python alt_launch_kernel failed to get set state function.");
+
+    if (svdata.precision == simulation_precision::fp64) {
+      auto setStateFunc =
+          reinterpret_cast<void (*)(std::complex<double> *)>(*setStateFPtr);
+      setStateFunc(reinterpret_cast<std::complex<double> *>(svdata.data));
+      continue;
+    }
+
+    auto setStateFunc =
+        reinterpret_cast<void (*)(std::complex<float> *)>(*setStateFPtr);
+    setStateFunc(reinterpret_cast<std::complex<float> *>(svdata.data));
+  }
+
   // Need to first invoke the init_func()
   auto kernelInitFunc = properName + ".init_func";
   auto initFuncPtr = jit->lookup(kernelInitFunc);
@@ -362,6 +395,7 @@ void bindAltLaunchKernel(py::module &mod) {
       },
       py::arg("kernelName"), py::arg("module"), py::kw_only(),
       py::arg("callable_names") = std::vector<std::string>{}, "DOC STRING");
+
   mod.def(
       "pyAltLaunchKernelR",
       [&](const std::string &kernelName, MlirModule module, MlirType returnType,
@@ -398,5 +432,28 @@ void bindAltLaunchKernel(py::module &mod) {
         return getQIRLL(name, module, args, profile);
       },
      py::arg("kernel"), py::kw_only(), py::arg("profile") = "");
+
+  mod.def(
+      "storePointerToStateData",
+      [](const std::string &name, const std::string &hash, py::buffer data,
+         simulation_precision precision) {
+        auto ptr = data.request().ptr;
+        stateStorage->insert({hash, PyStateVectorData{ptr, precision, name}});
+      },
+      "Store qalloc state initialization array data.");
+
+  mod.def(
+      "deletePointersToStateData",
+      [](const std::vector<std::string> &hashes) {
+        for (auto iter = stateStorage->cbegin(); iter != stateStorage->end();) {
+          if (std::find(hashes.begin(), hashes.end(), iter->first) !=
+              hashes.end()) {
+            stateStorage->erase(iter++);
+            continue;
+          }
+          iter++;
+        }
+      },
+      "Remove our pointers to the qalloc array data.");
 }
 } // namespace cudaq
\ No newline at end of file
diff --git a/python/runtime/cudaq/target/py_runtime_target.cpp b/python/runtime/cudaq/target/py_runtime_target.cpp
index fd36dbc157..320ef516fa 100644
--- a/python/runtime/cudaq/target/py_runtime_target.cpp
+++ b/python/runtime/cudaq/target/py_runtime_target.cpp
@@ -17,6 +17,10 @@ namespace cudaq {
 
 void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder) {
+  py::enum_<simulation_precision>(mod, "SimulationPrecision")
+      .value("fp32", simulation_precision::fp32)
+      .value("fp64", simulation_precision::fp64);
+
   py::class_<cudaq::RuntimeTarget>(
       mod, "Target",
       "The `cudaq.Target` represents the underlying infrastructure that CUDA "
@@ -40,13 +44,17 @@ void bindRuntimeTarget(py::module &mod, LinkedLibraryHolder &holder) {
       .def("is_emulated", &cudaq::RuntimeTarget::is_emulated,
            "Returns true if the emulation mode for the target has been "
           "activated.")
+      .def("get_precision", &cudaq::RuntimeTarget::get_precision,
+           "Return the simulation precision for this target, either fp32 or "
+           "fp64.")
       .def(
           "__str__",
          [](cudaq::RuntimeTarget &self) {
-            return fmt::format("Target {}\n\tsimulator={}\n\tplatform={}"
-                               "\n\tdescription={}\n",
-                               self.name, self.simulatorName, self.platformName,
-                               self.description);
+            return fmt::format(
+                "Target {}\n\tsimulator={}\n\tplatform={}"
+                "\n\tdescription={}\n\tprecision={}\n",
+                self.name, self.simulatorName, self.platformName,
+                self.description,
+                self.get_precision() == simulation_precision::fp32 ? "fp32"
+                                                                   : "fp64");
          },
          "Persist the information in this `cudaq.Target` to a string.");
diff --git a/python/runtime/mlir/py_register_dialects.cpp b/python/runtime/mlir/py_register_dialects.cpp
index 0103f757ca..72301dff2c 100644
--- a/python/runtime/mlir/py_register_dialects.cpp
+++ b/python/runtime/mlir/py_register_dialects.cpp
@@ -12,9 +12,12 @@
 #include "cudaq/Optimizer/CAPI/Dialects.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/CodeGen/Pipelines.h"
+#include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
 #include "cudaq/Optimizer/Dialect/CC/CCTypes.h"
+#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
 #include "cudaq/Optimizer/Dialect/Quake/QuakeTypes.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/InitAllDialects.h"
 #include
 #include
 
@@ -232,5 +235,15 @@ void bindRegisterDialects(py::module &mod) {
         if (failed(builder.loadIntrinsic(unwrapped, name)))
           unwrapped.emitError("failed to load intrinsic " + name);
       });
+
+  mod.def("register_all_dialects", [](MlirContext context) {
+    DialectRegistry registry;
+    registry.insert<quake::QuakeDialect, cudaq::cc::CCDialect>();
+    cudaq::opt::registerCodeGenDialect(registry);
+    registerAllDialects(registry);
+    auto *mlirContext = unwrap(context);
+    mlirContext->appendDialectRegistry(registry);
+    mlirContext->loadAllAvailableDialects();
+  });
 }
 } // namespace cudaq
\ No newline at end of file
diff --git a/python/tests/builder/test_kernel_builder.py b/python/tests/builder/test_kernel_builder.py
index bf13e97b5e..2ba0313049 100644
--- a/python/tests/builder/test_kernel_builder.py
+++ b/python/tests/builder/test_kernel_builder.py
@@ -876,42 +876,139 @@ def test_recursive_calls():
     print(kernel3)
 
 
-## [SKIP_TEST]
-@pytest.mark.skip(
-    reason="AttributeError: module 'cudaq' has no attribute 'from_state'")
-def test_from_state():
+def can_set_target(name):
+    target_installed = True
+    try:
+        cudaq.set_target(name)
+    except RuntimeError:
+        target_installed = False
+    return target_installed
+
+
+skipIfNvidiaFP64NotInstalled = pytest.mark.skipif(
+    not can_set_target('nvidia-fp64'),
+    reason='Could not find nvidia-fp64 in installation')
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_from_state0():
     cudaq.reset_target()
-    state = np.asarray([.70710678, 0., 0., 0.70710678])
-    kernel = cudaq.make_kernel()
-    qubits = kernel.qalloc(2)
+    cudaq.set_target('nvidia-fp64')
 
-    cudaq.from_state(kernel, qubits, state)
+    kernel, initState = cudaq.make_kernel(list[complex])
+    qubits = kernel.qalloc(initState)
 
-    print(kernel)
+    # Test float64 list, casts to complex
+    state = [.70710678, 0., 0., 0.70710678]
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    # Test complex list
+    state = [.70710678j, 0., 0., 0.70710678]
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    # Test Numpy array
+    state = np.asarray([.70710678, 0., 0., 0.70710678])
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    # Now test constant array data, not kernel input
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=complex)
+    kernel = cudaq.make_kernel()
+    qubits = kernel.qalloc(state)
     counts = cudaq.sample(kernel)
     print(counts)
     assert '11' in counts
     assert '00' in counts
 
-    kernel = cudaq.from_state(state)
+    state = [.70710678 + 0j, 0., 0., 0.70710678]
+    kernel = cudaq.make_kernel()
+    qubits = kernel.qalloc(state)
     counts = cudaq.sample(kernel)
     print(counts)
     assert '11' in counts
     assert '00' in counts
 
-    hamiltonian = 5.907 - 2.1433 * spin.x(0) * spin.x(1) - 2.1433 * spin.y(
-        0) * spin.y(1) + .21829 * spin.z(0) - 6.125 * spin.z(1)
-    state = np.asarray([0., .292786, .956178, 0.])
+    state = np.array([.70710678, 0., 0., 0.70710678])
     kernel = cudaq.make_kernel()
-    qubits = kernel.qalloc(2)
-    cudaq.from_state(kernel, qubits, state)
-    energy = cudaq.observe(kernel, hamiltonian).expectation()
-    assert np.isclose(-1.748, energy, 1e-3)
+    with pytest.raises(RuntimeError) as e:
+        # float data and not complex data
+        qubits = kernel.qalloc(state)
+
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=np.complex64)
+    kernel = cudaq.make_kernel()
+    with pytest.raises(RuntimeError) as e:
+        # Wrong precision for fp64 simulator
+        qubits = kernel.qalloc(state)
+
+    with pytest.raises(RuntimeError) as e:
+        qubits = kernel.qalloc(np.array([1., 0., 0.], dtype=complex))
+
+
+skipIfNvidiaNotInstalled = pytest.mark.skipif(
+    not can_set_target('nvidia'),
+    reason='Could not find nvidia in installation')
+
+
+@skipIfNvidiaNotInstalled
+def test_from_state1():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=np.complex128)
     kernel = cudaq.make_kernel()
+    with pytest.raises(RuntimeError) as e:
+        qubits = kernel.qalloc(state)
+
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=np.complex64)
+    kernel2 = cudaq.make_kernel()
+    qubits = kernel2.qalloc(state)
+    counts = cudaq.sample(kernel2)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
 
-    ss = cudaq.get_state(kernel)
-    for i in range(4):
-        assert np.isclose(ss[i], state[i], 1e-3)
+    cudaq.reset_target()
+
+    # Regardless of the target precision, use
+    # cudaq.simulation_dtype() or cudaq.create_state()
+    state = np.array([.70710678, 0., 0., 0.70710678],
+                     dtype=cudaq.simulation_dtype())
+    kernel2 = cudaq.make_kernel()
+    qubits = kernel2.qalloc(state)
+    counts = cudaq.sample(kernel2)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    state = cudaq.create_state([.70710678, 0., 0., 0.70710678])
+    kernel2 = cudaq.make_kernel()
+    qubits = kernel2.qalloc(state)
+    counts = cudaq.sample(kernel2)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+    state = cudaq.create_state(np.array([.5] * 4))
+    kernel2 = cudaq.make_kernel()
+    qubits = kernel2.qalloc(state)
+    counts = cudaq.sample(kernel2)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+    assert '01' in counts
+    assert '10' in counts
+
+    kernel, initState = cudaq.make_kernel(list[np.complex64])
+    qubits = kernel.qalloc(initState)
+    state = cudaq.create_state([.70710678, 0., 0., 0.70710678])
+    counts = cudaq.sample(kernel, state)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
 
 
 @skipIfPythonLessThan39
@@ -925,12 +1022,13 @@ def test_pauli_word_input():
         1, 3, 3, -0.0454063, -0, 15
     ]
     h = cudaq.SpinOperator(h2_data, 4)
-    
+
     kernel, theta, paulis = cudaq.make_kernel(float, list[cudaq.pauli_word])
     q = kernel.qalloc(4)
     kernel.x(q[0])
     kernel.x(q[1])
-    kernel.for_loop(0, paulis.size(), lambda idx : kernel.exp_pauli(theta, q, paulis[idx]))
+    kernel.for_loop(0, paulis.size(),
+                    lambda idx: kernel.exp_pauli(theta, q, paulis[idx]))
 
     print(kernel)
     want_exp = cudaq.observe(kernel, h, .11, ['XXXY']).expectation()
diff --git a/python/tests/kernel/test_kernel_lists.py b/python/tests/kernel/test_kernel_lists.py
new file mode 100644
index 0000000000..f08f8190f4
--- /dev/null
+++ b/python/tests/kernel/test_kernel_lists.py
@@ -0,0 +1,87 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under    #
+# the terms of the Apache License 2.0 which accompanies this distribution.    #
+# ============================================================================ #
+
+import os, sys
+
+import pytest
+import numpy as np
+from typing import List
+
+import cudaq
+from cudaq import spin
+
+## [PYTHON_VERSION_FIX]
+skipIfPythonLessThan39 = pytest.mark.skipif(
+    sys.version_info < (3, 9),
+    reason="built-in collection types such as `list` not supported")
+
+
+def test_float_lists():
+    """Test that we can use float numbers inside kernel functions."""
+
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    f = [0., 1., 1., 0.]
+
+    # Pass list of float as a parameter
+    @cudaq.kernel
+    def test_float_vec_param(vec: list[float]):
+        f1 = vec
+
+    counts = cudaq.sample(test_float_vec_param, f)
+    assert len(counts) == 0
+
+    # Capture list of float
+    @cudaq.kernel
+    def test_float_vec_capture():
+        f1 = f
+
+    counts = cudaq.sample(test_float_vec_capture)
+    assert len(counts) == 0
+
+    # Define list of float inside kernel
+    @cudaq.kernel
+    def test_float_vec_definition():
+        f1 = [1.0, 0., 0., 1.]
+
+    counts = cudaq.sample(test_float_vec_definition)
+    assert len(counts) == 0
+
+
+def test_complex_lists():
+    """Test that we can use complex numbers inside kernel functions."""
+
+    # Pass list of complex as a parameter
+    c = [.70710678 + 0j, 0., 0., 0.70710678]
+
+    @cudaq.kernel
+    def test_complex_vec_param(vec: list[complex]):
+        f1 = vec
+
+    counts = cudaq.sample(test_complex_vec_param, c)
+    assert len(counts) == 0
+
+    # Capture list of complex
+    @cudaq.kernel
+    def test_complex_vec_capture():
+        f1 = c
+
+    counts = cudaq.sample(test_complex_vec_capture)
+    assert len(counts) == 0
+
+    # Define list of complex inside kernel
+    @cudaq.kernel
+    def test_complex_vec_definition():
+        f1 = [1.0 + 0j, 0., 0., 1.]
+
+
+    counts = cudaq.sample(test_complex_vec_definition)
+    assert len(counts) == 0
diff --git a/python/utils/LinkedLibraryHolder.cpp b/python/utils/LinkedLibraryHolder.cpp
index 9dacd8e57f..b7aff90760 100644
--- a/python/utils/LinkedLibraryHolder.cpp
+++ b/python/utils/LinkedLibraryHolder.cpp
@@ -29,6 +29,8 @@ constexpr static const char PLATFORM_LIBRARY[] = "PLATFORM_LIBRARY=";
 constexpr static const char NVQIR_SIMULATION_BACKEND[] =
     "NVQIR_SIMULATION_BACKEND=";
 constexpr static const char TARGET_DESCRIPTION[] = "TARGET_DESCRIPTION=";
+constexpr static const char IS_FP64_SIMULATION[] =
+    "CUDAQ_SIMULATION_SCALAR_FP64";
 
 /// @brief A utility function to check availability of Nvidia GPUs and return
 /// their count
@@ -75,6 +77,8 @@
 bool RuntimeTarget::is_emulated() { return platform.is_emulated(); }
 
+simulation_precision RuntimeTarget::get_precision() { return precision; }
+
 /// @brief Search the targets folder in the install for available targets.
 void findAvailableTargets(
     const std::filesystem::path &targetPath,
@@ -89,6 +93,7 @@ void findAvailableTargets(
     if (path.extension().string() == ".config") {
       bool isSimulationTarget = false;
       // Extract the target name from the file name
+      simulation_precision precision = simulation_precision::fp32;
       auto fileName = path.filename().string();
       auto targetName = std::regex_replace(fileName, std::regex(".config"), "");
       std::string platformName = "default", simulatorName = "qpp",
@@ -124,6 +129,8 @@ void findAvailableTargets(
           description.erase(
               std::remove(description.begin(), description.end(), '\"'),
               description.end());
+        } else if (line.find(IS_FP64_SIMULATION) != std::string::npos) {
+          precision = simulation_precision::fp64;
         }
       }
     }
@@ -131,14 +138,15 @@ void findAvailableTargets(
     cudaq::info("Found Target: {} -> (sim={}, platform={})", targetName,
                 simulatorName, platformName);
     // Add the target.
-    targets.emplace(targetName, RuntimeTarget{targetName, simulatorName,
-                                              platformName, description});
+    targets.emplace(targetName,
+                    RuntimeTarget{targetName, simulatorName, platformName,
+                                  description, precision});
     if (isSimulationTarget) {
       cudaq::info("Found Simulation target: {} -> (sim={}, platform={})",
                   targetName, simulatorName, platformName);
-      simulationTargets.emplace(targetName,
-                                RuntimeTarget{targetName, simulatorName,
-                                              platformName, description});
+      simulationTargets.emplace(
+          targetName, RuntimeTarget{targetName, simulatorName, platformName,
+                                    description, precision});
       isSimulationTarget = false;
     }
   }
diff --git a/python/utils/LinkedLibraryHolder.h b/python/utils/LinkedLibraryHolder.h
index 2e35ab7a2c..7477ca0c79 100644
--- a/python/utils/LinkedLibraryHolder.h
+++ b/python/utils/LinkedLibraryHolder.h
@@ -8,6 +8,7 @@
 #pragma once
 
+#include "cudaq/host_config.h"
 #include
 #include
 #include
@@ -30,11 +31,13 @@ struct RuntimeTarget {
   std::string simulatorName;
   std::string platformName;
   std::string description;
+  simulation_precision precision;
 
   /// @brief Return the number of QPUs this target exposes.
   std::size_t num_qpus();
   bool is_remote();
   bool is_emulated();
+  simulation_precision get_precision();
 };
 
 /// @brief The LinkedLibraryHolder provides a mechanism for
diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h
index 7bed619cb0..d1a6963088 100644
--- a/python/utils/OpaqueArguments.h
+++ b/python/utils/OpaqueArguments.h
@@ -323,17 +323,31 @@ packArgs(OpaqueArguments &argData, py::args args,
             return;
           })
           .Case([&](ComplexType type) {
-            genericVecAllocator.template operator()<std::complex<double>>(
-                [](py::handle element) -> std::complex<double> {
-                  if (!py::hasattr(element, "real"))
-                    throw std::runtime_error(
-                        "invalid complex element type");
-                  if (!py::hasattr(element, "imag"))
-                    throw std::runtime_error(
-                        "invalid complex element type");
-                  return {PyFloat_AsDouble(element.attr("real").ptr()),
-                          PyFloat_AsDouble(element.attr("imag").ptr())};
-                });
+            if (isa<Float64Type>(type.getElementType())) {
+              genericVecAllocator.template operator()<std::complex<double>>(
+                  [](py::handle element) -> std::complex<double> {
+                    if (!py::hasattr(element, "real"))
+                      throw std::runtime_error(
+                          "invalid complex element type");
+                    if (!py::hasattr(element, "imag"))
+                      throw std::runtime_error(
+                          "invalid complex element type");
+                    return {PyFloat_AsDouble(element.attr("real").ptr()),
+                            PyFloat_AsDouble(element.attr("imag").ptr())};
+                  });
+            } else {
+              genericVecAllocator.template operator()<std::complex<float>>(
+                  [](py::handle element) -> std::complex<float> {
+                    if (!py::hasattr(element, "real"))
+                      throw std::runtime_error(
+                          "invalid complex element type");
+                    if (!py::hasattr(element, "imag"))
+                      throw std::runtime_error(
+                          "invalid complex element type");
+                    return {element.attr("real").cast<float>(),
+                            element.attr("imag").cast<float>()};
+                  });
+            }
             return;
           })
           .Default([](Type ty) {
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 6998b3e4aa..ddfcbe5130 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -23,4 +23,4 @@ install (FILES nvqir/CircuitSimulator.h nvqir/QIRTypes.h nvqir/Gates.h
          DESTINATION include/nvqir)
 
-install (FILES cudaq.h host_config.h DESTINATION include)
+install (FILES cudaq.h DESTINATION include)
diff --git a/runtime/common/MeasureCounts.cpp b/runtime/common/MeasureCounts.cpp
index da456a6c43..d99d965546 100644
--- a/runtime/common/MeasureCounts.cpp
+++ b/runtime/common/MeasureCounts.cpp
@@ -455,7 +455,7 @@ void sample_result::clear() {
   totalShots = 0;
 }
 
-void sample_result::dump(std::ostream &os) {
+void sample_result::dump(std::ostream &os) const {
   os << "{ ";
   if (sampleResults.size() > 1) {
     os << "\n  ";
@@ -492,7 +492,7 @@ void sample_result::dump(std::ostream &os) {
   os << "}\n";
 }
 
-void sample_result::dump() { dump(std::cout); }
+void sample_result::dump() const { dump(std::cout); }
 
 bool sample_result::has_even_parity(std::string_view bitString) {
   int c = std::count(bitString.begin(), bitString.end(), '1');
diff --git a/runtime/common/MeasureCounts.h b/runtime/common/MeasureCounts.h
index 5bafa2541a..ee028e37c7 100644
--- a/runtime/common/MeasureCounts.h
+++ b/runtime/common/MeasureCounts.h
@@ -215,10 +215,10 @@ class sample_result {
   size(const std::string_view registerName = GlobalRegisterName) noexcept;
 
   /// @brief Dump this sample_result to standard out.
-  void dump();
+  void dump() const;
 
   /// @brief Dump this sample_result to the given output stream
-  void dump(std::ostream &os);
+  void dump(std::ostream &os) const;
 
   /// @brief Clear this sample_result.
   void clear();
diff --git a/runtime/common/RuntimeMLIR.cpp b/runtime/common/RuntimeMLIR.cpp
index 0b3f452137..c8f0d27117 100644
--- a/runtime/common/RuntimeMLIR.cpp
+++ b/runtime/common/RuntimeMLIR.cpp
@@ -87,9 +87,9 @@ std::unique_ptr<MLIRContext> initializeMLIR() {
   }
 
   DialectRegistry registry;
-  registry.insert();
+  registry.insert<quake::QuakeDialect, cudaq::cc::CCDialect>();
+  cudaq::opt::registerCodeGenDialect(registry);
+  registerAllDialects(registry);
   auto context = std::make_unique<MLIRContext>(registry);
   context->loadAllAvailableDialects();
   registerLLVMDialectTranslation(*context);
diff --git a/runtime/cudaq.h b/runtime/cudaq.h
index 9e52db4747..fe290c9fd1 100644
--- a/runtime/cudaq.h
+++ b/runtime/cudaq.h
@@ -9,8 +9,8 @@
 #pragma once
 
 #include "common/NoiseModel.h"
+#include "cudaq/host_config.h"
 #include "cudaq/qis/qubit_qis.h"
-#include "host_config.h"
 #include
 #include
diff --git a/runtime/cudaq/algorithms/broadcast.h b/runtime/cudaq/algorithms/broadcast.h
index a3b6bd008e..ad0b480aa9 100644
--- a/runtime/cudaq/algorithms/broadcast.h
+++ b/runtime/cudaq/algorithms/broadcast.h
@@ -8,8 +8,8 @@
 #pragma once
 
+#include "cudaq/host_config.h"
 #include "cudaq/platform.h"
-#include "host_config.h"
 
 namespace cudaq {
diff --git a/runtime/cudaq/algorithms/observe.h b/runtime/cudaq/algorithms/observe.h
index 445a47c1e6..e12afa1772 100644
--- a/runtime/cudaq/algorithms/observe.h
+++ b/runtime/cudaq/algorithms/observe.h
@@ -13,8 +13,8 @@
 #include "common/ObserveResult.h"
 #include "cudaq/algorithms/broadcast.h"
 #include "cudaq/concepts.h"
+#include "cudaq/host_config.h"
 #include "cudaq/spin_op.h"
-#include "host_config.h"
 #include
 #if CUDAQ_USE_STD20
 #include
diff --git a/runtime/cudaq/algorithms/sample.h b/runtime/cudaq/algorithms/sample.h
index b11ef47f79..c1f2e4a6c8 100644
--- a/runtime/cudaq/algorithms/sample.h
+++ b/runtime/cudaq/algorithms/sample.h
@@ -13,7 +13,7 @@
 #include "common/MeasureCounts.h"
 #include "cudaq/algorithms/broadcast.h"
 #include "cudaq/concepts.h"
-#include "host_config.h"
+#include "cudaq/host_config.h"
 
 namespace cudaq {
 bool kernelHasConditionalFeedback(const std::string &);
diff --git a/runtime/cudaq/algorithms/state.h b/runtime/cudaq/algorithms/state.h
index 55cb96a3d6..e2b0a27d4a 100644
--- a/runtime/cudaq/algorithms/state.h
+++ b/runtime/cudaq/algorithms/state.h
@@ -11,9 +11,9 @@
 #include "common/ExecutionContext.h"
 #include "common/KernelWrapper.h"
 #include "cudaq/concepts.h"
+#include "cudaq/host_config.h"
 #include "cudaq/platform.h"
 #include "cudaq/platform/QuantumExecutionQueue.h"
-#include "host_config.h"
 #include
 #include
diff --git a/runtime/cudaq/builder/QuakeValue.h b/runtime/cudaq/builder/QuakeValue.h
index 5527037623..c406a97232 100644
--- a/runtime/cudaq/builder/QuakeValue.h
+++ b/runtime/cudaq/builder/QuakeValue.h
@@ -8,7 +8,7 @@
 #pragma once
 
-#include "host_config.h"
+#include "cudaq/host_config.h"
 #include
 #include
 #include
diff --git a/runtime/cudaq/builder/kernel_builder.cpp b/runtime/cudaq/builder/kernel_builder.cpp
index d241a7742b..ccf4f05d65 100644
--- a/runtime/cudaq/builder/kernel_builder.cpp
+++ b/runtime/cudaq/builder/kernel_builder.cpp
@@ -9,6 +9,7 @@
 #include "kernel_builder.h"
 #include "common/Logger.h"
 #include "common/RuntimeMLIR.h"
+#include "cudaq/Optimizer/Builder/Intrinsics.h"
 #include "cudaq/Optimizer/Builder/Runtime.h"
 #include "cudaq/Optimizer/CodeGen/Passes.h"
 #include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
@@ -19,6 +20,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/ExecutionEngine/ExecutionEngine.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinOps.h" @@ -45,57 +47,75 @@ namespace cudaq::details { /// @brief Track unique measurement register names. static std::size_t regCounter = 0; -KernelBuilderType mapArgToType(double &e) { +KernelBuilderType convertArgumentTypeToMLIR(double &e) { return KernelBuilderType( [](MLIRContext *ctx) { return Float64Type::get(ctx); }); } -KernelBuilderType mapArgToType(float &e) { +KernelBuilderType convertArgumentTypeToMLIR(float &e) { return KernelBuilderType( [](MLIRContext *ctx) { return Float32Type::get(ctx); }); } -KernelBuilderType mapArgToType(int &e) { +KernelBuilderType convertArgumentTypeToMLIR(int &e) { return KernelBuilderType( [](MLIRContext *ctx) { return IntegerType::get(ctx, 32); }); } -KernelBuilderType mapArgToType(std::vector &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e) { return KernelBuilderType([](MLIRContext *ctx) { return cudaq::cc::StdvecType::get(ctx, Float64Type::get(ctx)); }); } -KernelBuilderType mapArgToType(std::size_t &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::size_t &e) { return KernelBuilderType( [](MLIRContext *ctx) { return IntegerType::get(ctx, 64); }); } -KernelBuilderType mapArgToType(std::vector &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e) { return KernelBuilderType([](MLIRContext *ctx) { return cudaq::cc::StdvecType::get(ctx, mlir::IntegerType::get(ctx, 32)); }); } -KernelBuilderType mapArgToType(std::vector &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e) { return KernelBuilderType([](MLIRContext *ctx) { return cudaq::cc::StdvecType::get(ctx, mlir::IntegerType::get(ctx, 64)); }); } /// Map a std::vector to a KernelBuilderType -KernelBuilderType mapArgToType(std::vector &e) { +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e) { return KernelBuilderType([](MLIRContext *ctx) { return cudaq::cc::StdvecType::get(ctx, Float32Type::get(ctx)); }); } -KernelBuilderType mapArgToType(cudaq::qubit &e) { +/// Map a std::vector> to a KernelBuilderType +KernelBuilderType +convertArgumentTypeToMLIR(std::vector> &e) { + return KernelBuilderType([](MLIRContext *ctx) { + return cudaq::cc::StdvecType::get(ctx, + ComplexType::get(Float64Type::get(ctx))); + }); +} + +/// Map a std::vector> to a KernelBuilderType +KernelBuilderType +convertArgumentTypeToMLIR(std::vector> &e) { + return KernelBuilderType([](MLIRContext *ctx) { + return cudaq::cc::StdvecType::get(ctx, + ComplexType::get(Float32Type::get(ctx))); + }); +} + +KernelBuilderType convertArgumentTypeToMLIR(cudaq::qubit &e) { return KernelBuilderType( [](MLIRContext *ctx) { return quake::RefType::get(ctx); }); } -KernelBuilderType mapArgToType(cudaq::qvector<> &e) { +KernelBuilderType convertArgumentTypeToMLIR(cudaq::qvector<> &e) { return KernelBuilderType( [](MLIRContext *ctx) { return quake::VeqType::getUnsized(ctx); }); } @@ -144,9 +164,8 @@ initializeBuilder(MLIRContext *context, cudaq::info("kernel_builder has {} arguments", arguments.size()); - // Every Kernel should have a ReturnOp terminator, - // then we'll set the insertion point to right - // before it. + // Every Kernel should have a ReturnOp terminator, then we'll set the + // insertion point to right before it. 
opBuilder->setInsertionPointToStart(entryBlock); auto terminator = opBuilder->create(); opBuilder->setInsertionPoint(terminator); @@ -192,10 +211,9 @@ void exp_pauli(ImplicitLocOpBuilder &builder, const QuakeValue &theta, } /// @brief Search the given `FuncOp` for all `CallOps` recursively. -/// If found, see if the called function is in the current `ModuleOp` -/// for this `kernel_builder`, if so do nothing. If it is not found, -/// then find it in the other `ModuleOp`, clone it, and add it to this -/// `ModuleOp`. +/// If found, see if the called function is in the current `ModuleOp` for this +/// `kernel_builder`; if so, do nothing. If it is not found, then find it in +/// the other `ModuleOp`, clone it, and add it to this `ModuleOp`. void addAllCalledFunctionRecursively( func::FuncOp &function, ModuleOp &currentModule, mlir::OwningOpRef &otherModule) { @@ -244,10 +262,10 @@ void addAllCalledFunctionRecursively( visitAllCallOps(function); } -/// @brief Get a the function with the given name. First look in the -/// current `ModuleOp` for this `kernel_builder`, if found return it as is. If -/// not found, find it in the other `kernel_builder` `ModuleOp` and return a -/// clone of it. Throw an exception if no kernel with the given name is found +/// @brief Get the function with the given name. First look in the current +/// `ModuleOp` for this `kernel_builder`; if found, return it as is. If not +/// found, find it in the other `kernel_builder` `ModuleOp` and return a clone +/// of it. Throw an exception if no kernel with the given name is found. func::FuncOp cloneOrGetFunction(StringRef name, ModuleOp &currentModule, mlir::OwningOpRef &otherModule) { @@ -276,8 +294,8 @@ void call(ImplicitLocOpBuilder &builder, std::string &name, auto function = block->getParentOp(); auto currentModule = function->getParentOfType(); - // We need to clone the function we care about, we need - // any other functions it calls, so store it in a vector + // We need to clone the function we care about, as well as any other + // functions it calls, so store them in a vector std::vector functions; // Get the function with the kernel name we care about.
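// The clone-and-recurse scheme described above boils down to a short walk
// over the call graph. A minimal sketch in terms of the standard MLIR API
// (unlike the real implementation, it omits missing-callee error handling
// and the guard against revisiting functions):
void cloneCalleesInto(mlir::func::FuncOp fn, mlir::ModuleOp dest,
                      mlir::ModuleOp src) {
  fn.walk([&](mlir::func::CallOp call) {
    // Already present in this kernel_builder's module? Nothing to do.
    if (dest.lookupSymbol<mlir::func::FuncOp>(call.getCallee()))
      return;
    // Otherwise clone the callee out of the other module and recurse.
    auto cloned =
        src.lookupSymbol<mlir::func::FuncOp>(call.getCallee()).clone();
    dest.push_back(cloned);
    cloneCalleesInto(cloned, dest, src);
  });
}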
@@ -285,8 +303,8 @@ void call(ImplicitLocOpBuilder &builder, std::string &name, auto otherFuncCloned = cloneOrGetFunction(properName, currentModule, otherModule); - // We need to recursively find all CallOps and - // add their Callee FuncOps to the current Module + // We need to recursively find all CallOps and add their Callee FuncOps to the + // current Module addAllCalledFunctionRecursively(otherFuncCloned, currentModule, otherModule); // Map the QuakeValues to MLIR Values @@ -339,8 +357,8 @@ void applyControlOrAdjoint(ImplicitLocOpBuilder &builder, std::string &name, auto otherFuncCloned = cloneOrGetFunction(properName, currentModule, otherModule); - // We need to recursively find all CallOps and - // add their Callee FuncOps to the current Module + // We need to recursively find all CallOps and add their Callee FuncOps to the + // current Module addAllCalledFunctionRecursively(otherFuncCloned, currentModule, otherModule); SmallVector mlirValues; @@ -460,21 +478,147 @@ QuakeValue qalloc(ImplicitLocOpBuilder &builder, const std::size_t nQubits) { return QuakeValue(builder, qubits); } -QuakeValue qalloc(ImplicitLocOpBuilder &builder, QuakeValue &size) { +QuakeValue qalloc(ImplicitLocOpBuilder &builder, QuakeValue &sizeOrVec) { cudaq::info("kernel_builder allocating qubits from quake value"); - auto value = size.getValue(); + auto value = sizeOrVec.getValue(); auto type = value.getType(); + auto context = builder.getContext(); + + if (auto stdvecTy = dyn_cast(type)) { + // get the size + Value size = builder.create(builder.getI64Type(), value); + Value numQubits = builder.create(size); + auto veqTy = quake::VeqType::getUnsized(context); + // allocate the number of qubits we need + Value qubits = builder.create(veqTy, numQubits); + + auto ptrTy = cc::PointerType::get(stdvecTy.getElementType()); + Value initials = builder.create(ptrTy, value); + builder.create(veqTy, qubits, initials); + return QuakeValue(builder, qubits); + } + if (!type.isIntOrIndex()) throw std::runtime_error( "Invalid parameter passed to qalloc (must be integer type)."); - auto context = builder.getContext(); Value qubits = builder.create( quake::VeqType::getUnsized(context), value); return QuakeValue(builder, qubits); } +template +std::size_t getStateVectorLength(StateVectorStorage &stateVectorStorage, + std::int64_t index) { + if (index >= static_cast(stateVectorStorage.size())) + throw std::runtime_error("index to state initializer is out of range"); + if (!std::get> *>(stateVectorStorage[index])) + throw std::runtime_error("state vector cannot be null"); + auto length = + std::get> *>(stateVectorStorage[index]) + ->size(); + if (!std::has_single_bit(length)) + throw std::runtime_error("state initializer must be a power of 2"); + return std::countr_zero(length); +} + +template +std::complex *getStateVectorData(StateVectorStorage &stateVectorStorage, + std::intptr_t index) { + // This foregoes all the checks found in getStateVectorLength because these + // two functions are called in tandem, this one second. + return std::get> *>(stateVectorStorage[index]) + ->data(); +} + +extern "C" { +/// Runtime callback to get the log2(size) of a captured state vector. 
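// The callbacks defined just below return the qubit count, not the vector
// length: a state over n qubits must hold exactly 2^n amplitudes, so log2 of
// the length recovers n. A standalone C++20 illustration of that check:
#include <bit>       // std::has_single_bit, std::countr_zero
#include <cstddef>
#include <stdexcept>

std::size_t qubitCountForLength(std::size_t length) {
  if (!std::has_single_bit(length))
    throw std::runtime_error("state initializer must be a power of 2");
  return std::countr_zero(length); // e.g. 8 amplitudes -> 3 qubits
}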
+std::size_t +__nvqpp_getStateVectorLength_fp64(StateVectorStorage &stateVectorStorage, + std::int64_t index) { + return getStateVectorLength(stateVectorStorage, index); +} + +std::size_t +__nvqpp_getStateVectorLength_fp32(StateVectorStorage &stateVectorStorage, + std::int64_t index) { + return getStateVectorLength(stateVectorStorage, index); +} + +/// Runtime callback to get the data array of a captured state vector. +std::complex * +__nvqpp_getStateVectorData_fp64(StateVectorStorage &stateVectorStorage, + std::intptr_t index) { + return getStateVectorData(stateVectorStorage, index); +} + +/// Runtime callback to get the data array of a captured state vector. +std::complex * +__nvqpp_getStateVectorData_fp32(StateVectorStorage &stateVectorStorage, + std::intptr_t index) { + return getStateVectorData(stateVectorStorage, index); +} +} + +QuakeValue qalloc(ImplicitLocOpBuilder &builder, + StateVectorStorage &stateVectorStorage, + StateVectorVariant &&state, simulation_precision precision) { + auto *context = builder.getContext(); + auto index = stateVectorStorage.size(); + stateVectorStorage.emplace_back(std::move(state)); + + // Deal with the single/double precision differences here. + const char *getLengthCallBack; + const char *getDataCallBack; + Type componentTy; + { + auto parentModule = + builder.getBlock()->getParentOp()->getParentOfType(); + IRBuilder irb(context); + if (precision == simulation_precision::fp64) { + getLengthCallBack = "__nvqpp_getStateVectorLength_fp64"; + getDataCallBack = "__nvqpp_getStateVectorData_fp64"; + componentTy = irb.getF64Type(); + } else { + getLengthCallBack = "__nvqpp_getStateVectorLength_fp32"; + getDataCallBack = "__nvqpp_getStateVectorData_fp32"; + componentTy = irb.getF32Type(); + } + if (failed(irb.loadIntrinsic(parentModule, getLengthCallBack)) || + failed(irb.loadIntrinsic(parentModule, getDataCallBack))) + throw std::runtime_error("loading callbacks should never fail"); + } + + static_assert(sizeof(std::intptr_t) * 8 == 64); + std::intptr_t vecStor = reinterpret_cast(&stateVectorStorage); + + auto vecPtr = builder.create(vecStor, 64); + auto idxOp = builder.create(index, 64); + + // Use callback to determine the size of the captured vector `state` at + // runtime. + auto i64Ty = builder.getI64Type(); + auto size = builder.create(i64Ty, getLengthCallBack, + ValueRange{vecPtr, idxOp}); + + // Allocate the qubits + Value qubits = builder.create( + quake::VeqType::getUnsized(context), size.getResult(0)); + + // Use callback to retrieve the data pointer of the captured vector `state` at + // runtime. 
+ auto complexTy = ComplexType::get(componentTy); + auto ptrComplexTy = cc::PointerType::get(complexTy); + auto dataPtr = builder.create(ptrComplexTy, getDataCallBack, + ValueRange{vecPtr, idxOp}); + + // Add the init_state op + qubits = builder.create(qubits.getType(), qubits, + dataPtr.getResult(0)); + return QuakeValue(builder, qubits); +} + QuakeValue constantVal(ImplicitLocOpBuilder &builder, double val) { llvm::APFloat d(val); Value constant = @@ -741,7 +885,8 @@ void tagEntryPoint(ImplicitLocOpBuilder &builder, ModuleOp &module, std::tuple jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, std::unordered_map &jitHash, - std::string kernelName, std::vector extraLibPaths) { + std::string kernelName, std::vector extraLibPaths, + StateVectorStorage &stateVectorStorage) { // Start off by getting the current ModuleOp auto *block = builder.getBlock(); @@ -757,10 +902,8 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, auto moduleHash = static_cast(hash); if (jit) { - // Have we added more instructions - // since the last time we jit the code? - // If so, we need to delete this JIT engine - // and create a new one. + // Have we added more instructions since the last time we jit the code? If + // so, we need to delete this JIT engine and create a new one. if (moduleHash == jitHash[jit]) return std::make_tuple(false, jit); else { @@ -799,9 +942,8 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); - // For some reason I get CFG ops from the LowerToCFGPass - // instead of the unrolled cc loop if I don't run - // the above manually. + // For some reason I get CFG ops from the LowerToCFGPass instead of the + // unrolled cc loop if I don't run the above manually. if (failed(pm.run(module))) throw std::runtime_error( "cudaq::builder failed to JIT compile the Quake representation."); @@ -810,10 +952,17 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader(/*genAsQuake=*/true)); pm.addPass(cudaq::opt::createGenerateKernelExecution()); optPM.addPass(cudaq::opt::createLowerToCFGPass()); - optPM.addPass(cudaq::opt::createCombineQuantumAllocations()); + // We want quantum allocations to stay where they are if + // we are simulating and have user-provided state vectors.
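// To recap the runtime contract behind those user-provided state vectors:
// the IR emitted by the qalloc overload above behaves like the following
// host-side sketch (hypothetical pseudocode, not code in the repository):
void sketchOfEmittedCode(cudaq::details::StateVectorStorage &storage) {
  // storage[0] points at the user's captured vector of amplitudes.
  std::size_t numQubits =
      __nvqpp_getStateVectorLength_fp64(storage, /*index=*/0);
  std::complex<double> *amps = __nvqpp_getStateVectorData_fp64(storage, 0);
  // ... allocate numQubits qubits, then hand `amps` to quake.init_state.
}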
+ // This check could probably be smarter, working in tandem with a synthesis + // strategy that rewrites init_state from the circuit synthesis result. + if (stateVectorStorage.empty()) + optPM.addPass(cudaq::opt::createCombineQuantumAllocations()); pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); pm.addPass(cudaq::opt::createConvertToQIRPass()); + pm.addPass(createCanonicalizerPass()); if (failed(pm.run(module))) throw std::runtime_error( @@ -861,8 +1010,8 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, cudaq::info("- JIT Engine created successfully."); - // Kernel names are __nvqpp__mlirgen__BuilderKernelPTRSTR - // for the following we want the proper name, BuilderKernelPTRST + // Kernel names are __nvqpp__mlirgen__BuilderKernelPTRSTR; for the following + // we want the proper name, BuilderKernelPTRSTR (see the sketch below). std::string properName = name(kernelName); // Need to first invoke the init_func() @@ -892,17 +1041,18 @@ jitCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, void invokeCode(ImplicitLocOpBuilder &builder, ExecutionEngine *jit, std::string kernelName, void **argsArray, - std::vector extraLibPaths) { + std::vector extraLibPaths, + StateVectorStorage &storage) { assert(jit != nullptr && "JIT ExecutionEngine was null."); cudaq::info("kernel_builder invoke kernel with args."); - // Kernel names are __nvqpp__mlirgen__BuilderKernelPTRSTR - // for the following we want the proper name, BuilderKernelPTRST + // Kernel names are __nvqpp__mlirgen__BuilderKernelPTRSTR; for the following + // we want the proper name, BuilderKernelPTRSTR. std::string properName = name(kernelName); - // Incoming Args... have been converted to void **, - // now we convert to void * altLaunchKernel args. + // Incoming Args... have been converted to void **; now we convert them to + // the void * altLaunchKernel args. auto argCreatorName = properName + ".argsCreator"; auto expectedPtr = jit->lookup(argCreatorName); if (!expectedPtr) { @@ -937,9 +1087,9 @@ std::string to_quake(ImplicitLocOpBuilder &builder) { // Strategy - we want to clone this ModuleOp because we have to // add a valid terminator (func.return), but it is not guaranteed that - // the programmer is done building up the kernel even though they've asked to - // look at the quake code. So we'll clone here, and add the return op (we have - // to or the print out string will be invalid (verifier failed)). + // the programmer is done building up the kernel even though they've asked + // to look at the quake code. So we'll clone here, and add the return op + // (we have to, or the printed string will be invalid: the verifier fails).
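// The name() helper referenced in the comments above strips that prefix. A
// standalone sketch of the string manipulation (the prefix constant comes
// from the comments above; the helper body here is illustrative only):
#include <string>

std::string properKernelName(const std::string &mangled) {
  const std::string prefix = "__nvqpp__mlirgen__";
  if (mangled.rfind(prefix, 0) == 0) // starts_with, pre-C++20 spelling
    return mangled.substr(prefix.size());
  return mangled;
}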
auto clonedModule = module.clone(); func::FuncOp unwrappedParentFunc = llvm::cast(parentFunc); @@ -960,4 +1110,9 @@ std::string to_quake(ImplicitLocOpBuilder &builder) { return printOut; } +std::ostream &operator<<(std::ostream &stream, + const kernel_builder_base &builder) { + return stream << builder.to_quake(); +} + } // namespace cudaq::details diff --git a/runtime/cudaq/builder/kernel_builder.h b/runtime/cudaq/builder/kernel_builder.h index f42e96d396..170f8571b0 100644 --- a/runtime/cudaq/builder/kernel_builder.h +++ b/runtime/cudaq/builder/kernel_builder.h @@ -9,10 +9,10 @@ #pragma once #include "cudaq/builder/QuakeValue.h" +#include "cudaq/host_config.h" #include "cudaq/qis/modifiers.h" #include "cudaq/qis/qvector.h" #include "cudaq/utils/cudaq_utils.h" -#include "host_config.h" #include #include #include @@ -62,13 +62,16 @@ concept KernelBuilderArgTypeIsValid = std::disjunction_v...>; // If you want to add to the list of valid kernel argument types first add it -// here, then add `details::mapArgToType()` function +// here, then add `details::convertArgumentTypeToMLIR()` function #define CUDAQ_VALID_BUILDER_ARGS_FOLD() \ - requires(KernelBuilderArgTypeIsValid< \ - Args, float, double, std::size_t, int, std::vector, \ - std::vector, std::vector, \ - std::vector, cudaq::qubit, cudaq::qvector<>> && \ - ...) + requires( \ + KernelBuilderArgTypeIsValid< \ + Args, float, double, std::size_t, int, std::vector, \ + std::vector, std::vector, std::vector, \ + std::vector>, std::vector>, \ + std::vector, cudaq::qubit, \ + cudaq::qvector<>> && \ + ...) #else // Not C++ 2020: stub these out. #define QuakeValueOrNumericType typename @@ -76,6 +79,15 @@ concept KernelBuilderArgTypeIsValid = #endif namespace details { +/// Use parametric type: `initializations` must be vectors of complex float or +/// double. No other type is allowed. +using StateVectorVariant = std::variant> *, + std::vector> *>; + +/// Type describing user-provided state vector data. This is a list of the state +/// vector variables used in a kernel with at least one `qvector` with initial +/// state. 
+using StateVectorStorage = std::vector; // Define a `mlir::Type` generator in the `cudaq` namespace, this helps us keep // MLIR out of this public header @@ -98,34 +110,42 @@ class KernelBuilderType { }; /// Map a `double` to a `KernelBuilderType` -KernelBuilderType mapArgToType(double &e); +KernelBuilderType convertArgumentTypeToMLIR(double &e); /// Map a `float` to a `KernelBuilderType` -KernelBuilderType mapArgToType(float &e); +KernelBuilderType convertArgumentTypeToMLIR(float &e); /// Map a `int` to a `KernelBuilderType` -KernelBuilderType mapArgToType(int &e); +KernelBuilderType convertArgumentTypeToMLIR(int &e); /// Map a `size_t` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::size_t &e); +KernelBuilderType convertArgumentTypeToMLIR(std::size_t &e); /// Map a `std::vector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::vector &e); +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e); /// Map a `std::vector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::vector &e); +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e); /// Map a `std::vector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::vector &e); +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e); /// Map a `vector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(std::vector &e); +KernelBuilderType convertArgumentTypeToMLIR(std::vector &e); + +/// Map a `vector>` to a `KernelBuilderType` +KernelBuilderType +convertArgumentTypeToMLIR(std::vector> &e); + +/// Map a `vector>` to a `KernelBuilderType` +KernelBuilderType +convertArgumentTypeToMLIR(std::vector> &e); /// Map a `qubit` to a `KernelBuilderType` -KernelBuilderType mapArgToType(cudaq::qubit &e); +KernelBuilderType convertArgumentTypeToMLIR(cudaq::qubit &e); /// @brief Map a `qvector` to a `KernelBuilderType` -KernelBuilderType mapArgToType(cudaq::qvector<> &e); +KernelBuilderType convertArgumentTypeToMLIR(cudaq::qvector<> &e); /// @brief Initialize the `MLIRContext`, return the raw pointer which we'll wrap /// in an `unique_ptr`. @@ -159,6 +179,11 @@ QuakeValue qalloc(mlir::ImplicitLocOpBuilder &builder, /// @brief Allocate a `qvector` from existing `QuakeValue` size QuakeValue qalloc(mlir::ImplicitLocOpBuilder &builder, QuakeValue &size); +/// @brief Allocate a `qvector` from a user provided state vector. +QuakeValue qalloc(mlir::ImplicitLocOpBuilder &builder, + StateVectorStorage &stateVectorData, + StateVectorVariant &&state, simulation_precision precision); + /// @brief Create a QuakeValue representing a constant floating-point number QuakeValue constantVal(mlir::ImplicitLocOpBuilder &builder, double val); @@ -221,12 +246,13 @@ void applyPasses(mlir::PassManager &); std::tuple jitCode(mlir::ImplicitLocOpBuilder &, mlir::ExecutionEngine *, std::unordered_map &, std::string, - std::vector); + std::vector, StateVectorStorage &); /// @brief Invoke the function with the given kernel name. void invokeCode(mlir::ImplicitLocOpBuilder &builder, mlir::ExecutionEngine *jit, std::string kernelName, void **argsArray, - std::vector extraLibPaths); + std::vector extraLibPaths, + StateVectorStorage &storage); /// @brief Invoke the provided kernel function. void call(mlir::ImplicitLocOpBuilder &builder, std::string &name, @@ -326,10 +352,7 @@ class kernel_builder_base { /// @brief Write the kernel_builder to the given output stream. This outputs /// the Quake representation. 
friend std::ostream &operator<<(std::ostream &stream, - const kernel_builder_base &builder) { - stream << builder.to_quake(); - return stream; - } + const kernel_builder_base &builder); }; } // namespace details @@ -387,9 +410,12 @@ class kernel_builder : public details::kernel_builder_base { return std::get(term); } + /// @brief Storage for any user-provided state-vector data. + details::StateVectorStorage stateVectorStorage; + public: - /// @brief The constructor, takes the input `KernelBuilderType`s which is used - /// to create the MLIR function type + /// @brief The constructor, takes the input `KernelBuilderType`s which is + /// used to create the MLIR function type kernel_builder(std::vector &types) : context(details::initializeContext(), details::deleteContext), opBuilder(nullptr, [](mlir::ImplicitLocOpBuilder *) {}), @@ -431,6 +457,28 @@ class kernel_builder : public details::kernel_builder_base { return details::qalloc(*opBuilder.get(), size); } + /// Return a `QuakeValue` representing the allocated quantum register, + /// initialized to the given state vector, \p state. + /// + /// Note: input argument is a \e true reference here, the calling context has + /// to own the data. Specifically, the builder object will capture variables + /// by reference (implemented as a container of pointers for simplicity) but + /// the builder does not create, own, or copy these vectors. This implies that + /// if the captured vector goes out of scope before the kernel is invoked, the + /// reference may contain garbage. This behavior is identical to a C++ lambda + /// capture by reference. + QuakeValue qalloc(std::vector> &state) { + return details::qalloc(*opBuilder.get(), stateVectorStorage, + details::StateVectorVariant{&state}, + simulation_precision::fp64); + } + // Overload for complex vector. + QuakeValue qalloc(std::vector> &state) { + return details::qalloc(*opBuilder.get(), stateVectorStorage, + details::StateVectorVariant{&state}, + simulation_precision::fp32); + } + /// @brief Return a `QuakeValue` representing the constant floating-point /// value. QuakeValue constantVal(double val) { @@ -794,7 +842,7 @@ class kernel_builder : public details::kernel_builder_base { void jitCode(std::vector extraLibPaths = {}) override { auto [wasChanged, ptr] = details::jitCode(*opBuilder, jitEngine.get(), jitEngineToModuleHash, - kernelName, extraLibPaths); + kernelName, extraLibPaths, stateVectorStorage); // If we had a jitEngine, but the code changed, delete the one we had. 
if (jitEngine && wasChanged) details::deleteJitEngine(jitEngine.release()); @@ -818,7 +866,7 @@ class kernel_builder : public details::kernel_builder_base { jitCode(extraLibPaths); } details::invokeCode(*opBuilder, jitEngine.get(), kernelName, argsArray, - extraLibPaths); + extraLibPaths, stateVectorStorage); } /// @brief The call operator for the kernel_builder, takes as input the @@ -882,7 +930,7 @@ CUDAQ_VALID_BUILDER_ARGS_FOLD() auto make_kernel() { std::vector types; cudaq::tuple_for_each(std::tuple(), [&](auto &&el) { - types.push_back(details::mapArgToType(el)); + types.push_back(details::convertArgumentTypeToMLIR(el)); }); return kernel_builder(types); } diff --git a/runtime/cudaq/concepts.h b/runtime/cudaq/concepts.h index 75f7420086..52d9379734 100644 --- a/runtime/cudaq/concepts.h +++ b/runtime/cudaq/concepts.h @@ -8,7 +8,7 @@ #pragma once -#include "host_config.h" +#include "cudaq/host_config.h" #include #if CUDAQ_USE_STD20 diff --git a/runtime/cudaq/host_config.h b/runtime/cudaq/host_config.h new file mode 100644 index 0000000000..ccba759ebe --- /dev/null +++ b/runtime/cudaq/host_config.h @@ -0,0 +1,36 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include + +#define CUDAQ_USE_STD20 (__cplusplus >= 202002L) +#define CUDAQ_APPLE_CLANG (defined(__apple_build_version__)) + +namespace cudaq { + +/// @brief Define an enumeration of possible simulation +/// floating point precision types. +enum class simulation_precision { fp32, fp64 }; + +#if defined(CUDAQ_SIMULATION_SCALAR_FP64) && \ + defined(CUDAQ_SIMULATION_SCALAR_FP32) +#error "Simulation precision cannot be both double and float" +#elif defined(CUDAQ_SIMULATION_SCALAR_FP32) +using simulation_scalar = std::complex; +#elif defined(CUDAQ_SIMULATION_SCALAR_FP64) +using simulation_scalar = std::complex; +#else +// If neither precision is specified, assume double. +// Do NOT add the warning though as this fires hundreds of times during a build. 
+// #pragma message("Simulation precision is unspecified, assuming double") +using simulation_scalar = std::complex; +#endif + +} // namespace cudaq diff --git a/runtime/cudaq/platform/default/nvidia-fp64.config b/runtime/cudaq/platform/default/nvidia-fp64.config index 30bc83ebf6..b7fb8f4d49 100644 --- a/runtime/cudaq/platform/default/nvidia-fp64.config +++ b/runtime/cudaq/platform/default/nvidia-fp64.config @@ -8,6 +8,7 @@ msg="" +PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64" gpu_found=$(query_gpu) if ${gpu_found} && [ -f "${install_dir}/lib/libnvqir-custatevec-fp64.so" ]; then NVQIR_SIMULATION_BACKEND="custatevec-fp64" diff --git a/runtime/cudaq/platform/default/nvidia.config b/runtime/cudaq/platform/default/nvidia.config index c410061a46..8055293ba0 100644 --- a/runtime/cudaq/platform/default/nvidia.config +++ b/runtime/cudaq/platform/default/nvidia.config @@ -8,6 +8,7 @@ msg="" +PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP32" gpu_found=$(query_gpu) if ${gpu_found} && [ -f "${install_dir}/lib/libnvqir-custatevec-fp32.so" ]; then NVQIR_SIMULATION_BACKEND="custatevec-fp32" @@ -24,4 +25,4 @@ fi GPU_REQUIREMENTS="true" # This could be a string listing compatibility in the future -TARGET_DESCRIPTION="The NVIDIA Target provides a simulated QPU via single-GPU cuStateVec integration on FP32 types." \ No newline at end of file +TARGET_DESCRIPTION="The NVIDIA Target provides a simulated QPU via single-GPU cuStateVec integration on FP32 types." diff --git a/runtime/cudaq/qis/execution_manager.h b/runtime/cudaq/qis/execution_manager.h index 841ec1cdcf..1b5f2a9c87 100644 --- a/runtime/cudaq/qis/execution_manager.h +++ b/runtime/cudaq/qis/execution_manager.h @@ -9,6 +9,7 @@ #pragma once #include "common/QuditIdTracker.h" +#include "cudaq/host_config.h" #include "cudaq/spin_op.h" #include #include @@ -17,6 +18,7 @@ namespace cudaq { class ExecutionContext; using SpinMeasureResult = std::pair; +using complex = std::complex; /// A QuditInfo is a type encoding the number of \a levels and the \a id of the /// qudit to the ExecutionManager. @@ -111,6 +113,12 @@ class ExecutionManager { /// Reset the execution context virtual void resetExecutionContext() = 0; + /// @brief Initialize the state of the given qudits to the provided + /// state vector. + virtual void initializeState(const std::vector &targets, + const void *state, + simulation_precision precision) = 0; + /// Apply the quantum instruction with the given name, on the provided target /// qudits. Supports input of control qudits and rotational parameters. 
Can /// also optionally take a spin_op as input to affect a general Pauli diff --git a/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp b/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp index 1e6bfd9e51..bdc8980603 100644 --- a/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp +++ b/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp @@ -60,6 +60,15 @@ class DefaultExecutionManager : public cudaq::BasicExecutionManager { simulator()->allocateQubits(qudits.size()); } + void initializeState(const std::vector &targets, + const void *state, + cudaq::simulation_precision precision) override { + // Here we have qubits in requestedAllocations + // want to allocate and set state + simulator()->allocateQubits(requestedAllocations.size(), state, precision); + requestedAllocations.clear(); + } + void deallocateQudit(const cudaq::QuditInfo &q) override { // Before trying to deallocate, make sure the qudit hasn't diff --git a/runtime/cudaq/qis/managers/photonics/PhotonicsExecutionManager.cpp b/runtime/cudaq/qis/managers/photonics/PhotonicsExecutionManager.cpp index b537004df4..f623ab0c3a 100644 --- a/runtime/cudaq/qis/managers/photonics/PhotonicsExecutionManager.cpp +++ b/runtime/cudaq/qis/managers/photonics/PhotonicsExecutionManager.cpp @@ -55,6 +55,12 @@ class PhotonicsExecutionManager : public cudaq::BasicExecutionManager { allocateQudit(q); } + void initializeState(const std::vector &targets, + const void *state, + simulation_precision precision) override { + throw std::runtime_error("initializeState not implemented."); + } + /// @brief Qudit deallocation method void deallocateQudit(const cudaq::QuditInfo &q) override {} diff --git a/runtime/cudaq/qis/qarray.h b/runtime/cudaq/qis/qarray.h index babd7df6da..99d75ac0d8 100644 --- a/runtime/cudaq/qis/qarray.h +++ b/runtime/cudaq/qis/qarray.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qview.h" -#include "host_config.h" namespace cudaq { diff --git a/runtime/cudaq/qis/qreg.h b/runtime/cudaq/qis/qreg.h index f58ea46c93..5899a6c314 100644 --- a/runtime/cudaq/qis/qreg.h +++ b/runtime/cudaq/qis/qreg.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qspan.h" -#include "host_config.h" #if defined(__clang__) #pragma clang diagnostic push diff --git a/runtime/cudaq/qis/qspan.h b/runtime/cudaq/qis/qspan.h index 10e7175f8b..573cea66d8 100644 --- a/runtime/cudaq/qis/qspan.h +++ b/runtime/cudaq/qis/qspan.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qudit.h" -#include "host_config.h" #if CUDAQ_USE_STD20 #include #include diff --git a/runtime/cudaq/qis/qubit_qis.h b/runtime/cudaq/qis/qubit_qis.h index f79c4935c9..d6b9fe8610 100644 --- a/runtime/cudaq/qis/qubit_qis.h +++ b/runtime/cudaq/qis/qubit_qis.h @@ -9,13 +9,13 @@ #pragma once #include "common/MeasureCounts.h" +#include "cudaq/host_config.h" #include "cudaq/qis/modifiers.h" #include "cudaq/qis/pauli_word.h" #include "cudaq/qis/qarray.h" #include "cudaq/qis/qreg.h" #include "cudaq/qis/qvector.h" #include "cudaq/spin_op.h" -#include "host_config.h" #include #include diff --git a/runtime/cudaq/qis/qudit.h b/runtime/cudaq/qis/qudit.h index d750aa2a4d..527e55f570 100644 --- a/runtime/cudaq/qis/qudit.h +++ b/runtime/cudaq/qis/qudit.h @@ -10,7 +10,15 @@ #include "execution_manager.h" +using namespace std::complex_literals; + namespace cudaq { +using complex = std::complex; + +namespace ket { +inline static const std::vector zero{1. + 0i, 0. 
+ 0i}; +inline static const std::vector one{0. + 0i, 1. + 0i}; +} // namespace ket /// The qudit models a general d-level quantum system. /// This type is templated on the number of levels d. @@ -28,6 +36,30 @@ class qudit { public: /// Construct a qudit, will allocated a new unique index qudit() : idx(getExecutionManager()->getAvailableIndex(n_levels())) {} + qudit(const std::vector &state) : qudit() { + if (state.size() != Levels) + throw std::runtime_error( + "Invalid number of state vector elements for qudit allocation (" + + std::to_string(state.size()) + ")."); + + auto norm = + std::inner_product( + state.begin(), state.end(), state.begin(), + simulation_scalar{0., 0.}, [](auto a, auto b) { return a + b; }, + [](auto a, auto b) { return std::conj(a) * b; }) + .real(); + if (std::fabs(1.0 - norm) > 1e-4) + throw std::runtime_error("Invalid vector norm for qudit allocation."); + + // Perform the initialization + auto precision = std::is_same_v + ? simulation_precision::fp32 + : simulation_precision::fp64; + getExecutionManager()->initializeState({QuditInfo(n_levels(), idx)}, + state.data(), precision); + } + qudit(const std::initializer_list &list) + : qudit({list.begin(), list.end()}) {} // Qudits cannot be copied qudit(const qudit &q) = delete; diff --git a/runtime/cudaq/qis/qvector.h b/runtime/cudaq/qis/qvector.h index 4c3ac0f4c5..1e3638b098 100644 --- a/runtime/cudaq/qis/qvector.h +++ b/runtime/cudaq/qis/qvector.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qview.h" -#include "host_config.h" namespace cudaq { @@ -30,6 +30,43 @@ class qvector { /// @brief Construct a `qvector` with `size` qudits in the |0> state. qvector(std::size_t size) : qudits(size) {} + qvector(const std::vector &vector) + : qudits(std::log2(vector.size())) { + if (Levels == 2) { + auto numElements = std::log2(vector.size()); + if (std::floor(numElements) != numElements) + throw std::runtime_error( + "Invalid state vector passed to qvector initialization - number of " + "elements must be power of 2."); + } + + auto norm = + std::inner_product( + vector.begin(), vector.end(), vector.begin(), + simulation_scalar{0., 0.}, [](auto a, auto b) { return a + b; }, + [](auto a, auto b) { return std::conj(a) * b; }) + .real(); + if (std::fabs(1.0 - norm) > 1e-4) + throw std::runtime_error("Invalid vector norm for qudit allocation."); + + std::vector targets; + for (auto &q : qudits) + targets.emplace_back(QuditInfo{Levels, q.id()}); + + auto precision = std::is_same_v + ? simulation_precision::fp32 + : simulation_precision::fp64; + getExecutionManager()->initializeState(targets, vector.data(), precision); + } + + // FIXME do we need float versions? 
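// Usage sketch for the state-initialized constructors above and below,
// mirroring the from_state.cpp test added later in this change; it assumes
// an FP64 simulator, so simulation_scalar is std::complex<double>:
__qpu__ void prepare() {
  // Single qubit initialized to |1> via the new ket constants.
  cudaq::qudit<2> q(cudaq::ket::one);
  // Two qubits initialized to (|00> + |11>)/sqrt(2).
  cudaq::qvector<2> v({cudaq::complex{M_SQRT1_2, 0.}, {0., 0.}, {0., 0.},
                       {M_SQRT1_2, 0.}});
}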
+ qvector(const std::vector &vector) + : qvector(std::vector{vector.begin(), vector.end()}) {} + qvector(const std::initializer_list &list) + : qvector(std::vector{list.begin(), list.end()}) {} + qvector(const std::initializer_list &list) + : qvector(std::vector{list.begin(), list.end()}) {} + /// @cond /// Nullary constructor /// meant to be used with `kernel_builder>` diff --git a/runtime/cudaq/qis/qview.h b/runtime/cudaq/qis/qview.h index e7b519881a..1b690a6118 100644 --- a/runtime/cudaq/qis/qview.h +++ b/runtime/cudaq/qis/qview.h @@ -8,8 +8,8 @@ #pragma once +#include "cudaq/host_config.h" #include "cudaq/qis/qudit.h" -#include "host_config.h" #if CUDAQ_USE_STD20 #include #include diff --git a/runtime/nvqir/CircuitSimulator.h b/runtime/nvqir/CircuitSimulator.h index 13d33384d6..10e2babb0d 100644 --- a/runtime/nvqir/CircuitSimulator.h +++ b/runtime/nvqir/CircuitSimulator.h @@ -14,7 +14,7 @@ #include "common/MeasureCounts.h" #include "common/NoiseModel.h" #include "common/Timing.h" - +#include "cudaq/host_config.h" #include #include #include @@ -191,7 +191,10 @@ class CircuitSimulator { virtual std::size_t allocateQubit() = 0; /// @brief Allocate `count` qubits. - virtual std::vector allocateQubits(const std::size_t count) = 0; + virtual std::vector + allocateQubits(std::size_t count, const void *state = nullptr, + cudaq::simulation_precision precision = + cudaq::simulation_precision::fp32) = 0; /// @brief Deallocate the qubit with give unique index virtual void deallocate(const std::size_t qubitIdx) = 0; @@ -594,7 +597,11 @@ class CircuitSimulatorBase : public CircuitSimulator { } /// @brief Add the given number of qubits to the state. - virtual void addQubitsToState(std::size_t count) { + virtual void addQubitsToState(std::size_t count, + const void *state = nullptr) { + if (state != nullptr) + throw std::runtime_error("State initialization must be handled by " + "subclasses, override addQubitsToState."); for (std::size_t i = 0; i < count; i++) addQubitToState(); } @@ -806,8 +813,26 @@ class CircuitSimulatorBase : public CircuitSimulator { } /// @brief Allocate `count` qubits. - std::vector allocateQubits(std::size_t count) override { - ScopedTraceWithContext("allocateQubits", count); + std::vector + allocateQubits(std::size_t count, const void *state = nullptr, + cudaq::simulation_precision precision = + cudaq::simulation_precision::fp32) override { + // Make sure if someone gives us state data, that the precision + // is correct for this simulation. + if (state != nullptr) { + if constexpr (std::is_same_v) { + if (precision == cudaq::simulation_precision::fp64) + throw std::runtime_error( + "Invalid user-provided state data. Simulator " + "is FP32 but state data is FP64."); + } else { + if (precision == cudaq::simulation_precision::fp32) + throw std::runtime_error( + "Invalid user-provided state data. 
Simulator " + "is FP64 but state data is FP32."); + } + } + std::vector qubits; for (std::size_t i = 0; i < count; i++) qubits.emplace_back(tracker.getNextIndex()); @@ -832,7 +857,7 @@ class CircuitSimulatorBase : public CircuitSimulator { stateDimension = calculateStateDim(nQubitsAllocated); // Tell the subtype to allocate more qubits - addQubitsToState(count); + addQubitsToState(count, state); // May be that the state grows enough that we // want to handle observation via sampling diff --git a/runtime/nvqir/NVQIR.cpp b/runtime/nvqir/NVQIR.cpp index 152c35fcdd..6ccadee59a 100644 --- a/runtime/nvqir/NVQIR.cpp +++ b/runtime/nvqir/NVQIR.cpp @@ -184,6 +184,25 @@ Array *__quantum__rt__qubit_allocate_array(uint64_t size) { return vectorSizetToArray(qubitIdxs); } +Array *__quantum__rt__qubit_allocate_array_with_state_fp64( + uint64_t size, std::complex *data) { + ScopedTraceWithContext("NVQIR::qubit_allocate_array_with_data_fp64", size); + __quantum__rt__initialize(0, nullptr); + auto qubitIdxs = nvqir::getCircuitSimulatorInternal()->allocateQubits( + size, data, cudaq::simulation_precision::fp64); + return vectorSizetToArray(qubitIdxs); +} + +Array * +__quantum__rt__qubit_allocate_array_with_state_fp32(uint64_t size, + std::complex *data) { + ScopedTraceWithContext("NVQIR::qubit_allocate_array_with_data_fp32", size); + __quantum__rt__initialize(0, nullptr); + auto qubitIdxs = nvqir::getCircuitSimulatorInternal()->allocateQubits( + size, data, cudaq::simulation_precision::fp32); + return vectorSizetToArray(qubitIdxs); +} + /// @brief Once done, release the QIR qubit array void __quantum__rt__qubit_release_array(Array *arr) { ScopedTraceWithContext("NVQIR::qubit_release_array", arr->size()); diff --git a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu index 040aa1dff1..143c154f4b 100644 --- a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu +++ b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu @@ -78,6 +78,73 @@ __global__ void setFirstNElements(T *sv, const T *__restrict__ sv2, int64_t N) { } } +// kronprod functions adapted from +// https://github.com/DmitryLyakh/TAL_SH/blob/3cefc2133a68b67c515f4b68a0ed9e3c66e4b4b2/tensor_algebra_gpu_nvidia.cu#L745 + +#define THRDS_ARRAY_PRODUCT 256 + +#pragma push +#pragma nv_diag_suppress 177 +__device__ __host__ cuDoubleComplex operator*(cuDoubleComplex a, + cuDoubleComplex b) { + return cuCmul(a, b); +} +__device__ __host__ cuDoubleComplex operator+(cuDoubleComplex a, + cuDoubleComplex b) { + return cuCadd(a, b); +} +__device__ __host__ cuFloatComplex operator*(cuFloatComplex a, + cuFloatComplex b) { + return cuCmulf(a, b); +} +__device__ __host__ cuFloatComplex operator+(cuFloatComplex a, + cuFloatComplex b) { + return cuCaddf(a, b); +} + +template +__global__ void kronprod(size_t tsize1, const T *arr1, size_t tsize2, + const T *arr2, T *arr0) { + __shared__ T lbuf[THRDS_ARRAY_PRODUCT + 1], rbuf[THRDS_ARRAY_PRODUCT]; + size_t _ib, _in, _jb, _jn, _tx, _jc, _ja; + + _tx = (size_t)threadIdx.x; + for (_jb = blockIdx.y * THRDS_ARRAY_PRODUCT; _jb < tsize2; + _jb += gridDim.y * THRDS_ARRAY_PRODUCT) { + if (_jb + THRDS_ARRAY_PRODUCT > tsize2) { + _jn = tsize2 - _jb; + } else { + _jn = THRDS_ARRAY_PRODUCT; + } + + if (_tx < _jn) + rbuf[_tx] = arr2[_jb + _tx]; + + for (_ib = blockIdx.x * THRDS_ARRAY_PRODUCT; _ib < tsize1; + _ib += gridDim.x * THRDS_ARRAY_PRODUCT) { + if (_ib + THRDS_ARRAY_PRODUCT > tsize1) { + _in = tsize1 - _ib; + } else { + _in = THRDS_ARRAY_PRODUCT; + } + + if (_tx < 
_in) + lbuf[_tx] = arr1[_ib + _tx]; + + __syncthreads(); + for (_jc = 0; _jc < _jn; _jc++) { + if (_tx < _in) { + _ja = (_jb + _jc) * tsize1 + (_ib + _tx); + arr0[_ja] = arr0[_ja] + lbuf[_tx] * rbuf[_jc]; + } + } + __syncthreads(); + } + } + return; +} +#pragma pop + /// @brief The CuStateVecCircuitSimulator implements the CircuitSimulator /// base class to provide a simulator that delegates to the NVIDIA CuStateVec /// GPU-accelerated library. @@ -196,40 +263,101 @@ protected: controls32.size())); } + /// @brief Nice utility function to have to print the state vector contents on + /// GPU. + void printStateFromGPU(const std::string &name, void *ptr, std::size_t size) { + std::vector> tmp(size); + cudaMemcpy(tmp.data(), ptr, size * sizeof(std::complex), + cudaMemcpyDeviceToHost); + for (auto &r : tmp) + printf("%s: (%.12lf, %.12lf)\n", name.c_str(), r.real(), r.imag()); + printf("\n"); + } + /// @brief Increase the state size by the given number of qubits. - void addQubitsToState(std::size_t count) override { - ScopedTraceWithContext("CuStateVecCircuitSimulator::addQubitsToState", count); + void addQubitsToState(std::size_t count, const void *stateIn) override { + ScopedTraceWithContext("CuStateVecCircuitSimulator::addQubitsToState", + count); if (count == 0) return; + // Cast the state, at this point an error would + // have been thrown if it is not of the right floating point type + std::complex *state = + reinterpret_cast *>( + const_cast(stateIn)); + int dev; HANDLE_CUDA_ERROR(cudaGetDevice(&dev)); cudaq::info("GPU {} Allocating new qubit array of size {}.", dev, count); + constexpr int32_t threads_per_block = 256; + uint32_t n_blocks = + (stateDimension + threads_per_block - 1) / threads_per_block; + + // Check if this is the first time to allocate, if so + // the allocation is much easier if (!deviceStateVector) { + // Create the memory and the handle HANDLE_CUDA_ERROR(cudaMalloc((void **)&deviceStateVector, stateDimension * sizeof(CudaDataType))); - constexpr int32_t threads_per_block = 256; - uint32_t n_blocks = - (stateDimension + threads_per_block - 1) / threads_per_block; - initializeDeviceStateVector<<>>( - reinterpret_cast(deviceStateVector), stateDimension); HANDLE_ERROR(custatevecCreate(&handle)); + + // If no state provided, initialize to the zero state + if (state == nullptr) { + initializeDeviceStateVector<<>>( + reinterpret_cast(deviceStateVector), + stateDimension); + return; + } + + // User state provided... + + // FIXME handle case where pointer is a device pointer + + // First allocation, so just set the user provided data here + HANDLE_CUDA_ERROR(cudaMemcpy(deviceStateVector, state, + stateDimension * sizeof(CudaDataType), + cudaMemcpyHostToDevice)); + return; + } + + // State already exists, need to allocate new state and compute + // kronecker product with existing state + + // Allocate new vector to place the kron prod result + void *newDeviceStateVector; + HANDLE_CUDA_ERROR(cudaMalloc((void **)&newDeviceStateVector, + stateDimension * sizeof(CudaDataType))); + + // Place the state data on device. Could be that + // we just need the zero state, or the user could have provided one + void *otherState; + HANDLE_CUDA_ERROR(cudaMalloc((void **)&otherState, + (1UL << count) * sizeof(CudaDataType))); + if (state == nullptr) { + initializeDeviceStateVector<<>>( + reinterpret_cast(otherState), (1UL << count)); } else { - // Allocate new state.. 
- void *newDeviceStateVector; - HANDLE_CUDA_ERROR(cudaMalloc((void **)&newDeviceStateVector, - stateDimension * sizeof(CudaDataType))); - constexpr int32_t threads_per_block = 256; - uint32_t n_blocks = - (stateDimension + threads_per_block - 1) / threads_per_block; - setFirstNElements<<>>( - reinterpret_cast(newDeviceStateVector), - reinterpret_cast(deviceStateVector), - previousStateDimension); - HANDLE_CUDA_ERROR(cudaFree(deviceStateVector)); - deviceStateVector = newDeviceStateVector; + + // FIXME Handle case where data is already on GPU + HANDLE_CUDA_ERROR(cudaMemcpy(otherState, state, + (1UL << count) * sizeof(CudaDataType), + cudaMemcpyHostToDevice)); } + + // Compute the kronecker product + kronprod<<>>( + previousStateDimension, + reinterpret_cast(deviceStateVector), (1UL << count), + reinterpret_cast(otherState), + reinterpret_cast(newDeviceStateVector)); + HANDLE_CUDA_ERROR(cudaGetLastError()); + + // Free the old vectors we don't need anymore. + HANDLE_CUDA_ERROR(cudaFree(deviceStateVector)); + HANDLE_CUDA_ERROR(cudaFree(otherState)); + deviceStateVector = newDeviceStateVector; } /// @brief Increase the state size by one qubit. @@ -343,9 +471,7 @@ public: } /// @brief Device synchronization - void synchronize() override { - HANDLE_CUDA_ERROR(cudaDeviceSynchronize()); - } + void synchronize() override { HANDLE_CUDA_ERROR(cudaDeviceSynchronize()); } /// @brief Measure operation /// @param qubitIdx @@ -638,9 +764,10 @@ public: return cudaq::State{{stateDimension}, {}}; std::vector> tmp(stateDimension); - HANDLE_CUDA_ERROR(cudaMemcpy(tmp.data(), deviceStateVector, - stateDimension * sizeof(std::complex), - cudaMemcpyDeviceToHost)); + HANDLE_CUDA_ERROR( + cudaMemcpy(tmp.data(), deviceStateVector, + stateDimension * sizeof(std::complex), + cudaMemcpyDeviceToHost)); if constexpr (std::is_same_v) { std::vector> data; diff --git a/runtime/nvqir/cutensornet/CMakeLists.txt b/runtime/nvqir/cutensornet/CMakeLists.txt index 23f10564af..b279895cd0 100644 --- a/runtime/nvqir/cutensornet/CMakeLists.txt +++ b/runtime/nvqir/cutensornet/CMakeLists.txt @@ -1,3 +1,11 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + # Find CUDA Toolkit for CUDA libs, e.g., cudart. 
find_package(CUDAToolkit REQUIRED) @@ -69,7 +77,7 @@ if (${CUTENSORNET_VERSION} VERSION_GREATER_EQUAL "2.3") target_include_directories(nvqir-${LIBRARY_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/runtime/common ${CMAKE_SOURCE_DIR}/runtime/nvqir ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUTENSORNET_INCLUDE_DIR}) target_link_libraries(nvqir-${LIBRARY_NAME} PRIVATE fmt::fmt-header-only cudaq cudaq-common ${CUTENSORNET_LIB} ${CUTENSOR_LIB} CUDA::cudart) install(TARGETS nvqir-${LIBRARY_NAME} DESTINATION lib) - file (WRITE ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config "NVQIR_SIMULATION_BACKEND=${LIBRARY_NAME}\nGPU_REQUIREMENTS=\"true\"\n") + file (WRITE ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config "NVQIR_SIMULATION_BACKEND=${LIBRARY_NAME}\nGPU_REQUIREMENTS=\"true\"\nPREPROCESSOR_DEFINES=\"\${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64\"\n") install(FILES ${CMAKE_BINARY_DIR}/targets/${LIBRARY_NAME}.config DESTINATION targets) endmacro() diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp index edd31034c2..934b94a991 100644 --- a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp +++ b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp @@ -5,6 +5,7 @@ * This source code and the accompanying materials are made available under * * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ + #include "simulator_cutensornet.h" #include "cudaq.h" #include "cutensornet.h" @@ -228,6 +229,7 @@ static nvqir::CutensornetExecutor *getPluginInstance() { cudaq::info("Successfully loaded the cutensornet plugin."); return fcn(); } + /// @brief Evaluate the expectation value of a given observable cudaq::observe_result SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) { @@ -271,7 +273,8 @@ cudaq::State SimulatorTensorNetBase::getStateData() { } nvqir::CircuitSimulator *SimulatorTensorNetBase::clone() { return nullptr; } -void SimulatorTensorNetBase::addQubitsToState(std::size_t count) { + +void SimulatorTensorNetBase::addQubitsToState(std::size_t count, const void *) { LOG_API_TIME(); if (!m_state) m_state = std::make_unique(count, m_cutnHandle); diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.h b/runtime/nvqir/cutensornet/simulator_cutensornet.h index f939534b67..60bce7df0a 100644 --- a/runtime/nvqir/cutensornet/simulator_cutensornet.h +++ b/runtime/nvqir/cutensornet/simulator_cutensornet.h @@ -55,7 +55,8 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase { virtual cudaq::observe_result observe(const cudaq::spin_op &op) override; /// @brief Add qubits to the underlying quantum state - virtual void addQubitsToState(std::size_t count) override; + virtual void addQubitsToState(std::size_t count, + const void *state = nullptr) override; /// @brief Return the state vector data virtual cudaq::State getStateData() override; diff --git a/runtime/nvqir/qpp/QppCircuitSimulator.cpp b/runtime/nvqir/qpp/QppCircuitSimulator.cpp index abe30c1db8..0116fa1cbd 100644 --- a/runtime/nvqir/qpp/QppCircuitSimulator.cpp +++ b/runtime/nvqir/qpp/QppCircuitSimulator.cpp @@ -88,24 +88,35 @@ class QppCircuitSimulator : public nvqir::CircuitSimulatorBase { /// @brief Override the default sized allocation of qubits /// here to be a bit more efficient than the default implementation - void addQubitsToState(std::size_t count) override { + void addQubitsToState(std::size_t count, + const void *stateDataIn = nullptr) override { if 
(count == 0) return; + auto *stateData = reinterpret_cast *>( + const_cast(stateDataIn)); + if (state.size() == 0) { // If this is the first time, allocate the state - state = qpp::ket::Zero(stateDimension); - state(0) = 1.0; + if (stateData == nullptr) { + state = qpp::ket::Zero(stateDimension); + state(0) = 1.0; + } else + state = qpp::ket::Map(stateData, stateDimension); return; } // If we are resizing an existing state, allocate // a zero state on the n new qubits, and Kron-prod // that with the existing state. - qpp::ket zero_state = qpp::ket::Zero((1UL << count)); - zero_state(0) = 1.0; - state = qpp::kron(zero_state, state); - + if (stateData == nullptr) { + qpp::ket zero_state = qpp::ket::Zero((1UL << count)); + zero_state(0) = 1.0; + state = qpp::kron(zero_state, state); + } else { + // A state over `count` qubits has 2^count amplitudes. + qpp::ket initState = qpp::ket::Map(stateData, 1UL << count); + state = qpp::kron(initState, state); + } return; } diff --git a/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp b/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp index ae4ad1b453..82a01f26a4 100644 --- a/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp +++ b/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp @@ -68,10 +68,14 @@ class QppNoiseCircuitSimulator : public nvqir::QppCircuitSimulator { /// @brief Grow the density matrix by one qubit. void addQubitToState() override { addQubitsToState(1); } - void addQubitsToState(std::size_t count) override { + void addQubitsToState(std::size_t count, + const void *data = nullptr) override { if (count == 0) return; + if (data != nullptr) + throw std::runtime_error("init state not implemented for dm sim"); + if (state.size() == 0) { // If this is the first time, allocate the state state = qpp::cmat::Zero(stateDimension, stateDimension); diff --git a/runtime/nvqir/qpp/density-matrix-cpu.config b/runtime/nvqir/qpp/density-matrix-cpu.config index 56d43d0801..2c74e3f321 100644 --- a/runtime/nvqir/qpp/density-matrix-cpu.config +++ b/runtime/nvqir/qpp/density-matrix-cpu.config @@ -7,4 +7,5 @@ # ============================================================================ # NVQIR_SIMULATION_BACKEND="dm" -TARGET_DESCRIPTION="The Density Matrix CPU Target provides a simulated QPU via OpenMP-enabled, CPU-only density matrix emulation." \ No newline at end of file +PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64" +TARGET_DESCRIPTION="The Density Matrix CPU Target provides a simulated QPU via OpenMP-enabled, CPU-only density matrix emulation." diff --git a/runtime/nvqir/qpp/qpp-cpu.config b/runtime/nvqir/qpp/qpp-cpu.config index b87951ac1d..d73401fb66 100644 --- a/runtime/nvqir/qpp/qpp-cpu.config +++ b/runtime/nvqir/qpp/qpp-cpu.config @@ -7,4 +7,5 @@ # ============================================================================ # NVQIR_SIMULATION_BACKEND="qpp" -TARGET_DESCRIPTION="QPP-based CPU-only backend target" \ No newline at end of file +PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64" +TARGET_DESCRIPTION="QPP-based CPU-only backend target" diff --git a/targettests/execution/from_state.cpp b/targettests/execution/from_state.cpp new file mode 100644 index 0000000000..388f38dcc4 --- /dev/null +++ b/targettests/execution/from_state.cpp @@ -0,0 +1,28 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution.
* + ******************************************************************************/ + +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std %s -o %t && %t | FileCheck %s + +#include + +__qpu__ void test(std::vector inState) { + cudaq::qvector q = inState; +} + +// CHECK: size 2 + +int main() { + std::vector vec{M_SQRT1_2, 0., 0., M_SQRT1_2}; + auto counts = cudaq::sample(test, vec); + counts.dump(); + + printf("size %zu\n", counts.size()); + + +} \ No newline at end of file diff --git a/test/AST-Quake/qalloc_initialization.cpp b/test/AST-Quake/qalloc_initialization.cpp new file mode 100644 index 0000000000..b96f5b518d --- /dev/null +++ b/test/AST-Quake/qalloc_initialization.cpp @@ -0,0 +1,408 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// clang-format off +// RUN: cudaq-quake -D CUDAQ_SIMULATION_SCALAR_FP64 %cpp_std %s | cudaq-opt | FileCheck %s +// RUN: cudaq-quake -D CUDAQ_SIMULATION_SCALAR_FP64 %cpp_std %s | cudaq-opt | cudaq-translate --convert-to=qir | FileCheck --check-prefix=QIR %s +// clang-format on + +// Test various flavors of qubits declared with initial state information. + +#include + +struct Vanilla { + std::vector operator()() __qpu__ { + cudaq::qvector v = {0., 1., 1., 0.}; + h(v); + return mz(v); + } +}; + +// clang-format off +// CHECK-LABEL: func.func @__nvqpp__mlirgen__Vanilla() -> !cc.stdvec attributes {"cudaq-entrypoint", "cudaq-kernel"} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK-DAG: %[[VAL_3:.*]] = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr> +// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2> +// CHECK: %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_3]] : (!quake.veq<2>, !cc.ptr>) -> !quake.veq +// clang-format on + +struct Cherry { + std::vector operator()() __qpu__ { + using namespace std::complex_literals; + cudaq::qvector v = std::initializer_list>{ + {0.0, 1.0}, {0.6, 0.4}, {1.0, 0.0}, {0.0, 0.0}}; + h(v); + return mz(v); + } +}; + +// clang-format off +// CHECK-LABEL: func.func @__nvqpp__mlirgen__Cherry() -> !cc.stdvec attributes {"cudaq-entrypoint", "cudaq-kernel"} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 4.000000e-01 : f64 +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 6.000000e-01 : f64 +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f64 +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1.000000e+00 : f64 +// CHECK: %[[VAL_7:.*]] = complex.create %[[VAL_5]], %[[VAL_6]] : complex +// CHECK: %[[VAL_8:.*]] = complex.create %[[VAL_4]], %[[VAL_3]] : complex +// CHECK: %[[VAL_9:.*]] = complex.create %[[VAL_6]], %[[VAL_5]] : complex +// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.array x 4> +// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr x 4>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr> +// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr x 4>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_8]], %[[VAL_12]] : !cc.ptr> +// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_10]][2] : (!cc.ptr x 4>>) -> !cc.ptr> +// CHECK: cc.store %[[VAL_9]], %[[VAL_13]] : !cc.ptr> +// CHECK: %[[VAL_14:.*]] = quake.alloca 
diff --git a/test/AST-Quake/qalloc_initialization.cpp b/test/AST-Quake/qalloc_initialization.cpp
new file mode 100644
index 0000000000..b96f5b518d
--- /dev/null
+++ b/test/AST-Quake/qalloc_initialization.cpp
@@ -0,0 +1,408 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// clang-format off
+// RUN: cudaq-quake -D CUDAQ_SIMULATION_SCALAR_FP64 %cpp_std %s | cudaq-opt | FileCheck %s
+// RUN: cudaq-quake -D CUDAQ_SIMULATION_SCALAR_FP64 %cpp_std %s | cudaq-opt | cudaq-translate --convert-to=qir | FileCheck --check-prefix=QIR %s
+// clang-format on
+
+// Test various flavors of qubits declared with initial state information.
+
+#include <cudaq.h>
+
+struct Vanilla {
+  std::vector<bool> operator()() __qpu__ {
+    cudaq::qvector v = {0., 1., 1., 0.};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__Vanilla() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[VAL_3:.*]] = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
+// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<2>
+// CHECK: %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_3]] : (!quake.veq<2>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<2>
+// clang-format on
+
+struct Cherry {
+  std::vector<bool> operator()() __qpu__ {
+    using namespace std::complex_literals;
+    cudaq::qvector v = std::initializer_list<std::complex<double>>{
+        {0.0, 1.0}, {0.6, 0.4}, {1.0, 0.0}, {0.0, 0.0}};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__Cherry() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 4.000000e-01 : f64
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 6.000000e-01 : f64
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK: %[[VAL_7:.*]] = complex.create %[[VAL_5]], %[[VAL_6]] : complex<f64>
+// CHECK: %[[VAL_8:.*]] = complex.create %[[VAL_4]], %[[VAL_3]] : complex<f64>
+// CHECK: %[[VAL_9:.*]] = complex.create %[[VAL_6]], %[[VAL_5]] : complex<f64>
+// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_8]], %[[VAL_12]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_10]][2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_9]], %[[VAL_13]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_14:.*]] = quake.alloca !quake.veq<2>
+// CHECK: %[[VAL_15:.*]] = quake.init_state %[[VAL_14]], %[[VAL_10]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f64> x 4>>) -> !quake.veq<2>
+// clang-format on
+
+struct MooseTracks {
+  std::vector<bool> operator()() __qpu__ {
+    using namespace std::complex_literals;
+    cudaq::qvector v = {
+        std::complex{0.0, 1.0}, std::complex{0.75, 0.25},
+        std::complex{1.0, 0.0}, std::complex{0.0, 0.0}};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__MooseTracks() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2.500000e-01 : f64
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 7.500000e-01 : f64
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK: %[[VAL_7:.*]] = complex.create %[[VAL_5]], %[[VAL_6]] : complex<f64>
+// CHECK: %[[VAL_8:.*]] = complex.create %[[VAL_4]], %[[VAL_3]] : complex<f64>
+// CHECK: %[[VAL_9:.*]] = complex.create %[[VAL_6]], %[[VAL_5]] : complex<f64>
+// CHECK: %[[VAL_10:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][0] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_7]], %[[VAL_11]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_10]][1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_8]], %[[VAL_12]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_10]][2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_9]], %[[VAL_13]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_14:.*]] = quake.alloca !quake.veq<2>
+// CHECK: %[[VAL_15:.*]] = quake.init_state %[[VAL_14]], %[[VAL_10]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f64> x 4>>) -> !quake.veq<2>
+// clang-format on
+
+struct RockyRoad {
+  std::vector<bool> operator()() __qpu__ {
+    using namespace std::complex_literals;
+    cudaq::qvector v = {0.0 + 1.0i, std::complex{0.8, 0.2}, 1.0 + 0.0i,
+                        std::complex{0.0, 0.0}};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__RockyRoad() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f{{[1280]+}}
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 2.000000e-01 : f64
+// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 8.000000e-01 : f64
+// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1.000000e+00 : f{{[1280]+}}
+// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK-DAG: %[[VAL_9:.*]] = cc.alloca f64
+// CHECK: cc.store %[[VAL_8]], %[[VAL_9]] : !cc.ptr<f64>
+// CHECK: %[[VAL_10:.*]] = call @_ZNSt8literals16complex_literalsli1iEe(%[[VAL_7]]) : (f{{[1280]+}}) -> complex<f64>
+// CHECK: %[[VAL_11:.*]] = cc.alloca complex<f64>
+// CHECK: cc.store %[[VAL_10]], %[[VAL_11]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_12:.*]] = call @_ZStplIdESt7complexIT_ERKS1_RKS2_(%[[VAL_9]], %[[VAL_11]]) : (!cc.ptr<f64>, !cc.ptr<complex<f64>>) -> complex<f64>
+// CHECK: %[[VAL_13:.*]] = complex.create %[[VAL_6]], %[[VAL_5]] : complex<f64>
+// CHECK: %[[VAL_14:.*]] = cc.alloca f64
+// CHECK: cc.store %[[VAL_4]], %[[VAL_14]] : !cc.ptr<f64>
+// CHECK: %[[VAL_15:.*]] = call @_ZNSt8literals16complex_literalsli1iEe(%[[VAL_3]]) : (f{{[1280]+}}) -> complex<f64>
+// CHECK: %[[VAL_16:.*]] = cc.alloca complex<f64>
+// CHECK: cc.store %[[VAL_15]], %[[VAL_16]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_17:.*]] = call @_ZStplIdESt7complexIT_ERKS1_RKS2_(%[[VAL_14]], %[[VAL_16]]) : (!cc.ptr<f64>, !cc.ptr<complex<f64>>) -> complex<f64>
+// CHECK: %[[VAL_18:.*]] = cc.alloca !cc.array<complex<f64> x 4>
+// CHECK: %[[VAL_19:.*]] = cc.compute_ptr %[[VAL_18]][0] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_12]], %[[VAL_19]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_18]][1] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_13]], %[[VAL_20]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_21:.*]] = cc.compute_ptr %[[VAL_18]][2] : (!cc.ptr<!cc.array<complex<f64> x 4>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_17]], %[[VAL_21]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_22:.*]] = quake.alloca !quake.veq<2>
+// CHECK: %[[VAL_23:.*]] = quake.init_state %[[VAL_22]], %[[VAL_18]] : (!quake.veq<2>, !cc.ptr<!cc.array<complex<f64> x 4>>) -> !quake.veq<2>
+// clang-format on
+
+std::vector<double> getTwoTimesRank();
+
+struct Pistachio {
+  bool operator()() __qpu__ {
+    cudaq::qvector v{getTwoTimesRank()};
+    h(v);
+    return mz(v[0]);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__Pistachio() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK: %[[VAL_2:.*]] = call @_Z15getTwoTimesRankv() : () -> !cc.stdvec<f64>
+// CHECK: %[[VAL_30:.*]] = cc.stdvec_size %[[VAL_2]] : (!cc.stdvec<f64>) -> i64
+// CHECK: %[[VAL_3:.*]] = math.cttz %[[VAL_30]] : i64
+// CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_2]] : (!cc.stdvec<f64>) -> !cc.ptr<f64>
+// CHECK: %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
+// CHECK: %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_4]] : (!quake.veq<?>, !cc.ptr<f64>) -> !quake.veq<?>
+// clang-format on
+
+struct ChocolateMint {
+  bool operator()() __qpu__ {
+    cudaq::qvector v = getTwoTimesRank();
+    h(v);
+    return mz(v[0]);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__ChocolateMint() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK: %[[VAL_2:.*]] = call @_Z15getTwoTimesRankv() : () -> !cc.stdvec<f64>
+// CHECK: %[[VAL_30:.*]] = cc.stdvec_size %[[VAL_2]] : (!cc.stdvec<f64>) -> i64
+// CHECK: %[[VAL_3:.*]] = math.cttz %[[VAL_30]] : i64
+// CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_2]] : (!cc.stdvec<f64>) -> !cc.ptr<f64>
+// CHECK: %[[VAL_5:.*]] = quake.alloca !quake.veq<?>[%[[VAL_3]] : i64]
+// CHECK: %[[VAL_6:.*]] = quake.init_state %[[VAL_5]], %[[VAL_4]] : (!quake.veq<?>, !cc.ptr<f64>) -> !quake.veq<?>
+// clang-format on
+
+std::vector<std::complex<double>> getComplexInit();
+
+struct Neapolitan {
+  std::vector<bool> operator()() __qpu__ {
+    cudaq::qvector v{getComplexInit()};
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__Neapolitan() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK: %[[VAL_3:.*]] = call @_Z14getComplexInitv() : () -> !cc.stdvec<complex<f64>>
+// CHECK: %[[VAL_30:.*]] = cc.stdvec_size %[[VAL_3]] : (!cc.stdvec<complex<f64>>) -> i64
+// CHECK: %[[VAL_4:.*]] = math.cttz %[[VAL_30]] : i64
+// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec<complex<f64>>) -> !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_6:.*]] = quake.alloca !quake.veq<?>[%[[VAL_4]] : i64]
+// CHECK: %[[VAL_7:.*]] = quake.init_state %[[VAL_6]], %[[VAL_5]] : (!quake.veq<?>, !cc.ptr<complex<f64>>) -> !quake.veq<?>
+// clang-format on
+
+struct ButterPecan {
+  std::vector<bool> operator()() __qpu__ {
+    cudaq::qvector v = getComplexInit();
+    h(v);
+    return mz(v);
+  }
+};
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__ButterPecan() -> !cc.stdvec<i1> attributes {"cudaq-entrypoint", "cudaq-kernel"} {
+// CHECK: %[[VAL_3:.*]] = call @_Z14getComplexInitv() : () -> !cc.stdvec<complex<f64>>
+// CHECK: %[[VAL_30:.*]] = cc.stdvec_size %[[VAL_3]] : (!cc.stdvec<complex<f64>>) -> i64
+// CHECK: %[[VAL_4:.*]] = math.cttz %[[VAL_30]] : i64
+// CHECK: %[[VAL_5:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec<complex<f64>>) -> !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_6:.*]] = quake.alloca !quake.veq<?>[%[[VAL_4]] : i64]
+// CHECK: %[[VAL_7:.*]] = quake.init_state %[[VAL_6]], %[[VAL_5]] : (!quake.veq<?>, !cc.ptr<complex<f64>>) -> !quake.veq<?>
+// clang-format on
+
+__qpu__ auto Strawberry() {
+  cudaq::qubit q = {0., 1.};
+  return mz(q);
+}
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_Strawberry._Z10Strawberryv() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64
+// CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK: %[[VAL_2:.*]] = complex.create %[[VAL_1]], %[[VAL_1]] : complex<f64>
+// CHECK: %[[VAL_3:.*]] = complex.create %[[VAL_0]], %[[VAL_1]] : complex<f64>
+// CHECK: %[[VAL_4:.*]] = cc.alloca !cc.array<complex<f64> x 2>
+// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]][0] : (!cc.ptr<!cc.array<complex<f64> x 2>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_2]], %[[VAL_5]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr<!cc.array<complex<f64> x 2>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_3]], %[[VAL_6]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_7:.*]] = quake.alloca !quake.veq<1>
+// CHECK: %[[VAL_8:.*]] = quake.init_state %[[VAL_7]], %[[VAL_4]] : (!quake.veq<1>, !cc.ptr<!cc.array<complex<f64> x 2>>) -> !quake.veq<1>
+// CHECK: %[[VAL_9:.*]] = quake.extract_ref %[[VAL_8]][0] : (!quake.veq<1>) -> !quake.ref
+// CHECK: %[[VAL_10:.*]] = quake.mz %[[VAL_9]] : (!quake.ref) -> !quake.measure
+// CHECK: %[[VAL_11:.*]] = quake.discriminate %[[VAL_10]] : (!quake.measure) -> i1
+// CHECK: return %[[VAL_11]] : i1
+// CHECK: }
+// clang-format on
+
+#if 0
+// The ket syntax is not yet provided in the headers.
+__qpu__ auto GoldRibbon() {
+  cudaq::qubit q = cudaq::ket::one;
+  return mz(q);
+}
+#endif
+
+__qpu__ bool Peppermint() {
+  cudaq::qubit q = {M_SQRT1_2, M_SQRT1_2};
+  return mz(q);
+}
+
+// clang-format off
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_Peppermint._Z10Peppermintv() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0.70710678118654757 : f64
+// CHECK: %[[VAL_1:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK: %[[VAL_2:.*]] = complex.create %[[VAL_0]], %[[VAL_1]] : complex<f64>
+// CHECK: %[[VAL_3:.*]] = complex.create %[[VAL_0]], %[[VAL_1]] : complex<f64>
+// CHECK: %[[VAL_4:.*]] = cc.alloca !cc.array<complex<f64> x 2>
+// CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_4]][0] : (!cc.ptr<!cc.array<complex<f64> x 2>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_2]], %[[VAL_5]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_6:.*]] = cc.compute_ptr %[[VAL_4]][1] : (!cc.ptr<!cc.array<complex<f64> x 2>>) -> !cc.ptr<complex<f64>>
+// CHECK: cc.store %[[VAL_3]], %[[VAL_6]] : !cc.ptr<complex<f64>>
+// CHECK: %[[VAL_7:.*]] = quake.alloca !quake.veq<1>
+// CHECK: %[[VAL_8:.*]] = quake.init_state %[[VAL_7]], %[[VAL_4]] : (!quake.veq<1>, !cc.ptr<!cc.array<complex<f64> x 2>>) -> !quake.veq<1>
+// CHECK: %[[VAL_9:.*]] = quake.extract_ref %[[VAL_8]][0] : (!quake.veq<1>) -> !quake.ref
+// CHECK: %[[VAL_10:.*]] = quake.mz %[[VAL_9]] : (!quake.ref) -> !quake.measure
+// CHECK: %[[VAL_11:.*]] = quake.discriminate %[[VAL_10]] : (!quake.measure) -> i1
+// CHECK: return %[[VAL_11]] : i1
+// CHECK: }
+// clang-format on
+
+//===----------------------------------------------------------------------===//
+//
+// QIR checks
+//
+//===----------------------------------------------------------------------===//
+
+// clang-format off
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__Vanilla() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call %[[VAL_1:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 2, i8* nonnull bitcast ([4 x double]* @__nvqpp__rodata_init_0 to i8*))
+// QIR: %[[VAL_2:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_1]]* %[[VAL_0]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__Cherry() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = alloca [4 x { double, double }], align 8
+// QIR: %[[VAL_1:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 0, i32 0
+// QIR: store double 0.000000e+00, double* %[[VAL_1]], align 8
+// QIR: %[[VAL_2:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 0, i32 1
+// QIR: store double 1.000000e+00, double* %[[VAL_2]], align 8
+// QIR: %[[VAL_3:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 1, i32 0
+// QIR: store double 6.000000e-01, double* %[[VAL_3]], align 8
+// QIR: %[[VAL_4:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 1, i32 1
+// QIR: store double 4.000000e-01, double* %[[VAL_4]], align 8
+// QIR: %[[VAL_5:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 2, i32 0
+// QIR: store double 1.000000e+00, double* %[[VAL_5]], align 8
+// QIR: %[[VAL_6:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 2, i32 1
+// QIR: %[[VAL_7:.*]] = bitcast [4 x { double, double }]* %[[VAL_0]] to i8*
+// QIR: call void @llvm.memset
+// QIR: %[[VAL_8:.*]] = call %[[VAL_9:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 2, i8* nonnull %[[VAL_7]])
+// QIR: %[[VAL_10:.*]] = call i64 @__quantum__rt__array_get_size_1d(%[[VAL_9]]* %[[VAL_8]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__MooseTracks() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = alloca [4 x { double, double }], align 8
+// QIR: %[[VAL_1:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 0, i32 0
+// QIR: store double 0.000000e+00, double* %[[VAL_1]], align 8
+// QIR: %[[VAL_2:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 0, i32 1
+// QIR: store double 1.000000e+00, double* %[[VAL_2]], align 8
+// QIR: %[[VAL_3:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 1, i32 0
+// QIR: store double 7.500000e-01, double* %[[VAL_3]], align 8
+// QIR: %[[VAL_4:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 1, i32 1
+// QIR: store double 2.500000e-01, double* %[[VAL_4]], align 8
+// QIR: %[[VAL_5:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 2, i32 0
+// QIR: store double 1.000000e+00, double* %[[VAL_5]], align 8
+// QIR: %[[VAL_6:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_0]], i64 0, i64 2, i32 1
+// QIR: %[[VAL_7:.*]] = bitcast [4 x { double, double }]* %[[VAL_0]] to i8*
+// QIR: call void @llvm.memset
+// QIR: %[[VAL_8:.*]] = call %[[VAL_9:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 2, i8* nonnull %[[VAL_7]])
+// QIR: %[[VAL_10:.*]] = call i64 @__quantum__rt__array_get_size_1d(%[[VAL_9]]* %[[VAL_8]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__RockyRoad() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = alloca double, align 8
+// QIR: store double 0.000000e+00, double* %[[VAL_0]], align 8
+// QIR: %[[VAL_1:.*]] = tail call { double, double } @_ZNSt8literals16complex_literalsli1iEe(
+// QIR: %[[VAL_2:.*]] = alloca { double, double }, align 8
+// QIR: %[[VAL_3:.*]] = extractvalue { double, double } %[[VAL_1]], 0
+// QIR: %[[VAL_4:.*]] = getelementptr inbounds { double, double }, { double, double }* %[[VAL_2]], i64 0, i32 0
+// QIR: store double %[[VAL_3]], double* %[[VAL_4]], align 8
+// QIR: %[[VAL_5:.*]] = extractvalue { double, double } %[[VAL_1]], 1
+// QIR: %[[VAL_6:.*]] = getelementptr inbounds { double, double }, { double, double }* %[[VAL_2]], i64 0, i32 1
+// QIR: store double %[[VAL_5]], double* %[[VAL_6]], align 8
+// QIR: %[[VAL_7:.*]] = call { double, double } @_ZStplIdESt7complexIT_ERKS1_RKS2_(double* nonnull %[[VAL_0]], { double, double }* nonnull %[[VAL_2]])
+// QIR: %[[VAL_8:.*]] = alloca double, align 8
+// QIR: store double 1.000000e+00, double* %[[VAL_8]], align 8
+// QIR: %[[VAL_9:.*]] = call { double, double } @_ZNSt8literals16complex_literalsli1iEe(
+// QIR: %[[VAL_10:.*]] = alloca { double, double }, align 8
+// QIR: %[[VAL_11:.*]] = extractvalue { double, double } %[[VAL_9]], 0
+// QIR: %[[VAL_12:.*]] = getelementptr inbounds { double, double }, { double, double }* %[[VAL_10]], i64 0, i32 0
+// QIR: store double %[[VAL_11]], double* %[[VAL_12]], align 8
+// QIR: %[[VAL_13:.*]] = extractvalue { double, double } %[[VAL_9]], 1
+// QIR: %[[VAL_14:.*]] = getelementptr inbounds { double, double }, { double, double }* %[[VAL_10]], i64 0, i32 1
+// QIR: store double %[[VAL_13]], double* %[[VAL_14]], align 8
+// QIR: %[[VAL_15:.*]] = call { double, double } @_ZStplIdESt7complexIT_ERKS1_RKS2_(double* nonnull %[[VAL_8]], { double, double }* nonnull %[[VAL_10]])
+// QIR: %[[VAL_16:.*]] = alloca [4 x { double, double }], align 8
+// QIR: %[[VAL_17:.*]] = extractvalue { double, double } %[[VAL_7]], 0
+// QIR: %[[VAL_18:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 0, i32 0
+// QIR: store double %[[VAL_17]], double* %[[VAL_18]], align 8
+// QIR: %[[VAL_19:.*]] = extractvalue { double, double } %[[VAL_7]], 1
+// QIR: %[[VAL_20:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 0, i32 1
+// QIR: store double %[[VAL_19]], double* %[[VAL_20]], align 8
+// QIR: %[[VAL_21:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 1, i32 0
+// QIR: store double 8.000000e-01, double* %[[VAL_21]], align 8
+// QIR: %[[VAL_22:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 1, i32 1
+// QIR: store double 2.000000e-01, double* %[[VAL_22]], align 8
+// QIR: %[[VAL_23:.*]] = extractvalue { double, double } %[[VAL_15]], 0
+// QIR: %[[VAL_24:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 2, i32 0
+// QIR: store double %[[VAL_23]], double* %[[VAL_24]], align 8
+// QIR: %[[VAL_25:.*]] = extractvalue { double, double } %[[VAL_15]], 1
+// QIR: %[[VAL_26:.*]] = getelementptr inbounds [4 x { double, double }], [4 x { double, double }]* %[[VAL_16]], i64 0, i64 2, i32 1
+// QIR: store double %[[VAL_25]], double* %[[VAL_26]], align 8
+// QIR: %[[VAL_27:.*]] = bitcast [4 x { double, double }]* %[[VAL_16]] to i8*
+// QIR: %[[VAL_28:.*]] = call %[[VAL_29:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 2, i8* nonnull %[[VAL_27]])
+// QIR: %[[VAL_30:.*]] = call i64 @__quantum__rt__array_get_size_1d(%[[VAL_29]]* %[[VAL_28]])
+// QIR: }
+
+// QIR-LABEL: define i1 @__nvqpp__mlirgen__Pistachio() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call { double*, i64 } @_Z15getTwoTimesRankv()
+// QIR: %[[VAL_I:.*]] = extractvalue { double*, i64 } %[[VAL_0]], 1
+// QIR: %[[VAL_1:.*]] = tail call i64 @llvm.cttz.i64(i64 %[[VAL_I]], i1 false)
+// QIR: %[[VAL_2:.*]] = extractvalue { double*, i64 } %[[VAL_0]], 0
+// QIR: %[[VAL_3:.*]] = bitcast double* %[[VAL_2]] to i8*
+// QIR: %[[VAL_4:.*]] = tail call %[[VAL_5:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 %[[VAL_1]], i8* %[[VAL_3]])
+// QIR: %[[VAL_6:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_5]]* %[[VAL_4]])
+// QIR: }
+
+// QIR-LABEL: define i1 @__nvqpp__mlirgen__ChocolateMint() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call { double*, i64 } @_Z15getTwoTimesRankv()
+// QIR: %[[VAL_I:.*]] = extractvalue { double*, i64 } %[[VAL_0]], 1
+// QIR: %[[VAL_1:.*]] = tail call i64 @llvm.cttz.i64(i64 %[[VAL_I]], i1 false)
+// QIR: %[[VAL_2:.*]] = extractvalue { double*, i64 } %[[VAL_0]], 0
+// QIR: %[[VAL_3:.*]] = bitcast double* %[[VAL_2]] to i8*
+// QIR: %[[VAL_4:.*]] = tail call %[[VAL_5:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 %[[VAL_1]], i8* %[[VAL_3]])
+// QIR: %[[VAL_6:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_5]]* %[[VAL_4]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__Neapolitan() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call { { double, double }*, i64 } @_Z14getComplexInitv()
+// QIR: %[[VAL_I:.*]] = extractvalue { { double, double }*, i64 } %[[VAL_0]], 1
+// QIR: %[[VAL_1:.*]] = tail call i64 @llvm.cttz.i64(i64 %[[VAL_I]], i1 false)
+// QIR: %[[VAL_2:.*]] = extractvalue { { double, double }*, i64 } %[[VAL_0]], 0
+// QIR: %[[VAL_3:.*]] = bitcast { double, double }* %[[VAL_2]] to i8*
+// QIR: %[[VAL_4:.*]] = tail call %[[VAL_5:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 %[[VAL_1]], i8* %[[VAL_3]])
+// QIR: %[[VAL_6:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_5]]* %[[VAL_4]])
+// QIR: }
+
+// QIR-LABEL: define { i1*, i64 } @__nvqpp__mlirgen__ButterPecan() local_unnamed_addr {
+// QIR: %[[VAL_0:.*]] = tail call { { double, double }*, i64 } @_Z14getComplexInitv()
+// QIR: %[[VAL_I:.*]] = extractvalue { { double, double }*, i64 } %[[VAL_0]], 1
+// QIR: %[[VAL_1:.*]] = tail call i64 @llvm.cttz.i64(i64 %[[VAL_I]], i1 false)
+// QIR: %[[VAL_2:.*]] = extractvalue { { double, double }*, i64 } %[[VAL_0]], 0
+// QIR: %[[VAL_3:.*]] = bitcast { double, double }* %[[VAL_2]] to i8*
+// QIR: %[[VAL_4:.*]] = tail call %[[VAL_5:.*]]* @__quantum__rt__qubit_allocate_array_with_state_fp64(i64 %[[VAL_1]], i8* %[[VAL_3]])
+// QIR: %[[VAL_6:.*]] = tail call i64 @__quantum__rt__array_get_size_1d(%[[VAL_5]]* %[[VAL_4]])
+// QIR: }
+
diff --git a/test/AST-Quake/type_alias.cpp b/test/AST-Quake/type_alias.cpp
index 8d5f421b0c..306b63f09d 100644
--- a/test/AST-Quake/type_alias.cpp
+++ b/test/AST-Quake/type_alias.cpp
@@ -54,11 +54,11 @@ __qpu__ big kernel3(big arg) {
 }
 
 // CHECK-LABEL: func.func @__nvqpp__mlirgen__function_kernel3._Z7kernel3e(
-// CHECK-SAME: %[[VAL_0:.*]]: f128{{.*}}) -> f128
-// CHECK: %[[VAL_1:.*]] = cc.alloca f128
-// CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<f128>
+// CHECK-SAME: %[[VAL_0:.*]]: f[[TY:[1280]+]]{{.*}}) -> f[[TY]]
+// CHECK: %[[VAL_1:.*]] = cc.alloca f[[TY]]
+// CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<f[[TY]]>
 // CHECK: %[[VAL_2:.*]] = quake.alloca !quake.ref
 // CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> ()
-// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr<f128>
-// CHECK: return %[[VAL_3]] : f128
+// CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr<f[[TY]]>
+// CHECK: return %[[VAL_3]] : f[[TY]]
 // CHECK: }
diff --git a/test/Quake/add_dealloc.qke b/test/Quake/add_dealloc-0.qke
similarity index 100%
rename from test/Quake/add_dealloc.qke
rename to test/Quake/add_dealloc-0.qke
diff --git a/test/Quake/add_dealloc-1.qke b/test/Quake/add_dealloc-1.qke
new file mode 100644
index 0000000000..0348d1c7f3
--- /dev/null
+++ b/test/Quake/add_dealloc-1.qke
@@ -0,0 +1,45 @@
+// ========================================================================== //
+// Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                 //
+// All rights reserved.                                                       //
+//                                                                            //
+// This source code and the accompanying materials are made available under  //
+// the terms of the Apache License 2.0 which accompanies this distribution.  //
+// ========================================================================== //
+
+// RUN: cudaq-opt --add-dealloc %s | FileCheck %s
+
+module {
+  func.func @__nvqpp__mlirgen__FromState() {
+    %c4 = arith.constant 4 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %0 = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
+    %1 = quake.alloca !quake.veq<4>
+    %2 = quake.init_state %1, %0 : (!quake.veq<4>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<4>
+    %3 = cc.loop while ((%arg0 = %c0) -> (index)) {
+      %4 = arith.cmpi slt, %arg0, %c4 : index
+      cc.condition %4(%arg0 : index)
+    } do {
+    ^bb0(%arg0: index):
+      %4 = quake.extract_ref %2[%arg0] : (!quake.veq<4>, index) -> !quake.ref
+      quake.h %4 : (!quake.ref) -> ()
+      cc.continue %arg0 : index
+    } step {
+    ^bb0(%arg0: index):
+      %4 = arith.addi %arg0, %c1 : index
+      cc.continue %4 : index
+    } {invariant}
+    return
+  }
+  cc.global constant @__nvqpp__rodata_init_0 (dense<[1.000000e+00, 0.000000e+00, 5.000000e-01, 5.000000e-01]> : tensor<4xf64>) : !cc.array<f64 x 4>
+}
+
+// CHECK-LABEL: func.func @__nvqpp__mlirgen__FromState() {
+// CHECK: %[[VAL_3:.*]] = cc.address_of @__nvqpp__rodata_init_0 : !cc.ptr<!cc.array<f64 x 4>>
+// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.veq<4>
+// CHECK: %[[VAL_5:.*]] = quake.init_state %[[VAL_4]], %[[VAL_3]] : (!quake.veq<4>, !cc.ptr<!cc.array<f64 x 4>>) -> !quake.veq<4>
+// CHECK: quake.dealloc %[[VAL_5]] : !quake.veq<4>
+// CHECK: return
+// CHECK: }
+// CHECK: cc.global constant @__nvqpp__rodata_init_0 (dense<[1.000000e+00, 0.000000e+00, 5.000000e-01, 5.000000e-01]> : tensor<4xf64>) : !cc.array<f64 x 4>
+
diff --git a/test/Transforms/mapping_non_unitaries.qke b/test/Transforms/mapping_non_unitaries.qke
index 1ff3117e05..d6f15b970b 100644
--- a/test/Transforms/mapping_non_unitaries.qke
+++ b/test/Transforms/mapping_non_unitaries.qke
@@ -14,14 +14,14 @@
 // RUN: cudaq-opt --qubit-mapping=device=grid\(3,3\) %s | CircuitCheck --up-to-mapping %s
 // RUN: cudaq-opt --qubit-mapping=device=grid\(1,5\) %s | CircuitCheck --up-to-mapping %s
 // RUN: cudaq-opt --qubit-mapping=device=grid\(5,1\) %s | CircuitCheck --up-to-mapping %s
-// RUN: cudaq-opt --qubit-mapping=device=path %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=path\(5\) %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=ring\(5\) %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=star\(5,2\) %s | FileCheck --check-prefix STAR52 %s
-// RUN: cudaq-opt --qubit-mapping=device=star\(5,0\) %s | FileCheck --check-prefix STAR50 %s
-// RUN: cudaq-opt --qubit-mapping=device=grid\(3,3\) %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=grid\(1,5\) %s | FileCheck --check-prefix CHECK %s
-// RUN: cudaq-opt --qubit-mapping=device=grid\(5,1\) %s | FileCheck --check-prefix CHECK %s
+// RUN: cudaq-opt --qubit-mapping=device=path %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=path\(5\) %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=ring\(5\) %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=star\(5,2\) %s | FileCheck --check-prefix=STAR52 %s
+// RUN: cudaq-opt --qubit-mapping=device=star\(5,0\) %s | FileCheck --check-prefix=STAR50 %s
+// RUN: cudaq-opt --qubit-mapping=device=grid\(3,3\) %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=grid\(1,5\) %s | FileCheck %s
+// RUN: cudaq-opt --qubit-mapping=device=grid\(5,1\) %s | FileCheck %s
 
 func.func @test_measurement() {
   %0 = quake.null_wire
diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in
index 480ee1f66e..fb33f8510e 100644
--- a/tools/nvqpp/nvq++.in
+++ b/tools/nvqpp/nvq++.in
@@ -637,8 +637,12 @@ fi
 LLC=${LLVMBIN}llc${llvm_suffix}
 
 if ${LIBRARY_MODE}; then
+  PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_LIBRARY_MODE"
   ENABLE_KERNEL_EXECUTION=false
 fi
+if [[ "${TARGET_CONFIG}" != *"CUDAQ_SIMULATION_SCALAR_"* ]]; then
+  PREPROCESSOR_DEFINES="${PREPROCESSOR_DEFINES} -D CUDAQ_SIMULATION_SCALAR_FP64"
+fi
 
 RUN_OPT=false
 OPT_PASSES=
@@ -697,7 +701,7 @@ for i in ${SRCS}; do
     # If LIBRARY_MODE explicitly requested, then
     # simply compile with the classical compiler.
     if ${LIBRARY_MODE}; then
-      run ${CMAKE_FALLBACK_HOST_CXX} ${CLANG_VERBOSE} ${CLANG_RESOURCE_DIR} ${COMPILER_FLAGS} -DCUDAQ_LIBRARY_MODE ${PREPROCESSOR_DEFINES} ${INCLUDES} ${ARGS} -o ${file}.o -c $i
+      run ${CMAKE_FALLBACK_HOST_CXX} ${CLANG_VERBOSE} ${CLANG_RESOURCE_DIR} ${COMPILER_FLAGS} ${PREPROCESSOR_DEFINES} ${INCLUDES} ${ARGS} -o ${file}.o -c $i
       OBJS="${OBJS} ${file}.o"
       # Go to the next iteration, maybe there
       # will be cudaq kernels there
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index c609f8926d..0f62ae4a97 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -72,15 +72,18 @@ macro (create_tests_with_backend NVQIR_BACKEND EXTRA_BACKEND_TESTER)
     cudaq-builder
     gtest_main)
   set(TEST_LABELS "")
+  if (${NVQIR_BACKEND} STREQUAL "qpp")
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_SIMULATION_SCALAR_FP64)
+  endif()
   if (${NVQIR_BACKEND} STREQUAL "dm")
-    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_DM)
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_DM -DCUDAQ_SIMULATION_SCALAR_FP64)
   endif()
   if (${NVQIR_BACKEND} STREQUAL "tensornet")
-    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET)
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET -DCUDAQ_SIMULATION_SCALAR_FP64)
     set(TEST_LABELS "gpu_required")
   endif()
   if (${NVQIR_BACKEND} STREQUAL "tensornet-mps")
-    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET)
+    target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DCUDAQ_BACKEND_TENSORNET -DCUDAQ_SIMULATION_SCALAR_FP64)
     set(TEST_LABELS "gpu_required")
   endif()
   if (${NVQIR_BACKEND} STREQUAL "custatevec-fp32")
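The driver, config, and CMake changes above all thread the same `CUDAQ_SIMULATION_SCALAR_FP64` define through to host compilation, with nvq++ falling back to FP64 whenever the target config names no scalar type. A sketch of how a header might consume the define to select the simulation amplitude type; this mirrors the intent of the change, not the literal CUDA-Q headers:

#include <complex>

// Hypothetical consumer of the build-time define: pick the amplitude type
// the simulator expects. FP64 is the fallback nvq++ injects when a target
// config does not specify a scalar type.
#if defined(CUDAQ_SIMULATION_SCALAR_FP32)
using simulation_scalar = std::complex<float>;
#else // CUDAQ_SIMULATION_SCALAR_FP64 (the default)
using simulation_scalar = std::complex<double>;
#endif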
diff --git a/unittests/integration/builder_tester.cpp b/unittests/integration/builder_tester.cpp
index 0b7a554fb6..a5a19b7631 100644
--- a/unittests/integration/builder_tester.cpp
+++ b/unittests/integration/builder_tester.cpp
@@ -1203,7 +1203,73 @@ CUDAQ_TEST(BuilderTester, checkControlledRotations) {
   }
 }
 
-#ifndef CUDAQ_BACKEND_DM
+#if !defined(CUDAQ_BACKEND_DM) && !defined(CUDAQ_BACKEND_TENSORNET)
+
+CUDAQ_TEST(BuilderTester, checkFromStateVector) {
+  std::vector vec{M_SQRT1_2, 0., 0., M_SQRT1_2};
+  {
+    auto kernel = cudaq::make_kernel();
+    auto qubits = kernel.qalloc(vec);
+    std::cout << kernel << "\n";
+    auto counts = cudaq::sample(kernel);
+    counts.dump();
+    EXPECT_EQ(counts.size(), 2);
+    std::size_t counter = 0;
+    for (auto &[k, v] : counts) {
+      counter += v;
+      EXPECT_TRUE(k == "00" || k == "11");
+    }
+    EXPECT_EQ(counter, 1000);
+  }
+
+  {
+    auto [kernel, initState] = cudaq::make_kernel<std::vector<double>>();
+    auto qubits = kernel.qalloc(initState);
+    std::cout << kernel << "\n";
+    auto counts = cudaq::sample(kernel, vec);
+    counts.dump();
+    EXPECT_EQ(counts.size(), 2);
+    std::size_t counter = 0;
+    for (auto &[k, v] : counts) {
+      counter += v;
+      EXPECT_TRUE(k == "00" || k == "11");
+    }
+    EXPECT_EQ(counter, 1000);
+  }
+
+  {
+    // 2 qubit 11 state
+    std::vector vec{0., 0., 0., 1.};
+    auto [kernel, initState] = cudaq::make_kernel<std::vector<double>>();
+    auto qubits = kernel.qalloc(initState);
+    // induce the need for a kron prod between
+    // [0,0,0,1] and [1, 0, 0, 0]
+    auto anotherOne = kernel.qalloc(2);
+    std::cout << kernel << "\n";
+    auto counts = cudaq::sample(kernel, vec);
+    counts.dump();
+    EXPECT_EQ(counts.size(), 1);
+    EXPECT_EQ(counts.count("1100"), 1000);
+  }
+
+  {
+    // 2 qubit 11 state
+    std::vector vec{0., 0., 0., 1.};
+    auto [kernel, initState] = cudaq::make_kernel<std::vector<double>>();
+    auto qubits = kernel.qalloc(initState);
+    // induce the need for a kron prod between
+    // [0,0,0,1] and [1, 0]
+    auto anotherOne = kernel.qalloc();
+    std::cout << kernel << "\n";
+    auto counts = cudaq::sample(kernel, vec);
+    counts.dump();
+    EXPECT_EQ(counts.size(), 1);
+    EXPECT_EQ(counts.count("110"), 1000);
+  }
+}
 
 CUDAQ_TEST(BuilderTester, checkCanProgressivelyBuild) {
   auto kernel = cudaq::make_kernel();
diff --git a/unittests/qudit/simple_qudit/SimpleQuditExecutionManager.cpp b/unittests/qudit/simple_qudit/SimpleQuditExecutionManager.cpp
index e92f21b740..d4ff6b74c7 100644
--- a/unittests/qudit/simple_qudit/SimpleQuditExecutionManager.cpp
+++ b/unittests/qudit/simple_qudit/SimpleQuditExecutionManager.cpp
@@ -117,7 +117,11 @@ class SimpleQuditExecutionManager : public cudaq::BasicExecutionManager {
   cudaq::SpinMeasureResult measure(cudaq::spin_op &op) override {
     return cudaq::SpinMeasureResult();
   }
-
+  void initializeState(const std::vector<cudaq::QuditInfo> &targets,
+                       const void *state,
+                       cudaq::simulation_precision precision) override {
+    throw std::runtime_error("initializeState not implemented.");
+  }
   void resetQudit(const cudaq::QuditInfo &id) override {}
 };
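The qudit execution manager only stubs out the new `initializeState` hook, throwing on use. A backend that did support it would have to honor the `precision` argument before touching the raw buffer. A hedged, self-contained sketch of that dispatch; `loadAmplitudes` and the local `precision_t` enum are hypothetical stand-ins (the latter mirrors `cudaq::simulation_precision`), not CUDA-Q APIs:

#include <complex>
#include <cstddef>

// Stand-in for cudaq::simulation_precision.
enum class precision_t { fp32, fp64 };

// Hypothetical backend helper that copies amplitudes into simulator state.
template <typename T>
void loadAmplitudes(const std::complex<T> *data, std::size_t n);

void initializeStateSketch(std::size_t numTargets, const void *state,
                           precision_t precision) {
  // Assuming two-level targets, the caller hands over 2^n amplitudes for
  // n targets; the precision flag says how wide each component is, so the
  // raw pointer must be reinterpreted accordingly before reading.
  std::size_t numAmplitudes = 1UL << numTargets;
  if (precision == precision_t::fp32)
    loadAmplitudes(reinterpret_cast<const std::complex<float> *>(state),
                   numAmplitudes);
  else
    loadAmplitudes(reinterpret_cast<const std::complex<double> *>(state),
                   numAmplitudes);
}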