diff --git a/python/cudaq/__init__.py b/python/cudaq/__init__.py
index a12c9662dd..168f3f20f3 100644
--- a/python/cudaq/__init__.py
+++ b/python/cudaq/__init__.py
@@ -116,7 +116,7 @@ def complex():
     target = get_target()
     precision = target.get_precision()
     if precision == cudaq_runtime.SimulationPrecision.fp64:
-        return complex
+        return numpy.complex128
     return numpy.complex64
 
 
diff --git a/python/cudaq/kernel/analysis.py b/python/cudaq/kernel/analysis.py
index 990327bbcf..120004cf78 100644
--- a/python/cudaq/kernel/analysis.py
+++ b/python/cudaq/kernel/analysis.py
@@ -233,8 +233,9 @@ def visit_FunctionDef(self, node):
                 raise RuntimeError(
                     'cudaq.kernel functions must have argument type annotations.'
                 )
-            if isinstance(annotation,
-                          ast.Subscript) and annotation.value.id == 'Callable':
+            if isinstance(annotation, ast.Subscript) and hasattr(
+                    annotation.value,
+                    "id") and annotation.value.id == 'Callable':
                 if not hasattr(annotation, 'slice'):
                     raise RuntimeError(
                         'Callable type must have signature specified.')
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 2f6f2513ae..d2f6199aff 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -153,6 +153,25 @@ def __init__(self, **kwargs):
         self.verbose = 'verbose' in kwargs and kwargs['verbose']
         self.currentNode = None
 
+    def emitWarning(self, msg, astNode=None):
+        """
+        Print a non-fatal warning formatted as `file:line: warning: msg`,
+        followed by the offending source when `ast.unparse` is available.
+        `astNode` defaults to `self.currentNode`; nothing is raised or returned.
+        """
+        codeFile = os.path.basename(self.locationOffset[0])
+        if astNode is None:
+            astNode = self.currentNode
+        lineNumber, source = '', ''
+        if astNode is not None:
+            lineNumber = astNode.lineno + self.locationOffset[1] - 1
+            if hasattr(ast, 'unparse'):
+                source = f"\n\t (offending source -> {ast.unparse(astNode)})"
+
+        print(Color.BOLD, end='')
+        print(f"{codeFile}:{lineNumber}: {Color.YELLOW}warning: {Color.END}"
+              f"{Color.BOLD}{msg}{source}{Color.END}")
+
     def emitFatalError(self, msg, astNode=None):
         """
         Emit a fatal error, providing the user with source file information and
@@ -343,6 +362,23 @@ def promoteOperandType(self, ty, operand):
 
         return operand
 
+    def simulationPrecision(self):
+        """
+        Query the currently selected target for its simulation precision.
+        The result is the target's `cudaq_runtime.SimulationPrecision`
+        value, exactly as reported by `get_precision()`.
+        """
+        return cudaq_runtime.get_target().get_precision()
+
+    def simulationDType(self):
+        """
+        Return the `getComplexType` result matching the simulation precision:
+        `width=64` (cf. `numpy.complex128`) for fp64, `width=32` otherwise.
+        """
+        if self.simulationPrecision() == cudaq_runtime.SimulationPrecision.fp64:
+            return self.getComplexType(width=64)
+        return self.getComplexType(width=32)
+
     def pushValue(self, value):
         """
         Push an MLIR Value onto the stack for usage in a subsequent AST node visit method.
@@ -426,6 +462,19 @@ def ifPointerThenLoad(self, value):
             return cc.LoadOp(value).result
         return value
 
+    def ifNotPointerThenStore(self, value):
+        """
+        If the given value is not of a pointer type, allocate a
+        slot on the stack, store the value in the slot, and return
+        the slot address. Pointer values are returned unchanged.
+        """
+        if not cc.PointerType.isinstance(value.type):
+            slot = cc.AllocaOp(cc.PointerType.get(self.ctx, value.type),
+                               TypeAttr.get(value.type)).result
+            cc.StoreOp(value, slot)
+            return slot
+        return value
+
     def __createStdvecWithKnownValues(self, size, listElementValues):
         # Turn this List into a StdVec<T>
         arrSize = self.getConstantInt(size)
@@ -449,6 +498,55 @@ def __createStdvecWithKnownValues(self, size, listElementValues):
         return cc.StdvecInitOp(cc.StdvecType.get(self.ctx, vecTy), alloca,
                                arrSize).result
 
+    # Create a new vector with source elements converted to the target element type if needed.
+    def __copyVectorAndCastElements(self, source, targetEleType):
+        if not cc.PointerType.isinstance(source.type):
+            if cc.StdvecType.isinstance(source.type):
+                # Exit early if no copy is needed to avoid an unneeded store.
+                sourceEleType = cc.StdvecType.getElementType(source.type)
+                if (sourceEleType == targetEleType):
+                    return source
+        # From here on work through a pointer; plain values are stored to a slot.
+        sourcePtr = source
+        if not cc.PointerType.isinstance(sourcePtr.type):
+            sourcePtr = self.ifNotPointerThenStore(sourcePtr)
+
+        sourceType = cc.PointerType.getElementType(sourcePtr.type)
+        if not cc.StdvecType.isinstance(sourceType):
+            raise RuntimeError(
+                f"expected vector type in __copyVectorAndCastElements but received {sourceType}"
+            )
+        # NOTE: a matching element type returns the pointer itself, not a loaded value.
+        sourceEleType = cc.StdvecType.getElementType(sourceType)
+        if (sourceEleType == targetEleType):
+            return sourcePtr
+        # Conversion is needed: fetch the source data pointer and element count.
+        sourceElePtrTy = cc.PointerType.get(self.ctx, sourceEleType)
+        sourceValue = self.ifPointerThenLoad(sourcePtr)
+        sourceDataPtr = cc.StdvecDataOp(sourceElePtrTy, sourceValue).result
+        sourceSize = cc.StdvecSizeOp(self.getIntegerType(), sourceValue).result
+        # Allocate a target buffer of `sourceSize` elements of the target type.
+        targetElePtrType = cc.PointerType.get(self.ctx, targetEleType)
+        targetTy = cc.ArrayType.get(self.ctx, targetEleType)
+        targetVecTy = cc.StdvecType.get(self.ctx, targetEleType)
+        targetPtr = cc.AllocaOp(cc.PointerType.get(self.ctx, targetTy),
+                                TypeAttr.get(targetEleType),
+                                seqSize=sourceSize).result
+
+        rawIndex = DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)
+        # Per index: load the source element, promote it, store it in the target.
+        def bodyBuilder(iterVar):
+            eleAddr = cc.ComputePtrOp(sourceElePtrTy, sourceDataPtr, [iterVar],
+                                      rawIndex).result
+            loadedEle = cc.LoadOp(eleAddr).result
+            castedEle = self.promoteOperandType(targetEleType, loadedEle)
+            targetEleAddr = cc.ComputePtrOp(targetElePtrType, targetPtr,
+                                            [iterVar], rawIndex).result
+            cc.StoreOp(castedEle, targetEleAddr)
+        # Presumably runs `bodyBuilder` for each index below `sourceSize` - confirm.
+        self.createInvariantForLoop(sourceSize, bodyBuilder)
+        return cc.StdvecInitOp(targetVecTy, targetPtr, sourceSize).result
+
     def __insertDbgStmt(self, value, dbgStmt):
         """
         Insert a debug print out statement if the programmer requested. Handles 
@@ -920,6 +1018,7 @@ def visit_Assign(self, node):
                     ast.Name) and node.targets[0].value.id in self.symbolTable:
                 # Visit_Subscript will try to load any pointer and return it
                 # but here we want the pointer, so flip that flag
+                # FIXME: move loading from Visit_Subscript to the user instead.
                 self.subscriptPushPointerValue = True
                 # Visit the subscript node, get the pointer value
                 self.visit(node.targets[0])
@@ -996,7 +1095,7 @@ def visit_Attribute(self, node):
         see from ubiquitous external modules like `numpy`.
         """
         if self.verbose:
-            print(f'[Visit Attribute {node.attr} on {node.value}]')
+            print(f'[Visit Attribute {node.attr} on {ast.unparse(node)}]')
 
         self.currentNode = node
         # Disallow list.append since we don't do dynamic memory allocation
@@ -1044,6 +1143,15 @@ def visit_Attribute(self, node):
             self.pushValue(self.getConstantFloat(np.pi))
             return
 
+        if isinstance(node.value,
+                      ast.Name) and node.value.id in ['np', 'numpy']:
+            if node.attr == 'complex64':
+                self.pushValue(self.getComplexType(width=32))
+                return
+            if node.attr == 'complex128':
+                self.pushValue(self.getComplexType(width=64))
+                return
+
     def visit_Call(self, node):
         """
         Map a Python Call operation to equivalent MLIR. This method will first check 
@@ -1157,7 +1265,7 @@ def visit_Call(self, node):
                 namedArgs[keyword.arg] = self.popValue()
 
             if node.func.id == "len":
-                listVal = self.popValue()
+                listVal = self.ifPointerThenLoad(self.popValue())
                 if cc.StdvecType.isinstance(listVal.type):
                     self.pushValue(
                         cc.StdvecSizeOp(self.getIntegerType(), listVal).result)
@@ -1235,7 +1343,7 @@ def bodyBuilder(iterVar):
                 extractFunctor = None
                 if len(self.valueStack) == 1:
                     # `qreg`-like or `stdvec`-like thing thing
-                    iterable = self.popValue()
+                    iterable = self.ifPointerThenLoad(self.popValue())
                     # Create a new iterable, `alloca cc.struct<i64, T>`
                     totalSize = None
                     if quake.VeqType.isinstance(iterable.type):
@@ -1560,9 +1668,12 @@ def bodyBuilder(iterVal):
                                 self.pushValue(maybeIterableSize)
                                 return
                 if len(self.valueStack) == 1:
-                    if cc.StdvecType.isinstance(self.valueStack[0].type):
+                    arrayTy = self.valueStack[0].type
+                    if cc.PointerType.isinstance(arrayTy):
+                        arrayTy = cc.PointerType.getElementType(arrayTy)
+                    if cc.StdvecType.isinstance(arrayTy):
                         return
-                    if cc.ArrayType.isinstance(self.valueStack[0].type):
+                    if cc.ArrayType.isinstance(arrayTy):
                         return
 
                 self.emitFatalError('Invalid list() cast requested.', node)
@@ -1577,12 +1688,37 @@ def bodyBuilder(iterVal):
                         node.func.id, globalKernelRegistry.keys()), node)
 
         elif isinstance(node.func, ast.Attribute):
-            self.generic_visit(node)
             if node.func.value.id in ['numpy', 'np']:
-                if node.func.attr == 'array':
-                    return
+                [self.visit(arg) for arg in node.args]
+
+                namedArgs = {}
+                for keyword in node.keywords:
+                    self.visit(keyword.value)
+                    namedArgs[keyword.arg] = self.popValue()
 
                 value = self.popValue()
+
+                if node.func.attr == 'array':
+                    # `np.array(vec, <dtype = ty>)`
+                    arrayType = value.type
+                    if cc.PointerType.isinstance(value.type):
+                        arrayType = cc.PointerType.getElementType(value.type)
+
+                    if cc.StdvecType.isinstance(arrayType):
+                        eleTy = cc.StdvecType.getElementType(arrayType)
+                        dTy = eleTy
+                        if len(namedArgs) > 0:
+                            dTy = namedArgs['dtype']
+
+                        # Convert the vector to the provided data type if needed.
+                        self.pushValue(
+                            self.__copyVectorAndCastElements(value, dTy))
+                        return
+
+                    raise self.emitFatalError(
+                        f"unexpected numpy array initializer type: {value.type}",
+                        node)
+
                 value = self.ifPointerThenLoad(value)
 
                 if node.func.attr in ['complex128', 'complex64']:
@@ -1677,18 +1813,101 @@ def bodyBuilder(iterVal):
                 self.emitFatalError(
                     f"unsupported NumPy call ({node.func.attr})", node)
 
+            self.generic_visit(node)
+
             if node.func.value.id == 'cudaq':
-                if node.func.attr in ['qvector']:
-                    # Handle `cudaq.qvector(N)`
-                    size = self.popValue()
-                    if hasattr(size, "literal_value"):
-                        ty = self.getVeqType(size.literal_value)
-                        qubits = quake.AllocaOp(ty)
-                    else:
+                if node.func.attr == 'complex':
+                    self.pushValue(self.simulationDType())
+                    return
+
+                if node.func.attr == 'amplitudes':
+                    value = self.popValue()
+                    arrayType = value.type
+                    if cc.PointerType.isinstance(value.type):
+                        arrayType = cc.PointerType.getElementType(value.type)
+
+                    if cc.StdvecType.isinstance(arrayType):
+                        eleTy = cc.StdvecType.getElementType(arrayType)
+                        simDTy = self.simulationDType()
+                        if (simDTy != eleTy):
+                            self.emitWarning(
+                                f"Extra copy is added to convert list[{mlirTypeToPyType(eleTy)}]"
+                                f"to list[{mlirTypeToPyType(simDTy)}]. "
+                                f"Consider moving `cudaq.amplitudes` outside kernels.",
+                                node)
+
+                        # Convert the vector to the simulation data type if needed.
+                        self.pushValue(
+                            self.__copyVectorAndCastElements(value, simDTy))
+                        return
+
+                if node.func.attr == 'qvector':
+                    value = self.ifPointerThenLoad(self.popValue())
+                    if (IntegerType.isinstance(value.type)):
+                        # handle `cudaq.qvector(n)`
                         ty = self.getVeqType()
-                        size = self.ifPointerThenLoad(size)
-                        qubits = quake.AllocaOp(ty, size=size)
-                    self.pushValue(qubits.results[0])
+                        qubits = quake.AllocaOp(ty, size=value).result
+                        self.pushValue(qubits)
+                        return
+                    if cc.StdvecType.isinstance(value.type):
+                        # handle `cudaq.qvector(initState)`
+
+                        # Validate the length in case of a constant initializer:
+                        # `cudaq.qvector([1., 0., ...])`
+                        # `cudaq.qvector(np.array([1., 0., ...]))`
+                        listScalar = None
+                        arrNode = node.args[0]
+                        if isinstance(arrNode, ast.List):
+                            listScalar = arrNode.elts
+
+                        if isinstance(arrNode, ast.Call) and isinstance(
+                                arrNode.func, ast.Attribute):
+                            if arrNode.func.value.id in [
+                                    'numpy', 'np'
+                            ] and arrNode.func.attr == 'array':
+                                lst = node.args[0].args[0]
+                                if isinstance(lst, ast.List):
+                                    listScalar = lst.elts
+
+                        if listScalar != None:
+                            size = len(listScalar)
+                            numQubits = np.log2(size)
+                            if not numQubits.is_integer():
+                                self.emitFatalError(
+                                    "Invalid input state size for qvector init (not a power of 2)",
+                                    node)
+
+                        eleTy = cc.StdvecType.getElementType(value.type)
+                        size = cc.StdvecSizeOp(self.getIntegerType(),
+                                               value).result
+                        numQubits = math.CountTrailingZerosOp(size).result
+
+                        # TODO: Dynamically check if number of qubits is power of 2
+                        # and if the state is normalized
+
+                        simDTy = self.simulationDType()
+                        if (simDTy != eleTy):
+                            self.emitWarning(
+                                f"Extra copy is added to convert list[{mlirTypeToPyType(eleTy)}]"
+                                f"to list[{mlirTypeToPyType(simDTy)}]. "
+                                f"Consider using `cudaq.amplitudes` or `cudaq.complex` "
+                                f"in `qvector` initializers.", node)
+
+                        eleTy = simDTy
+                        value = self.__copyVectorAndCastElements(value, eleTy)
+                        ptrTy = cc.PointerType.get(self.ctx, eleTy)
+                        veqTy = quake.VeqType.get(self.ctx)
+
+                        qubits = quake.AllocaOp(veqTy, size=numQubits).result
+                        initials = cc.StdvecDataOp(ptrTy, value).result
+                        init = quake.InitializeStateOp(veqTy, qubits,
+                                                       initials).result
+                        self.pushValue(init)
+                        return
+
+                    self.emitFatalError(
+                        f"unsupported qvector argument type: {value.type} (unknown)",
+                        node)
                     return
 
                 if node.func.attr == "qubit":
@@ -2218,7 +2437,9 @@ def visit_List(self, node):
                 # Find the "superior type" (int < float < complex)
                 superiorType = self.getIntegerType()
                 for t in [v.type for v in listElementValues]:
-                    if F64Type.isinstance(t) or F32Type.isinstance(t):
+                    if F32Type.isinstance(t):
+                        superiorType = t
+                    if F64Type.isinstance(t):
                         superiorType = t
                     if ComplexType.isinstance(t):
                         superiorType = t
@@ -2300,7 +2521,7 @@ def visit_Subscript(self, node):
         if isinstance(node.slice, ast.Slice):
 
             self.visit(node.value)
-            var = self.popValue()
+            var = self.ifPointerThenLoad(self.popValue())
 
             lowerVal, upperVal, stepVal = (None, None, None)
             if node.slice.lower is not None:
@@ -2358,7 +2579,7 @@ def visit_Subscript(self, node):
         assert len(self.valueStack) > 1
 
         # get the last name, should be name of var being subscripted
-        var = self.popValue()
+        var = self.ifPointerThenLoad(self.popValue())
         idx = self.popValue()
 
         # Support `VAR[-1]` as the last element of `VAR`
@@ -2475,7 +2696,7 @@ def bodyBuilder(iterVar):
         # the total size of the iterable, produced by range() / enumerate()
         if len(self.valueStack) == 1:
             # Get the iterable from the stack
-            iterable = self.popValue()
+            iterable = self.ifPointerThenLoad(self.popValue())
             # we currently handle `veq` and `stdvec` types
             if quake.VeqType.isinstance(iterable.type):
                 size = quake.VeqType.getSize(iterable.type)
@@ -2861,7 +3082,7 @@ def visit_Return(self, node):
         if len(self.valueStack) == 0:
             return
 
-        result = self.popValue()
+        result = self.ifPointerThenLoad(self.popValue())
         if cc.StdvecType.isinstance(result.type):
             symName = '__nvqpp_vectorCopyCtor'
             load_intrinsic(self.module, symName)
@@ -3121,6 +3342,14 @@ def visit_Name(self, node):
         if node.id in globalKernelRegistry:
             return
 
+        if node.id == 'complex':
+            self.pushValue(self.getComplexType())
+            return
+
+        if node.id == 'float':
+            self.pushValue(self.getFloatType())
+            return
+
         if node.id in self.symbolTable:
             value = self.symbolTable[node.id]
             if cc.PointerType.isinstance(value.type):
@@ -3133,6 +3362,9 @@ def visit_Name(self, node):
                         eleTy).width == 8:
                     self.pushValue(value)
                     return
+                if cc.StdvecType.isinstance(eleTy):
+                    self.pushValue(value)
+                    return
                 loaded = cc.LoadOp(value).result
                 self.pushValue(loaded)
             elif cc.CallableType.isinstance(
@@ -3146,7 +3378,7 @@ def visit_Name(self, node):
             # Only support a small subset of types here
             complexType = type(1j)
             value = self.capturedVars[node.id]
-            if isinstance(value, list) and isinstance(
+            if isinstance(value, (list, np.ndarray)) and isinstance(
                     value[0], (int, bool, float, np.float32, np.float64,
                                complexType, np.complex64, np.complex128)):
                 elementValues = None
@@ -3213,7 +3445,7 @@ def visit_Name(self, node):
                 errorType = f"{errorType}[{type(value[0]).__name__}]"
 
             self.emitFatalError(
-                f"Invalid type for variable ({node.id}) captured from parent scope (only int, bool, float, complex, and list[int|bool|float|complex] accepted, type was {errorType}).",
+                f"Invalid type for variable ({node.id}) captured from parent scope (only int, bool, float, complex, and list/np.ndarray[int|bool|float|complex] accepted, type was {errorType}).",
                 node)
 
         # Throw an exception for the case that the name is not
diff --git a/python/cudaq/kernel/kernel_builder.py b/python/cudaq/kernel/kernel_builder.py
index 20c85b6eab..6f0183a424 100644
--- a/python/cudaq/kernel/kernel_builder.py
+++ b/python/cudaq/kernel/kernel_builder.py
@@ -257,7 +257,7 @@ def __processArgType(self, ty):
         """
         if ty in [cudaq_runtime.qvector, cudaq_runtime.qubit]:
             return ty, None
-        if get_origin(ty) == list or isinstance(ty(), list):
+        if get_origin(ty) == list or isinstance(ty, list):
             if '[' in str(ty) and ']' in str(ty):
                 allowedTypeMap = {
                     'int': int,
@@ -685,8 +685,10 @@ def qalloc(self, initializer=None):
                         self.ctx,
                         cc.StdvecType.getElementType(
                             initializer.mlirValue.type))
-                    initials = cc.StdvecDataOp(ptrTy, initializer.mlirValue)
-                    quake.InitializeStateOp(veqTy, qubits, initials)
+                    initials = cc.StdvecDataOp(ptrTy,
+                                               initializer.mlirValue).result
+                    qubits = quake.InitializeStateOp(veqTy, qubits,
+                                                     initials).result
                     return self.__createQuakeValue(qubits)
 
             # If no initializer, create a single qubit
diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py
index fbc00aaac9..9b4fbed9d6 100644
--- a/python/cudaq/kernel/utils.py
+++ b/python/cudaq/kernel/utils.py
@@ -34,6 +34,7 @@
 
 
 class Color:
+    YELLOW = '\033[93m'
     RED = '\033[91m'
     BOLD = '\033[1m'
     END = '\033[0m'
@@ -92,7 +93,7 @@ def emitFatalErrorOverride(msg):
                 return cc.CharspanType.get(ctx)
 
         if annotation.value.id in ['numpy', 'np']:
-            if annotation.attr == 'ndarray':
+            if annotation.attr in ['array', 'ndarray']:
                 return cc.StdvecType.get(ctx, F64Type.get())
             if annotation.attr == 'complex128':
                 return ComplexType.get(F64Type.get())
@@ -176,7 +177,6 @@ def emitFatalErrorOverride(msg):
 
 
 def mlirTypeFromPyType(argType, ctx, **kwargs):
-
     if argType == int:
         return IntegerType.get_signless(64, ctx)
     if argType in [float, np.float64]:
diff --git a/python/tests/builder/test_kernel_builder.py b/python/tests/builder/test_kernel_builder.py
index 0e5b8e1155..93ed04a346 100644
--- a/python/tests/builder/test_kernel_builder.py
+++ b/python/tests/builder/test_kernel_builder.py
@@ -877,10 +877,11 @@ def test_recursive_calls():
 
     print(kernel3)
 
-  
+
 skipIfNvidiaFP64NotInstalled = pytest.mark.skipif(
-  not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia-fp64')),
-  reason='Could not find nvidia-fp64 in installation')
+    not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia-fp64')),
+    reason='Could not find nvidia-fp64 in installation')
+
 
 @skipIfNvidiaFP64NotInstalled
 def test_from_state0():
@@ -944,10 +945,12 @@ def test_from_state0():
 
     cudaq.reset_target()
 
+
 skipIfNvidiaNotInstalled = pytest.mark.skipif(
-  not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')),
-  reason='Could not find nvidia in installation')
-  
+    not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')),
+    reason='Could not find nvidia in installation')
+
+
 @skipIfNvidiaNotInstalled
 def test_from_state1():
     cudaq.set_target('nvidia')
@@ -969,8 +972,7 @@ def test_from_state1():
 
     # Regardless of the target precision, use
     # cudaq.complex() or cudaq.amplitudes()
-    state = np.array([.70710678, 0., 0., 0.70710678],
-                     dtype=cudaq.complex()) 
+    state = np.array([.70710678, 0., 0., 0.70710678], dtype=cudaq.complex())
     kernel2 = cudaq.make_kernel()
     qubits = kernel2.qalloc(state)
     counts = cudaq.sample(kernel2)
@@ -986,7 +988,7 @@ def test_from_state1():
     assert '11' in counts
     assert '00' in counts
 
-    state = cudaq.amplitudes(np.array([.5]*4))
+    state = cudaq.amplitudes(np.array([.5] * 4))
     kernel2 = cudaq.make_kernel()
     qubits = kernel2.qalloc(state)
     counts = cudaq.sample(kernel2)
@@ -1358,6 +1360,38 @@ def test_u3_ctrl():
     assert ('11' in counts)
 
 
+@skipIfNvidiaFP64NotInstalled
+def test_builder_rotate_state():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+    # Amplitudes of the 2-qubit basis state `11`, passed at sample time.
+    c = [0., 0., 0., 1.]
+
+    # Our kernel will start with 2 qubits in `11`, then
+    # rotate each qubit back to `0` before applying a
+    # Hadamard gate.
+    kernel, state = cudaq.make_kernel(list[complex])
+    q = kernel.qalloc(state)
+
+    # Can now operate on the qvector as usual:
+    # Rotate state of the front qubit 180 degrees along X.
+    kernel.x(q[0])
+    # Rotate state of the back qubit 180 degrees along Y.
+    kernel.y(q[1])
+    # Put qubits into superposition state.
+    kernel.h(q)
+
+    # Measure.
+    kernel.mz(q)
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+    assert '01' in counts
+    assert '10' in counts
+
+
 # leave for gdb debugging
 if __name__ == "__main__":
     loc = os.path.abspath(__file__)
diff --git a/python/tests/kernel/test_kernel_state_init.py b/python/tests/kernel/test_kernel_state_init.py
new file mode 100644
index 0000000000..ddaeb6cc4d
--- /dev/null
+++ b/python/tests/kernel/test_kernel_state_init.py
@@ -0,0 +1,671 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+import pytest
+
+import cudaq
+import numpy as np
+
+skipIfNvidiaFP64NotInstalled = pytest.mark.skipif(
+    not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia-fp64')),
+    reason='Could not find nvidia-fp64 in installation')
+
+skipIfNvidiaNotInstalled = pytest.mark.skipif(
+    not (cudaq.num_available_gpus() > 0 and cudaq.has_target('nvidia')),
+    reason='Could not find nvidia in installation')
+
+
+# float
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_float_params_f64():
+
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[float]):
+        q = cudaq.qvector(vec)
+
+    counts = cudaq.sample(kernel, f)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_float_params_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[float]):
+        q = cudaq.qvector(vec)
+
+    counts = cudaq.sample(kernel, f)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_float_capture_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(f)
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_float_capture_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(f)
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_float_np_array_from_capture_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(np.array(f))
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_float_np_array_from_capture_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    f = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(np.array(f))
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_float_definition_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_float_definition_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+# complex
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_complex_params_rotate_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [0. + 0j, 0., 0., 1.]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(vec)
+        x(q.front())
+        y(q.back())
+        h(q)
+        mz(q)
+
+    counts = cudaq.sample(kernel, c)
+    print(f'rotate: {counts}')
+    assert '11' in counts
+    assert '00' in counts
+    assert '01' in counts
+    assert '10' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_complex_params_rotate_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [0. + 0j, 0., 0., 1.]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(vec)
+        x(q.front())
+        y(q.back())
+        h(q)
+        mz(q)
+
+    counts = cudaq.sample(kernel, c)
+    print(f'rotate: {counts}')
+    assert '11' in counts
+    assert '00' in counts
+    assert '01' in counts
+    assert '10' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_complex_params_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(vec)
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_complex_params_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(vec)
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_complex_capture_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(c)
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_complex_capture_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(c)
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_complex_np_array_from_capture_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(np.array(c))
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_complex_np_array_from_capture_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(np.array(c))
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_complex_definition_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)])
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_complex_definition_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector([1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)])
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+# np arrays with an explicit dtype
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_dtype_complex_params_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(np.array(vec, dtype=complex))
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_dtype_complex128_params_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(np.array(vec, dtype=np.complex128))
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_dtype_complex64_params_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(np.array(vec, dtype=np.complex64))
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+# dtype matching the simulation precision
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_simulation_dtype_complex_params_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(np.array(vec, dtype=cudaq.complex()))
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_simulation_dtype_complex_params_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(np.array(vec, dtype=cudaq.complex()))
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_amplitudes_complex_params_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(vec)
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_amplitudes_complex_params_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = cudaq.amplitudes([1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)])
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(vec)
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_amplitudes_complex_from_capture_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.), 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(cudaq.amplitudes(vec))
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_amplitudes_complex_from_capture_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel(vec: list[complex]):
+        q = cudaq.qvector(cudaq.amplitudes(vec))
+
+    counts = cudaq.sample(kernel, c)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_simulation_dtype_np_array_from_capture_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(np.array(c, dtype=cudaq.complex()))
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_simulation_dtype_np_array_from_capture_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(np.array(c, dtype=cudaq.complex()))
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_simulation_dtype_np_array_capture_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    state = np.array(c, dtype=cudaq.complex())
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(state)
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_simulation_dtype_np_array_capture_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    c = [1. / np.sqrt(2.) + 0j, 0., 0., 1. / np.sqrt(2.)]
+
+    state = np.array(c, dtype=cudaq.complex())
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(state)
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' in counts
+    assert '00' in counts
+
+
+# test errors
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_error_invalid_array_size_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    @cudaq.kernel
+    def kernel():
+        qubits = cudaq.qvector(np.array([1., 0., 0.], dtype=complex))
+
+    with pytest.raises(RuntimeError) as e:
+        cudaq.sample(kernel)
+    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
+        e)
+
+
+@skipIfNvidiaFP64NotInstalled
+def test_kernel_error_invalid_list_size_f64():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia-fp64')
+
+    @cudaq.kernel
+    def kernel():
+        qubits = cudaq.qvector([1., 0., 0.])
+
+    with pytest.raises(RuntimeError) as e:
+        cudaq.sample(kernel)
+    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
+        e)
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_error_invalid_array_size_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    @cudaq.kernel
+    def kernel():
+        qubits = cudaq.qvector(np.array([1., 0., 0.], dtype=complex))
+
+    with pytest.raises(RuntimeError) as e:
+        cudaq.sample(kernel)
+    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
+        e)
+
+
+@skipIfNvidiaNotInstalled
+def test_kernel_error_invalid_list_size_f32():
+    cudaq.reset_target()
+    cudaq.set_target('nvidia')
+
+    @cudaq.kernel
+    def kernel():
+        qubits = cudaq.qvector([1., 0., 0.])
+
+    with pytest.raises(RuntimeError) as e:
+        cudaq.sample(kernel)
+    assert 'Invalid input state size for qvector init (not a power of 2)' in repr(
+        e)
+
+
+def test_kernel_qvector_init_from_param_int():
+
+    @cudaq.kernel
+    def kernel(n: int):
+        q = cudaq.qvector(n)
+
+    counts = cudaq.sample(kernel, 2)
+    print(counts)
+    assert '11' not in counts
+    assert '10' not in counts
+    assert '01' not in counts
+    assert '00' in counts
+
+
+def test_kernel_qvector_init_from_capture_int():
+    n = 2
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(n)
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' not in counts
+    assert '10' not in counts
+    assert '01' not in counts
+    assert '00' in counts
+
+
+def test_kernel_qvector_init_from_int():
+
+    @cudaq.kernel
+    def kernel():
+        q = cudaq.qvector(2)
+
+    counts = cudaq.sample(kernel)
+    print(counts)
+    assert '11' not in counts
+    assert '10' not in counts
+    assert '01' not in counts
+    assert '00' in counts