diff --git a/python/lib.c b/python/lib.c index bc3a0edc..13450efd 100644 --- a/python/lib.c +++ b/python/lib.c @@ -35,7 +35,7 @@ * ... * @endcode * - * This `cdist` example takes 2 positional, 1 postional or named, 3 named-only arguments. + * This `cdist` example takes 2 positional, 1 positional or named, 3 named-only arguments. * The alternative using the `METH_FASTCALL` is to use a function signature like: * * @code {.c} @@ -76,7 +76,7 @@ * * ! ValueError: cannot include dtype 'E' in a buffer * - * Moreover, the CPython documentation and the NumPy documentation diverge on the format specificers + * Moreover, the CPython documentation and the NumPy documentation diverge on the format specifiers * for the `typestr` and `format` data-type descriptor strings, making the development error-prone. * At this point, SimSIMD seems to be @b the_only_package that at least attempts to provide interoperability. * @@ -120,6 +120,7 @@ typedef struct BufferOrScalarArgument { BufferKind, } kind; /// Populated only for `ScalarKind` and `ScalarBufferKind` kinds. + simsimd_f64_t as_f64; simsimd_u8_t as_scalar[8]; /// The address of the buffer start, if the kind is `BufferKind` or `ScalarBufferKind`. /// Alternatively, points to the `&as_scalar` field for `ScalarKind` kind. @@ -136,7 +137,7 @@ typedef struct BufferOrScalarArgument { } BufferOrScalarArgument; -/// @brief Minimalistics and space-efficient representation of a rank-1 or rank-2 output tensor. +/// @brief Minimalistic and space-efficient representation of a rank-1 or rank-2 output tensor. typedef struct DistancesTensor { PyObject_HEAD // simsimd_datatype_t datatype; // Any SimSIMD numeric type @@ -157,7 +158,7 @@ typedef struct NDArray { simsimd_distance_t start[]; // Variable length data aligned to 64-bit scalars } NDArray; -/// @brief Faster alternative to NumPy's `ndindex` object, supporting just as amny dimensions, +/// @brief Faster alternative to NumPy's `ndindex` object, supporting just as many dimensions, /// as the `NDArray` object. typedef struct NDIndex { PyObject_HEAD // @@ -638,33 +639,81 @@ int parse_tensor(PyObject *tensor, Py_buffer *buffer, simsimd_datatype_t *dtype) int parse_buffer_or_scalar_argument(PyObject *obj, Py_buffer *buffer, BufferOrScalarArgument *parsed) { if (PyFloat_Check(obj)) { - // Return a C `double` representation of the contents of `obj`. - // If `obj` is not a Python floating-point object but has a `__float__()` method, - // this method will first be called to convert `obj` into a float. - simsimd_f64_t as_float = PyFloat_AsDouble(obj); - memcpy(&parsed->as_scalar, &as_float, sizeof(simsimd_f64_t)); parsed->kind = ScalarKind; - parsed->datatype = simsimd_datatype_f64_k; parsed->as_buffer_start = (char *)&parsed->as_scalar; parsed->as_buffer_dimensions = 1; parsed->as_buffer_shape[0] = 1; parsed->as_buffer_strides[0] = 0; + // Return a C `double` representation of the contents of `obj`. + // If `obj` is not a Python floating-point object but has a `__float__()` method, + // this method will first be called to convert `obj` into a float. + simsimd_f64_t as_float = PyFloat_AsDouble(obj); + parsed->as_f64 = as_float; + // Check if we convert to a smaller floating-point type, without the loss of precision. 
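+        // The round-trip test `x == (f64)(smaller)x` admits only lossless narrowings:
+        // e.g. 2.5 and -1.0 survive the f64 -> f16 -> f64 round-trip and are stored as `f16`,
+        // while 0.1 has no finite binary expansion, fails both checks, and keeps the full `f64`.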
+ if (SIMSIMD_NATIVE_F16 && as_float == (simsimd_f64_t)(simsimd_f16_t)as_float) { + simsimd_f16_t as_f16 = (simsimd_f16_t)as_float; + memcpy(parsed->as_scalar, &as_f16, sizeof(simsimd_f16_t)); + parsed->datatype = simsimd_datatype_f16_k; + } + else if (as_float == (simsimd_f64_t)(simsimd_f32_t)as_float) { + simsimd_f32_t as_f32 = (simsimd_f32_t)as_float; + memcpy(parsed->as_scalar, &as_f32, sizeof(simsimd_f32_t)); + parsed->datatype = simsimd_datatype_f32_k; + } + else { + memcpy(parsed->as_scalar, &as_float, sizeof(simsimd_f64_t)); + parsed->datatype = simsimd_datatype_f64_k; + } return 1; } else if (PyLong_Check(obj)) { int did_overflow = 0; - simsimd_i64_t as_int = PyLong_AsLongLongAndOverflow(obj, &did_overflow); + simsimd_i64_t as_integral = PyLong_AsLongLongAndOverflow(obj, &did_overflow); if (did_overflow) { PyErr_SetString(PyExc_ValueError, "Integer overflow"); return 0; } - memcpy(&parsed->as_scalar, &as_int, sizeof(simsimd_i64_t)); parsed->kind = ScalarKind; - parsed->datatype = simsimd_datatype_i64_k; parsed->as_buffer_start = (char *)&parsed->as_scalar; parsed->as_buffer_dimensions = 1; parsed->as_buffer_shape[0] = 1; parsed->as_buffer_strides[0] = 0; + parsed->as_f64 = as_integral; + // Check for smaller unsigned integer types and store in `as_scalar` + if (as_integral == (simsimd_u64_t)(simsimd_u8_t)as_integral) { + simsimd_u8_t as_u8 = (simsimd_u8_t)as_integral; + memcpy(parsed->as_scalar, &as_u8, sizeof(simsimd_u8_t)); + parsed->datatype = simsimd_datatype_u8_k; + } + else if (as_integral == (simsimd_u64_t)(simsimd_u16_t)as_integral) { + simsimd_u16_t as_u16 = (simsimd_u16_t)as_integral; + memcpy(parsed->as_scalar, &as_u16, sizeof(simsimd_u16_t)); + parsed->datatype = simsimd_datatype_u16_k; + } + else if (as_integral == (simsimd_u64_t)(simsimd_u32_t)as_integral) { + simsimd_u32_t as_u32 = (simsimd_u32_t)as_integral; + memcpy(parsed->as_scalar, &as_u32, sizeof(simsimd_u32_t)); + parsed->datatype = simsimd_datatype_u32_k; + } + else if (as_integral == (simsimd_i64_t)(simsimd_i8_t)as_integral) { + simsimd_i8_t as_i8 = (simsimd_i8_t)as_integral; + memcpy(parsed->as_scalar, &as_i8, sizeof(simsimd_i8_t)); + parsed->datatype = simsimd_datatype_i8_k; + } + else if (as_integral == (simsimd_i64_t)(simsimd_i16_t)as_integral) { + simsimd_i16_t as_i16 = (simsimd_i16_t)as_integral; + memcpy(parsed->as_scalar, &as_i16, sizeof(simsimd_i16_t)); + parsed->datatype = simsimd_datatype_i16_k; + } + else if (as_integral == (simsimd_i64_t)(simsimd_i32_t)as_integral) { + simsimd_i32_t as_i32 = (simsimd_i32_t)as_integral; + memcpy(parsed->as_scalar, &as_i32, sizeof(simsimd_i32_t)); + parsed->datatype = simsimd_datatype_i32_k; + } + else { + memcpy(parsed->as_scalar, &as_integral, sizeof(simsimd_i64_t)); + parsed->datatype = simsimd_datatype_i64_k; + } return 1; } else if (PyObject_CheckBuffer(obj)) { @@ -998,6 +1047,7 @@ static PyObject *implement_dense_metric( // PyBuffer_Release(&a_buffer); PyBuffer_Release(&b_buffer); PyBuffer_Release(&out_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } @@ -1127,6 +1177,7 @@ static PyObject *implement_curved_metric( // PyBuffer_Release(&a_buffer); PyBuffer_Release(&b_buffer); PyBuffer_Release(&c_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } @@ -1181,6 +1232,7 @@ static PyObject *implement_sparse_metric( // cleanup: PyBuffer_Release(&a_buffer); PyBuffer_Release(&b_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } @@ -1379,6 +1431,7 @@ static PyObject 
*implement_cdist( // PyBuffer_Release(&a_buffer); PyBuffer_Release(&b_buffer); PyBuffer_Release(&out_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } @@ -1994,6 +2047,7 @@ static PyObject *api_scale(PyObject *self, PyObject *const *args, Py_ssize_t con cleanup: PyBuffer_Release(&a_buffer); PyBuffer_Release(&out_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } @@ -2156,6 +2210,7 @@ static PyObject *api_sum(PyObject *self, PyObject *const *args, Py_ssize_t const PyBuffer_Release(&a_buffer); PyBuffer_Release(&b_buffer); PyBuffer_Release(&out_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } @@ -2333,6 +2388,7 @@ static PyObject *api_wsum(PyObject *self, PyObject *const *args, Py_ssize_t cons PyBuffer_Release(&a_buffer); PyBuffer_Release(&b_buffer); PyBuffer_Release(&out_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } @@ -2518,13 +2574,14 @@ static PyObject *api_fma(PyObject *self, PyObject *const *args, Py_ssize_t const PyBuffer_Release(&b_buffer); PyBuffer_Release(&c_buffer); PyBuffer_Release(&out_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } typedef void (*binary_kernel_t)(void const *, void const *, void *); typedef void (*unary_kernel_t)(void const *, void *); -void implementation_elementwise_binary_tensor_operation( // +void apply_elementwise_binary_operation_to_each_scalar( // BufferOrScalarArgument const *a_parsed, BufferOrScalarArgument const *b_parsed, BufferOrScalarArgument const *out_parsed, // binary_kernel_t elementwise_kernel) { @@ -2570,7 +2627,7 @@ void implementation_elementwise_binary_tensor_operation( // } } -void implementation_elementwise_binary_mixed_tensor_operation( // +void apply_elementwise_casting_binary_operation_to_each_scalar( // BufferOrScalarArgument const *a_parsed, BufferOrScalarArgument const *b_parsed, BufferOrScalarArgument const *out_parsed, // unary_kernel_t a_upcast_kernel, unary_kernel_t b_upcast_kernel, // @@ -2621,7 +2678,7 @@ void implementation_elementwise_binary_mixed_tensor_operation( // } } -void implementation_vectorized_binary_tensor_operation( // +void apply_elementwise_binary_operation_to_each_continuous_slice( // BufferOrScalarArgument const *a_parsed, BufferOrScalarArgument const *b_parsed, BufferOrScalarArgument const *out_parsed, // Py_ssize_t const non_continuous_ranks, // @@ -2669,6 +2726,50 @@ void implementation_vectorized_binary_tensor_operation( // } } +void apply_scale_to_each_continuous_slice( // + BufferOrScalarArgument const *input_parsed, simsimd_f64_t alpha, simsimd_f64_t beta, // + BufferOrScalarArgument const *out_parsed, // + Py_ssize_t const non_continuous_ranks, // + Py_ssize_t const continuous_elements, // + void (*binary_kernel)(void const *, simsimd_size_t, simsimd_f64_t, simsimd_f64_t, void *)) { + + // The hardest part of this operations is addressing the elements in a non-continuous tensor of arbitrary rank. + // While iteratively deepening into the lower layers of the tensor, we need to keep track of the byte offsets + // for each dimension to avoid recomputing them in the inner loops. 
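+    // The loop below behaves like an odometer: `coordinates[dim]` is incremented and `strides[dim]`
+    // bytes are added; once it reaches `shape[dim]` it wraps back to zero, `shape[dim] * strides[dim]`
+    // bytes are subtracted, and the carry moves to the next outer dimension, so the byte offsets are
+    // maintained with additions and subtractions only, never per-element multiplications.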
+ simsimd_mdindices_t input_mdindices, out_mdindices; + memset(&input_mdindices, 0, sizeof(simsimd_mdindices_t)); + memset(&out_mdindices, 0, sizeof(simsimd_mdindices_t)); + + // Start from last dimension and move backward, replicating the logic + // of `simsimd_mdindices_next`, broadcasting the same update logic across the + // indexes in all three tensors, and avoiding additional branches inside the loops. + while (1) { + // Invoke the provided kernel at the current byte offsets + binary_kernel(input_parsed->as_buffer_start + input_mdindices.byte_offset, continuous_elements, alpha, beta, + out_parsed->as_buffer_start + out_mdindices.byte_offset); + + // Advance to the next index + Py_ssize_t dim; + for (dim = non_continuous_ranks - 1; dim >= 0; --dim) { + out_mdindices.coordinates[dim]++; + out_mdindices.byte_offset += out_parsed->as_buffer_strides[dim]; + input_mdindices.byte_offset += input_parsed->as_buffer_strides[dim]; + + // Successfully moved to the next index in this dimension + if (out_mdindices.coordinates[dim] < out_parsed->as_buffer_shape[dim]) break; + else { + // Reset coordinates and byte offset for this dimension + out_mdindices.coordinates[dim] = 0; + out_mdindices.byte_offset -= out_parsed->as_buffer_strides[dim] * out_parsed->as_buffer_shape[dim]; + input_mdindices.byte_offset -= input_parsed->as_buffer_strides[dim] * out_parsed->as_buffer_shape[dim]; + } + } + + // If we've processed all dimensions, we're done + if (dim < 0) break; + } +} + static binary_kernel_t elementwise_sadd(simsimd_datatype_t dtype) { switch (dtype) { case simsimd_datatype_u64_k: return (binary_kernel_t)&_simsimd_u64_sadd; @@ -2801,7 +2902,6 @@ static char const doc_add[] = // "Args:\n" " a (Union[NDArray, float, int]): First Tensor or scalar.\n" " b (Union[NDArray, float, int]): Second Tensor or scalar.\n" - " dtype (Union[IntegralType, FloatType], optional): Override the presumed numeric type of all tensors.\n" " out (NDArray, optional): Tensor for resulting distances.\n" " a_dtype (Union[IntegralType, FloatType], optional): Override the presumed numeric type of `a`.\n" " b_dtype (Union[IntegralType, FloatType], optional): Override the presumed numeric type of `b`.\n" @@ -2812,7 +2912,7 @@ static char const doc_add[] = // "\n" "Equivalent to: `a + b`, but unlike the `sum` API supports scalar arguments.\n" "Signature:\n" - " >>> def add(a, b, /, dtype, *, out, a_dtype, b_dtype, out_dtype) -> Optional[DistancesTensor]: ...\n" + " >>> def add(a, b, /, *, out, a_dtype, b_dtype, out_dtype) -> Optional[DistancesTensor]: ...\n" "\n" "Performance recommendations:\n" " - Provide an output tensor to avoid memory allocations.\n" @@ -2847,20 +2947,22 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const PyObject *return_obj = NULL; - // This function accepts up to 5 arguments: - PyObject *a_obj = NULL; // Required object, positional-only - PyObject *b_obj = NULL; // Required object, positional-only - PyObject *dtype_obj = NULL; // Optional object, "dtype" keyword or positional - PyObject *out_obj = NULL; // Optional object, "out" keyword-only + // This function accepts up to 6 arguments: + PyObject *a_obj = NULL; // Required object, positional-only + PyObject *b_obj = NULL; // Required object, positional-only + PyObject *out_obj = NULL; // Optional object, "out" keyword-only + PyObject *a_dtype_obj = NULL; // Optional object, "a_dtype" keyword-only + PyObject *b_dtype_obj = NULL; // Optional object, "b_dtype" keyword-only + PyObject *out_dtype_obj = NULL; // Optional object, 
"out_dtype" keyword-only Py_ssize_t const args_names_count = args_names_tuple ? PyTuple_Size(args_names_tuple) : 0; Py_ssize_t const args_count = positional_args_count + args_names_count; - if (args_count < 2 || args_count > 4) { - PyErr_Format(PyExc_TypeError, "Function expects 2-4 arguments, got %zd", args_count); + if (args_count < 2 || args_count > 6) { + PyErr_Format(PyExc_TypeError, "Function expects 2-6 arguments, got %zd", args_count); return NULL; } - if (positional_args_count > 3) { - PyErr_Format(PyExc_TypeError, "Only first 3 arguments can be positional, received %zd", positional_args_count); + if (positional_args_count > 2) { + PyErr_Format(PyExc_TypeError, "Only first 2 arguments can be positional, received %zd", positional_args_count); return NULL; } @@ -2868,15 +2970,14 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const a_obj = args[0]; b_obj = args[1]; - // Positional or keyword arguments (dtype) - if (positional_args_count == 3) dtype_obj = args[2]; - // The rest of the arguments must be checked in the keyword dictionary: for (Py_ssize_t args_names_tuple_progress = 0, args_progress = positional_args_count; args_names_tuple_progress < args_names_count; ++args_progress, ++args_names_tuple_progress) { PyObject *const key = PyTuple_GetItem(args_names_tuple, args_names_tuple_progress); PyObject *const value = args[args_progress]; - if (PyUnicode_CompareWithASCIIString(key, "dtype") == 0 && !dtype_obj) { dtype_obj = value; } + if (PyUnicode_CompareWithASCIIString(key, "a_dtype") == 0 && !a_dtype_obj) { a_dtype_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "b_dtype") == 0 && !b_dtype_obj) { b_dtype_obj = value; } + else if (PyUnicode_CompareWithASCIIString(key, "out_dtype") == 0 && !out_dtype_obj) { out_dtype_obj = value; } else if (PyUnicode_CompareWithASCIIString(key, "out") == 0 && !out_obj) { out_obj = value; } else { PyErr_Format(PyExc_TypeError, "Got unexpected keyword argument: %S", key); @@ -2884,18 +2985,19 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const } } - // Convert `dtype_obj` to `dtype_str` and to `dtype` - char const *dtype_str = NULL; - simsimd_datatype_t dtype = simsimd_datatype_unknown_k; - if (dtype_obj) { - dtype_str = PyUnicode_AsUTF8(dtype_obj); - if (!dtype_str && PyErr_Occurred()) { - PyErr_SetString(PyExc_TypeError, "Expected 'dtype' to be a string"); - return NULL; - } - dtype = python_string_to_datatype(dtype_str); - if (dtype == simsimd_datatype_unknown_k) { - PyErr_SetString(PyExc_ValueError, "Unsupported 'dtype'"); + // Convert `a_dtype_obj` to `a_dtype_str` and to `a_dtype` + char const *a_dtype_str = NULL, *b_dtype_str = NULL, *out_dtype_str = NULL; + simsimd_datatype_t a_dtype = simsimd_datatype_unknown_k, b_dtype = simsimd_datatype_unknown_k, + out_dtype = simsimd_datatype_unknown_k; + if (a_dtype_obj) a_dtype_str = PyUnicode_AsUTF8(a_dtype_obj), a_dtype = python_string_to_datatype(a_dtype_str); + if (b_dtype_obj) b_dtype_str = PyUnicode_AsUTF8(b_dtype_obj), b_dtype = python_string_to_datatype(b_dtype_str); + if (out_dtype_obj) + out_dtype_str = PyUnicode_AsUTF8(out_dtype_obj), out_dtype = python_string_to_datatype(out_dtype_str); + if ((a_dtype_obj || b_dtype_obj || out_dtype_obj) && (!a_dtype_str && !b_dtype_str && !out_dtype_str)) { + if (PyErr_Occurred()) return NULL; + if (a_dtype == simsimd_datatype_unknown_k && b_dtype == simsimd_datatype_unknown_k && + out_dtype == simsimd_datatype_unknown_k) { + PyErr_SetString(PyExc_ValueError, "Unsupported 
'a_dtype'"); return NULL; } } @@ -2916,40 +3018,13 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const if (!parse_buffer_or_scalar_argument(a_obj, &a_buffer, &a_parsed)) goto cleanup; if (!parse_buffer_or_scalar_argument(b_obj, &b_buffer, &b_parsed)) goto cleanup; if (out_obj && !parse_tensor(out_obj, &out_buffer, &out_parsed.datatype)) goto cleanup; - if (dtype == simsimd_datatype_unknown_k) - dtype = a_parsed.kind != ScalarKind ? a_parsed.datatype : b_parsed.datatype; - - // If we are only provided with scalars, avoid all future shenanigans: - if (a_parsed.kind == ScalarKind && b_parsed.kind == ScalarKind) { - simsimd_f64_t a_as_double = *(simsimd_f64_t *)&a_parsed.as_scalar[0]; - simsimd_f64_t b_as_double = *(simsimd_f64_t *)&b_parsed.as_scalar[0]; - if (!out_obj) { - int result_is_int = PyLong_Check(a_obj) && PyLong_Check(b_obj); - if (result_is_int) { return PyLong_FromLong(a_as_double + b_as_double); } - else { return PyFloat_FromDouble(a_as_double + b_as_double); } - } - else { - // Make sure the output tensor is a scalar, i.e. contains a single element - if (out_buffer.len != out_buffer.itemsize) { - PyErr_SetString(PyExc_ValueError, "Output tensor must be a scalar"); - goto cleanup; - } - if (!cast_distance(a_as_double + b_as_double, out_parsed.datatype, out_buffer.buf, 0)) { - PyErr_SetString(PyExc_ValueError, "Unsupported output datatype"); - goto cleanup; - } - else { - return_obj = Py_None; - goto cleanup; - } - } - } + // Check dimensions, but unlike the `sum`, `scale`, `wsum`, and `fma` APIs // we want to provide maximal compatibility with NumPy and OpenCV. In many // such cases, the input is not a rank-1 tensor and may not be continuous. Py_ssize_t ins_continuous_dimensions = 0; Py_ssize_t ins_continuous_elements = 1; - if (a_parsed.kind == BufferKind && b_parsed.kind == BufferKind) { + if (a_parsed.kind != ScalarKind && b_parsed.kind != ScalarKind) { //! The ranks of tensors may not match! // We need to compare them in reverse order, right to left, assuming all the missing dimensions are 1. 
// To match those, we are going to populate the `a_parsed.as_buffer_shape` and `b_parsed.as_buffer_shape`, @@ -3003,6 +3078,7 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const "The number of entries %zd (along dimension %zd in the first tensor) doesn't " "match the number of entries %zd (along dimension %zd in the second tensor)", a_dim, a_buffer.ndim - 1 - i, b_dim, b_buffer.ndim - 1 - i); + return_obj = NULL; goto cleanup; } } @@ -3030,9 +3106,8 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const } } } - // If at least one of the entries is actually is of `ScalarKind` or `ScalarBufferKind`, - // our logic becomes much easier: - else if (a_parsed.kind == BufferKind) { + // If at least one of the entries is actually is a `ScalarKind` our logic becomes much easier: + else if (a_parsed.kind != ScalarKind) { a_parsed.as_buffer_dimensions = a_buffer.ndim; memcpy(a_parsed.as_buffer_shape, a_buffer.shape, a_buffer.ndim * sizeof(Py_ssize_t)); memcpy(a_parsed.as_buffer_strides, a_buffer.strides, a_buffer.ndim * sizeof(Py_ssize_t)); @@ -3059,6 +3134,7 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const if (out_buffer.ndim != out_parsed.as_buffer_dimensions) { PyErr_Format(PyExc_ValueError, "Output tensor has rank-%zd, but rank-%zd is expected", out_buffer.ndim, out_parsed.as_buffer_dimensions); + return_obj = NULL; goto cleanup; } for (Py_ssize_t i = 0; i < out_parsed.as_buffer_dimensions; ++i) { @@ -3066,56 +3142,130 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const PyErr_Format(PyExc_ValueError, "Output tensor doesn't match the input tensor in shape at dimension %zd: %zd != %zd", i, out_buffer.shape[i], out_parsed.as_buffer_shape[i]); + return_obj = NULL; goto cleanup; } } } - // Infer the output tensor shape if it wasn't provided - else { - // For addition and multiplication, treat complex numbers as floats - simsimd_datatype_family_k a_kind = simsimd_datatype_family(a_parsed.datatype); - simsimd_datatype_family_k b_kind = simsimd_datatype_family(b_parsed.datatype); - if (a_kind == simsimd_datatype_complex_float_family_k) a_kind = simsimd_datatype_float_family_k; - if (b_kind == simsimd_datatype_complex_float_family_k) b_kind = simsimd_datatype_float_family_k; - if (a_kind == simsimd_datatype_binary_family_k || b_kind == simsimd_datatype_binary_family_k) { - PyErr_SetString(PyExc_ValueError, "Boolean tensors are not supported in element-wise operations"); - goto cleanup; - } + // First we need to understand the smallest numeric type we may need for this computation. + simsimd_datatype_t ab_dtype; + simsimd_datatype_family_k a_family = simsimd_datatype_family(a_parsed.datatype); + simsimd_datatype_family_k b_family = simsimd_datatype_family(b_parsed.datatype); + // For addition and multiplication, treat complex numbers as floats + if (a_family == simsimd_datatype_complex_float_family_k) a_family = simsimd_datatype_float_family_k; + if (b_family == simsimd_datatype_complex_float_family_k) b_family = simsimd_datatype_float_family_k; + if (a_family == simsimd_datatype_binary_family_k || b_family == simsimd_datatype_binary_family_k) { + PyErr_SetString(PyExc_ValueError, "Boolean tensors are not supported in element-wise operations"); + return_obj = NULL; + goto cleanup; + } + // Infer the type-casting rules for the output tensor, if the type was not provided. 
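+    // `ab_dtype` is the common compute type, generally the smallest type wide enough to hold either
+    // operand losslessly; it drives the kernel lookup below and is used as the dtype of a freshly
+    // allocated output tensor when `out` is not provided.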
+ { size_t a_itemsize = bytes_per_datatype(a_parsed.datatype); size_t b_itemsize = bytes_per_datatype(b_parsed.datatype); size_t max_itemsize = a_itemsize > b_itemsize ? a_itemsize : b_itemsize; - if (a_parsed.datatype == b_parsed.datatype) { out_parsed.datatype = a_parsed.datatype; } + if (a_parsed.datatype == b_parsed.datatype) { ab_dtype = a_parsed.datatype; } // Simply take the bigger datatype if they are both floats or both are unsigned integers, etc. - else if (a_kind == b_kind) { - out_parsed.datatype = a_itemsize > b_itemsize ? a_parsed.datatype : b_parsed.datatype; - } - // If only one of the operands is a float, the output should be a float, of the next size... - // Sum of `float16` and `int32` is a `float64`. Sum of `float16` and `int16` is a `float32`. - else if (a_kind == simsimd_datatype_float_family_k || b_kind == simsimd_datatype_float_family_k) { - //? No 128-bit float on most platforms - if (max_itemsize == 8) { out_parsed.datatype = simsimd_datatype_f64_k; } - else if (max_itemsize == 4) { out_parsed.datatype = simsimd_datatype_f64_k; } - else if (max_itemsize == 2) { out_parsed.datatype = simsimd_datatype_f32_k; } - else if (max_itemsize == 1) { out_parsed.datatype = simsimd_datatype_f16_k; } - } - // If only one of the operands is a signed integer, the output should be a signed integer, of the next - // size... Sum of `int16` and `uint32` is a `int64`. Sum of `int16` and `uint16` is a `int32`. - else if (a_kind == simsimd_datatype_int_family_k || b_kind == simsimd_datatype_int_family_k) { - //? No 128-bit integer on most platforms - if (max_itemsize == 8) { out_parsed.datatype = simsimd_datatype_i64_k; } - else if (max_itemsize == 4) { out_parsed.datatype = simsimd_datatype_i64_k; } - else if (max_itemsize == 2) { out_parsed.datatype = simsimd_datatype_i32_k; } - else if (max_itemsize == 1) { out_parsed.datatype = simsimd_datatype_i16_k; } + else if (a_family == b_family) { ab_dtype = a_itemsize > b_itemsize ? a_parsed.datatype : b_parsed.datatype; } + // If only one of the operands is a float, and the second is integral of same size, the output should be a + // float, of the next size... If the floating type is bigger, don't upcast. + // Sum of `float16` and `int32` is a `float64`. + // Sum of `float16` and `int16` is a `float32`. + // Sum of `float32` and `int8` is a `float32`. + else if (a_family == simsimd_datatype_float_family_k || b_family == simsimd_datatype_float_family_k) { + size_t float_size = a_family == simsimd_datatype_float_family_k ? a_itemsize : b_itemsize; + size_t integral_size = a_family == simsimd_datatype_float_family_k ? b_itemsize : a_itemsize; + if (float_size <= integral_size) { + //? No 128-bit float on most platforms + if (max_itemsize == 8) { ab_dtype = simsimd_datatype_f64_k; } + else if (max_itemsize == 4) { ab_dtype = simsimd_datatype_f64_k; } + else if (max_itemsize == 2) { ab_dtype = simsimd_datatype_f32_k; } + else if (max_itemsize == 1) { ab_dtype = simsimd_datatype_f16_k; } + } + else { + if (max_itemsize == 8) { ab_dtype = simsimd_datatype_f64_k; } + else if (max_itemsize == 4) { ab_dtype = simsimd_datatype_f32_k; } + else if (max_itemsize == 2) { ab_dtype = simsimd_datatype_f16_k; } + else if (max_itemsize == 1) { ab_dtype = simsimd_datatype_f16_k; } + } + } + // If only one of the operands is a unsigned, and the second is a signed integral of same size, + // the output should be a signed integer, of the next size... If the signed type is bigger, don't upcast. + // Sum of `int16` and `uint32` is a `int64`. 
+ // Sum of `int16` and `uint16` is a `int32`. + // Sum of `int32` and `uint8` is a `int32`. + else if (a_family == simsimd_datatype_int_family_k || b_family == simsimd_datatype_int_family_k) { + size_t unsigned_size = a_family == simsimd_datatype_int_family_k ? b_itemsize : a_itemsize; + size_t signed_size = a_family == simsimd_datatype_int_family_k ? a_itemsize : b_itemsize; + if (signed_size <= unsigned_size) { + //? No 128-bit integer on most platforms + if (max_itemsize == 8) { ab_dtype = simsimd_datatype_i64_k; } + else if (max_itemsize == 4) { ab_dtype = simsimd_datatype_i64_k; } + else if (max_itemsize == 2) { ab_dtype = simsimd_datatype_i32_k; } + else if (max_itemsize == 1) { ab_dtype = simsimd_datatype_i16_k; } + } + else { + if (max_itemsize == 8) { ab_dtype = simsimd_datatype_i64_k; } + else if (max_itemsize == 4) { ab_dtype = simsimd_datatype_i32_k; } + else if (max_itemsize == 2) { ab_dtype = simsimd_datatype_i16_k; } + else if (max_itemsize == 1) { ab_dtype = simsimd_datatype_i16_k; } + } } // For boolean and complex types, we don't yet have a clear policy. else { PyErr_SetString(PyExc_ValueError, "Unsupported combination of datatypes"); + return_obj = NULL; goto cleanup; } - dtype = out_parsed.datatype; } + // If dealing with scalars, consider up-casting them to the same `ab_dtype` datatype. + // That way we will be use the `apply_elementwise_binary_operation_to_each_scalar` function + // instead of the `apply_elementwise_casting_binary_operation_to_each_scalar` function. + simsimd_datatype_family_k ab_family = simsimd_datatype_family(ab_dtype); + if (a_parsed.kind != BufferKind && b_parsed.kind != BufferKind && a_parsed.datatype != b_parsed.datatype) { + if (ab_family == simsimd_datatype_float_family_k) { + if (a_parsed.datatype != ab_dtype) { + elementwise_upcast_to_f64(a_parsed.datatype)(a_parsed.as_scalar, a_parsed.as_scalar); + elementwise_downcast_from_f64(ab_dtype)(a_parsed.as_scalar, a_parsed.as_scalar); + a_parsed.datatype = ab_dtype; + } + if (b_parsed.datatype != ab_dtype) { + elementwise_upcast_to_f64(b_parsed.datatype)(b_parsed.as_scalar, b_parsed.as_scalar); + elementwise_downcast_from_f64(ab_dtype)(b_parsed.as_scalar, b_parsed.as_scalar); + b_parsed.datatype = ab_dtype; + } + } + else if (ab_family == simsimd_datatype_uint_family_k) { + if (a_parsed.datatype != ab_dtype) { + elementwise_upcast_to_u64(a_parsed.datatype)(a_parsed.as_scalar, a_parsed.as_scalar); + elementwise_downcast_from_u64(ab_dtype)(a_parsed.as_scalar, a_parsed.as_scalar); + a_parsed.datatype = ab_dtype; + } + if (b_parsed.datatype != ab_dtype) { + elementwise_upcast_to_u64(b_parsed.datatype)(b_parsed.as_scalar, b_parsed.as_scalar); + elementwise_downcast_from_u64(ab_dtype)(b_parsed.as_scalar, b_parsed.as_scalar); + b_parsed.datatype = ab_dtype; + } + } + else if (ab_family == simsimd_datatype_int_family_k) { + if (a_parsed.datatype != ab_dtype) { + elementwise_upcast_to_i64(a_parsed.datatype)(a_parsed.as_scalar, a_parsed.as_scalar); + elementwise_downcast_from_i64(ab_dtype)(a_parsed.as_scalar, a_parsed.as_scalar); + a_parsed.datatype = ab_dtype; + } + if (b_parsed.datatype != ab_dtype) { + elementwise_upcast_to_i64(b_parsed.datatype)(b_parsed.as_scalar, b_parsed.as_scalar); + elementwise_downcast_from_i64(ab_dtype)(b_parsed.as_scalar, b_parsed.as_scalar); + b_parsed.datatype = ab_dtype; + } + } + } + + printf("Effective datatypes are: a - %s, b - %s, out - %s\n", datatype_to_python_string(a_parsed.datatype), + datatype_to_python_string(b_parsed.datatype), 
datatype_to_python_string(ab_dtype)); + // Estimate the total number of output elements: Py_ssize_t out_total_elements = 1; for (Py_ssize_t i = 0; i < out_parsed.as_buffer_dimensions; ++i) @@ -3127,20 +3277,21 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const Py_ssize_t out_continuous_dimensions = 0; Py_ssize_t out_continuous_elements = 1; if (!out_obj) { - Py_ssize_t const expected_size_bytes = out_total_elements * bytes_per_datatype(dtype); + Py_ssize_t const expected_size_bytes = out_total_elements * bytes_per_datatype(ab_dtype); NDArray *out_buffer_obj = PyObject_NewVar(NDArray, &NDArrayType, expected_size_bytes); if (!out_buffer_obj) { PyErr_NoMemory(); + return_obj = NULL; goto cleanup; } // Initialize the object memset(out_buffer_obj->shape, 0, sizeof(out_buffer_obj->shape)); memset(out_buffer_obj->strides, 0, sizeof(out_buffer_obj->strides)); - out_buffer_obj->datatype = out_parsed.datatype; + out_buffer_obj->datatype = ab_dtype; out_buffer_obj->ndim = out_parsed.as_buffer_dimensions; memcpy(out_buffer_obj->shape, out_parsed.as_buffer_shape, out_parsed.as_buffer_dimensions * sizeof(Py_ssize_t)); - out_buffer_obj->strides[out_parsed.as_buffer_dimensions - 1] = bytes_per_datatype(out_parsed.datatype); + out_buffer_obj->strides[out_parsed.as_buffer_dimensions - 1] = bytes_per_datatype(ab_dtype); for (Py_ssize_t i = out_parsed.as_buffer_dimensions - 2; i >= 0; --i) out_buffer_obj->strides[i] = out_buffer_obj->strides[i + 1] * out_buffer_obj->shape[i + 1]; @@ -3149,6 +3300,7 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const out_continuous_elements = out_total_elements; // Re-export into the `out_parsed` for future use + out_parsed.datatype = ab_dtype; out_parsed.as_buffer_start = (char *)&out_buffer_obj->start[0]; memcpy(out_parsed.as_buffer_strides, out_buffer_obj->strides, out_parsed.as_buffer_dimensions * sizeof(Py_ssize_t)); @@ -3167,13 +3319,17 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const expected_last_stride_bytes *= out_buffer.shape[i]; out_continuous_elements *= out_buffer.shape[i]; } + + out_parsed.as_buffer_dimensions = out_buffer.ndim; + out_parsed.as_buffer_start = (char *)out_buffer.buf; + memcpy(out_parsed.as_buffer_shape, out_buffer.shape, out_parsed.as_buffer_dimensions * sizeof(Py_ssize_t)); + memcpy(out_parsed.as_buffer_strides, out_buffer.strides, out_parsed.as_buffer_dimensions * sizeof(Py_ssize_t)); } // First of all, check for our optimal case, when: + // - both input operands are not scalars. // - there is at least one continuous dimension. - // - the types match between: - // - both input tensors and output: uses `sum` kernel. - // - the input tensor and output tensor, with any scalar: uses `scale` kernel. + // - the types match between both input tensors and output: uses `sum` kernel. // ... where we will use the vectorized kernel! 
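+    // E.g. the strided float32 views exercised in the tests (shape {5, 46}, byte strides {376, 4}) are
+    // continuous only along the last axis, so the `sum` kernel runs once per 46-element row, while a
+    // reversed view with a negative trailing stride should have no continuous dimension at all and
+    // fall through to the serial element-wise paths below.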
if (a_parsed.datatype == b_parsed.datatype && a_parsed.datatype == out_parsed.datatype && out_continuous_dimensions && ins_continuous_dimensions) { @@ -3182,16 +3338,15 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const simsimd_kernel_sum_punned_t kernel = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; simsimd_kernel_kind_t const kernel_kind = simsimd_sum_k; - simsimd_find_kernel_punned(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, + simsimd_find_kernel_punned(kernel_kind, ab_dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&kernel, &capability); if (!kernel) { PyErr_Format( // - PyExc_LookupError, - "Unsupported kernel '%c' and datatype combination across vectors ('%s'/'%s') and " - "`dtype` override ('%s'/'%s')", - kernel_kind, // - a_buffer.format ? a_buffer.format : "nil", datatype_to_python_string(a_parsed.datatype), // - dtype_str ? dtype_str : "nil", datatype_to_python_string(dtype)); + PyExc_LookupError, "Unsupported kernel '%c' and datatype combination across inputs ('%s' and '%s')", + kernel_kind, // + datatype_to_python_string(a_parsed.datatype), // + datatype_to_python_string(b_parsed.datatype)); + return_obj = NULL; goto cleanup; } @@ -3201,45 +3356,79 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const Py_ssize_t const non_continuous_ranks = out_parsed.as_buffer_dimensions - continuous_ranks; Py_ssize_t const continuous_elements = out_continuous_elements < ins_continuous_elements ? out_continuous_elements : ins_continuous_elements; - implementation_vectorized_binary_tensor_operation( // - &a_parsed, &b_parsed, &out_parsed, // + apply_elementwise_binary_operation_to_each_continuous_slice( // + &a_parsed, &b_parsed, &out_parsed, // non_continuous_ranks, continuous_elements, kernel); goto cleanup; } + // Our second best case is when: + // - there is only one input tensor and the second operand is a scalar, + // - there is at least one continuous dimension in the input and output tensor, + // - the types match between the input tensor and output tensor. + int const is_tensor_a_with_scalar_b = + a_parsed.kind == BufferKind && a_parsed.datatype == out_parsed.datatype && b_parsed.kind != BufferKind; + int const is_tensor_b_with_scalar_b = + b_parsed.kind == BufferKind && b_parsed.datatype == out_parsed.datatype && a_parsed.kind != BufferKind; + if ((is_tensor_a_with_scalar_b || is_tensor_b_with_scalar_b) && + (out_continuous_dimensions && ins_continuous_dimensions)) { + // Look up the kernel and the capability + simsimd_kernel_scale_punned_t kernel = NULL; + simsimd_capability_t capability = simsimd_cap_serial_k; + simsimd_kernel_kind_t const kernel_kind = simsimd_scale_k; + simsimd_find_kernel_punned(kernel_kind, ab_dtype, static_capabilities, simsimd_cap_any_k, + (simsimd_kernel_punned_t *)&kernel, &capability); + if (!kernel) { + PyErr_Format( // + PyExc_LookupError, "Unsupported kernel '%c' and datatype combination across inputs ('%s' and '%s')", + kernel_kind, // + datatype_to_python_string(a_parsed.datatype), // + datatype_to_python_string(b_parsed.datatype)); + return_obj = NULL; + goto cleanup; + } + + Py_ssize_t const continuous_ranks = out_continuous_dimensions < ins_continuous_dimensions + ? out_continuous_dimensions + : ins_continuous_dimensions; + Py_ssize_t const non_continuous_ranks = out_parsed.as_buffer_dimensions - continuous_ranks; + Py_ssize_t const continuous_elements = + out_continuous_elements < ins_continuous_elements ? 
out_continuous_elements : ins_continuous_elements; + + if (is_tensor_a_with_scalar_b) + apply_scale_to_each_continuous_slice( // + &a_parsed, 1, b_parsed.as_f64, &out_parsed, // + non_continuous_ranks, continuous_elements, kernel); + else + apply_scale_to_each_continuous_slice( // + &b_parsed, 1, a_parsed.as_f64, &out_parsed, // + non_continuous_ranks, continuous_elements, kernel); + + goto cleanup; + } + // Finally call the serial kernels! // If the output has no continuous dimensions at all, our situation sucks! // We can't use SIMD effectively and need to fall back to the scalar operation, // but if the input/output types match, at least we don't need to cast the data back and forth. - { + if (a_parsed.datatype == b_parsed.datatype && a_parsed.datatype == out_parsed.datatype) { binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(a_parsed.datatype); - implementation_elementwise_binary_tensor_operation(&a_parsed, &b_parsed, &out_parsed, elementwise_sadd_ptr); + apply_elementwise_binary_operation_to_each_scalar(&a_parsed, &b_parsed, &out_parsed, elementwise_sadd_ptr); goto cleanup; } // If the output has no continuous dimensions at all, our situation sucks! // If the type of outputs and inputs doesn't match, it also sucks! // We can't use SIMD effectively and need to fall back to the scalar operation. - if (simsimd_datatype_family(a_parsed.datatype) == simsimd_datatype_float_family_k && + if (simsimd_datatype_family(a_parsed.datatype) == simsimd_datatype_float_family_k || simsimd_datatype_family(b_parsed.datatype) == simsimd_datatype_float_family_k) { unary_kernel_t a_upcast_ptr = elementwise_upcast_to_f64(a_parsed.datatype); unary_kernel_t b_upcast_ptr = elementwise_upcast_to_f64(b_parsed.datatype); binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_datatype_f64_k); unary_kernel_t out_downcast_ptr = elementwise_downcast_from_f64(out_parsed.datatype); - implementation_elementwise_binary_mixed_tensor_operation( // - &a_parsed, &b_parsed, &out_parsed, // - a_upcast_ptr, b_upcast_ptr, out_downcast_ptr, elementwise_sadd_ptr); - goto cleanup; - } - else if (simsimd_datatype_family(a_parsed.datatype) == simsimd_datatype_int_family_k && - simsimd_datatype_family(b_parsed.datatype) == simsimd_datatype_int_family_k) { - unary_kernel_t a_upcast_ptr = elementwise_upcast_to_i64(a_parsed.datatype); - unary_kernel_t b_upcast_ptr = elementwise_upcast_to_i64(b_parsed.datatype); - binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_datatype_i64_k); - unary_kernel_t out_downcast_ptr = elementwise_downcast_from_i64(out_parsed.datatype); - implementation_elementwise_binary_mixed_tensor_operation( // - &a_parsed, &b_parsed, &out_parsed, // + apply_elementwise_casting_binary_operation_to_each_scalar( // + &a_parsed, &b_parsed, &out_parsed, // a_upcast_ptr, b_upcast_ptr, out_downcast_ptr, elementwise_sadd_ptr); goto cleanup; } @@ -3249,13 +3438,19 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const unary_kernel_t b_upcast_ptr = elementwise_upcast_to_u64(b_parsed.datatype); binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_datatype_u64_k); unary_kernel_t out_downcast_ptr = elementwise_downcast_from_u64(out_parsed.datatype); - implementation_elementwise_binary_mixed_tensor_operation( // - &a_parsed, &b_parsed, &out_parsed, // + apply_elementwise_casting_binary_operation_to_each_scalar( // + &a_parsed, &b_parsed, &out_parsed, // a_upcast_ptr, b_upcast_ptr, out_downcast_ptr, elementwise_sadd_ptr); goto cleanup; } else { - 
PyErr_Format(PyExc_ValueError, "Unsupported combination of datatypes"); + unary_kernel_t a_upcast_ptr = elementwise_upcast_to_i64(a_parsed.datatype); + unary_kernel_t b_upcast_ptr = elementwise_upcast_to_i64(b_parsed.datatype); + binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_datatype_i64_k); + unary_kernel_t out_downcast_ptr = elementwise_downcast_from_i64(out_parsed.datatype); + apply_elementwise_casting_binary_operation_to_each_scalar( // + &a_parsed, &b_parsed, &out_parsed, // + a_upcast_ptr, b_upcast_ptr, out_downcast_ptr, elementwise_sadd_ptr); goto cleanup; } @@ -3263,6 +3458,7 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const PyBuffer_Release(&a_buffer); PyBuffer_Release(&b_buffer); PyBuffer_Release(&out_buffer); + if (return_obj == Py_None) Py_INCREF(return_obj); return return_obj; } @@ -3353,7 +3549,7 @@ static char const doc_module[] = // " impact on performance, but most modern HPC packages explicitly ban non-contiguous rows,\n" " where the nearby matrix cells within a row have multi-byte gaps.\n" " - The CPython runtime has a noticeable overhead for function calls, so consider batching\n" - " kernel invokations. Many kernels can compute not only 1-to-1 distance between vectors,\n" + " kernel invocations. Many kernels can compute not only 1-to-1 distance between vectors,\n" " but also 1-to-N and N-to-N distances between two batches of vectors packed into matrices.\n" "\n" "Example:\n" diff --git a/scripts/test.py b/scripts/test.py index 5b0aa2d0..c0985b83 100644 --- a/scripts/test.py +++ b/scripts/test.py @@ -45,11 +45,13 @@ import platform import collections from typing import Dict, List +import faulthandler import tabulate import pytest import simsimd as simd +faulthandler.enable() # NumPy is available on most platforms and is required for most tests. 
# When using PyPy on some platforms NumPy has internal issues, that will @@ -99,7 +101,7 @@ def baseline_add(x, y): elif dtype == np.int8: return _normalize_element_wise(x.astype(np.int16) + y, dtype) else: - return _normalize_element_wise(x + y, dtype) + return x + y def baseline_multiply(x, y): dtype = x.dtype if isinstance(x, np.ndarray) else y.dtype @@ -108,7 +110,7 @@ def baseline_multiply(x, y): elif dtype == np.int8: return _normalize_element_wise(x.astype(np.int16) * y, dtype) else: - return _normalize_element_wise(x * y, dtype) + return x * y except: # NumPy is not installed, most tests will be skipped @@ -585,6 +587,22 @@ def to_array(x, dtype=None): return y +def random_of_dtype(dtype, shape): + if dtype == "float64" or dtype == "float32" or dtype == "float16": + return np.random.randn(*shape).astype(dtype) + elif ( + dtype == "int8" + or dtype == "uint8" + or dtype == "int16" + or dtype == "uint16" + or dtype == "int32" + or dtype == "uint32" + or dtype == "int64" + or dtype == "uint64" + ): + return np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max, shape, dtype=dtype) + + @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed") @pytest.mark.parametrize( "function, expected_error, args, kwargs", @@ -1546,122 +1564,123 @@ def test_cdist_hamming(ndim, out_dtype, capability): @pytest.mark.skipif(not numpy_available, reason="NumPy is not installed") @pytest.mark.repeat(50) -@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("first_dtype", ["float64", "float32", "int32", "uint32"]) +@pytest.mark.parametrize("second_dtype", ["float64", "float32", "int32", "uint32"]) +@pytest.mark.parametrize("output_dtype", ["float64", "float32", "int32", "uint32"]) @pytest.mark.parametrize("kernel", ["add"]) # , "multiply"]) @pytest.mark.parametrize("capability", possible_capabilities) -def test_add(dtype, kernel, capability, stats_fixture): +def test_add(first_dtype, second_dtype, output_dtype, kernel, capability, stats_fixture): """Tests NumPy-like compatibility interfaces on all kinds of non-contiguous arrays.""" - if dtype == "float16" and is_running_under_qemu(): - pytest.skip("Testing low-precision math isn't reliable in QEMU") - np.random.seed() keep_one_capability(capability) baseline_kernel, simd_kernel = name_to_kernels(kernel) - def get_shape(a) -> tuple: - return a.__array_interface__["shape"] - - def validate(a, b, o): - c = baseline_kernel(a, b) - d = np.array(simd_kernel(a, b)) - simd_kernel(a, b, out=o) - assert d.size == c.size - assert d.shape == c.shape - np.testing.assert_allclose(d, c, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL) - np.testing.assert_allclose(d, o, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL) - return d + def validate(a, b, inplace_simsimd): + result_numpy = baseline_kernel(a, b) + result_simsimd = np.array(simd_kernel(a, b)) + simd_kernel(a, b, out=inplace_simsimd) + assert result_simsimd.size == result_numpy.size + assert result_simsimd.shape == result_numpy.shape + assert result_simsimd.dtype == result_numpy.dtype + np.testing.assert_allclose(result_simsimd, result_numpy, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL) + np.testing.assert_allclose(result_simsimd, inplace_simsimd, atol=SIMSIMD_ATOL, rtol=SIMSIMD_RTOL) + return result_simsimd # Vector-Vector addition - a = np.random.randn(47).astype(dtype) - b = np.random.randn(47).astype(dtype) - o = np.zeros(47).astype(dtype) + a = random_of_dtype(first_dtype, (47,)) + b = random_of_dtype(second_dtype, (47,)) + o = np.zeros(47).astype(output_dtype) validate(a, b, o) # 
Vector-Scalar addition + validate(a, -11, o) validate(a, 11, o) + validate(a, 11.0, o) # Scalar-Vector addition - validate(11, b, o) + validate(-13, b, o) + validate(13, b, o) + validate(13.0, b, o) # Matrix-Matrix addition - a = np.random.randn(10, 47).astype(dtype) - b = np.random.randn(10, 47).astype(dtype) - o = np.zeros((10, 47)).astype(dtype) + a = random_of_dtype(first_dtype, (10, 47)) + b = random_of_dtype(second_dtype, (10, 47)) + o = np.zeros((10, 47)).astype(output_dtype) validate(a, b, o) # Strided Matrix-Matrix addition - a_extended = np.random.randn(10, 47).astype(dtype) - b_extended = np.random.randn(10, 47).astype(dtype) + a_extended = random_of_dtype(first_dtype, (10, 47)) + b_extended = random_of_dtype(second_dtype, (10, 47)) a = a_extended[::2, 1:] # Every second (even) row, all columns but the first b = b_extended[1::2, :-1] # Every second (odd) row, all columns but the last - o = np.zeros((5, 46)).astype(dtype) + o = np.zeros((5, 46)).astype(output_dtype) validate(a, b, o) # Strided Matrix-Matrix addition in with reverse order of different dimensions - a_extended = np.random.randn(10, 47).astype(dtype) - b_extended = np.random.randn(10, 47).astype(dtype) + a_extended = random_of_dtype(first_dtype, (10, 47)) + b_extended = random_of_dtype(second_dtype, (10, 47)) a = a_extended[::-2, 1:] # Every second (even) row (reverse), all columns but the first b = b_extended[1::2, -2::-1] # Every second (odd) row, all columns (reversed) but the last - o = np.zeros((5, 46)).astype(dtype) + o = np.zeros((5, 46)).astype(output_dtype) validate(a, b, o) # Raise an error if shapes are different - a = np.random.randn(10, 47).astype(dtype) - b = np.random.randn(10, 46).astype(dtype) + a = random_of_dtype(first_dtype, (10, 47)) + b = random_of_dtype(second_dtype, (10, 46)) with pytest.raises(ValueError): baseline_kernel(a, b) with pytest.raises(ValueError): simd_kernel(a, b) # Raise an error if shapes are different - a = np.random.randn(6, 2, 3).astype(dtype) - b = np.random.randn(6, 6).astype(dtype) + a = random_of_dtype(first_dtype, (6, 2, 3)) + b = random_of_dtype(second_dtype, (6, 6)) with pytest.raises(ValueError): baseline_kernel(a, b) with pytest.raises(ValueError): simd_kernel(a, b) # Make sure broadcasting works as expected for a single scalar - a = np.random.randn(4, 7, 5, 3).astype(dtype) - b = np.random.randn(1).astype(dtype) - o = np.zeros((4, 7, 5, 3)).astype(dtype) + a = random_of_dtype(first_dtype, (4, 7, 5, 3)) + b = random_of_dtype(second_dtype, (1,)) + o = np.zeros((4, 7, 5, 3)).astype(output_dtype) assert validate(a, b, o).shape == (4, 7, 5, 3) # Make sure broadcasting works as expected for a unit tensor - a = np.random.randn(4, 7, 5, 3).astype(dtype) - b = np.random.randn(1, 1, 1, 1).astype(dtype) - o = np.zeros((4, 7, 5, 3)).astype(dtype) + a = random_of_dtype(first_dtype, (4, 7, 5, 3)) + b = random_of_dtype(second_dtype, (1, 1, 1, 1)) + o = np.zeros((4, 7, 5, 3)).astype(output_dtype) assert validate(a, b, o).shape == (4, 7, 5, 3) - # Make sure broadcasting works as expected for unit tensors of different rank - a = np.random.randn(1, 1, 1, 1).astype(dtype) - b = np.random.randn(1, 1, 1).astype(dtype) - o = np.zeros((1, 1, 1, 1)).astype(dtype) + # Make sure broadcasting works as expected for 2 unit tensors of different rank + a = random_of_dtype(first_dtype, (1, 1, 1, 1)) + b = random_of_dtype(second_dtype, (1, 1, 1)) + o = np.zeros((1, 1, 1, 1)).astype(output_dtype) assert validate(a, b, o).shape == (1, 1, 1, 1) - # Make sure broadcasting works as expected for a single 
scalar - a = np.random.randn(4, 7, 5, 3).astype(dtype) - b = np.random.randn(1, 1, 1).astype(dtype) - o = np.zeros((4, 7, 5, 3)).astype(dtype) + # Make sure broadcasting works as expected for a unit tensor of different rank + a = random_of_dtype(first_dtype, (4, 7, 5, 3)) + b = random_of_dtype(second_dtype, (1, 1, 1)) + o = np.zeros((4, 7, 5, 3)).astype(output_dtype) assert validate(a, b, o).shape == (4, 7, 5, 3) - # Make sure broadcasting works as expected for a single scalar - a = np.random.randn(4, 7, 5, 3).astype(dtype) - b = np.random.randn(1, 1, 1, 1, 1).astype(dtype) - o = np.zeros((4, 7, 5, 3)).astype(dtype) + # Make sure broadcasting works as expected for an added dimension + a = random_of_dtype(first_dtype, (4, 7, 5, 3)) + b = random_of_dtype(second_dtype, (1, 1, 1, 1, 1)) + o = np.zeros((1, 4, 7, 5, 3)).astype(output_dtype) assert validate(a, b, o).shape == (1, 4, 7, 5, 3) # Make sure broadcasting works as expected for mixed origin broadcasting - a = np.random.randn(4, 7, 5, 3).astype(dtype) - b = np.random.randn(2, 1, 1, 1, 1).astype(dtype) - o = np.zeros((4, 7, 5, 3)).astype(dtype) + a = random_of_dtype(first_dtype, (4, 7, 5, 3)) + b = random_of_dtype(second_dtype, (2, 1, 1, 1, 1)) + o = np.zeros((2, 4, 7, 5, 3)).astype(output_dtype) assert validate(a, b, o).shape == (2, 4, 7, 5, 3) # Make sure broadcasting works as expected - a = np.random.randn(4, 7, 5, 3).astype(dtype) - b = np.random.randn(4, 1, 5, 1).astype(dtype) - o = np.zeros((4, 7, 5, 3)).astype(dtype) + a = random_of_dtype(first_dtype, (4, 7, 5, 1)) + b = random_of_dtype(second_dtype, (4, 1, 5, 3)) + o = np.zeros((4, 7, 5, 3)).astype(output_dtype) assert validate(a, b, o).shape == (4, 7, 5, 3)
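
For reference, a minimal usage sketch of the reworked `add` API, assuming the keyword-only signature from `doc_add` above and the scalar-narrowing and promotion rules in `api_add`:

    import numpy as np
    import simsimd as simd

    a = np.random.randn(10, 47).astype(np.float32)
    out = np.zeros((10, 47), dtype=np.float32)

    simd.add(a, 3, out=out)   # the scalar 3 is parsed as u8 and broadcast; `out` stays float32
    c = simd.add(a, a)        # with no `out`, an array-like float32 result is allocated and returned
    assert np.allclose(np.array(c), a + a)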