microsoft · YangWang92 · Oct 29, 2024 · Oct 28, 2024 · Oct 28, 2024
diff --git a/csrc/utils.cuh b/csrc/utils.cuh
@@ -256,7 +256,11 @@ T __device__ __forceinline__ ADD2(T a, T b) {
 template <typename T>
 T __device__ __forceinline__ ZERO_VALUE(T a) {
   if constexpr (std::is_same<T, __bfloat16>::value) {
-    return __ushort_as_bfloat16((unsigned short)0x0000U);
+#if defined(USE_ROCM)
+    return __float2bfloat16(0.0f);
+#else
+    return __float2bfloat16_rn(0.0f);
+#endif
   } else if constexpr (std::is_same<T, float>::value) {
     return 0.0f;
   } else {

diff --git a/setup.py b/setup.py
@@ -46,6 +46,9 @@ def build_cuda_extensions():
             arch_flags += ["-gencode", f"arch=compute_{cap},code=sm_{cap}"]
     print(" build for compute capabilities: ==============", compute_capabilities)
 
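+    # Thread count passed to nvcc via --threads; read from the NVCC_THREADS env var, defaulting to "4" when unset or empty.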
+    nvcc_threads = os.getenv("NVCC_THREADS") or "4"
+
     extra_compile_args = {
         "nvcc": [
             "-O3",
@@ -58,6 +61,7 @@ def build_cuda_extensions():
             "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
             "-U__CUDA_NO_BFLOAT162_OPERATORS__",
             "-U__CUDA_NO_BFLOAT162_CONVERSIONS__",
+            f"--threads={nvcc_threads}",
         ] + arch_flags,
         "cxx": ["-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"],
     }

diff --git a/vptq/__init__.py b/vptq/__init__.py
@@ -3,5 +3,5 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
 
-__version__ = "0.0.2.post1"
+__version__ = "0.0.3"
 from vptq.layers import AutoModelForCausalLM as AutoModelForCausalLM