|
4 | 4 |
|
5 | 5 | #include <altivec.h> |
6 | 6 | #include <cmath> |
| 7 | +#include <algorithm> |
7 | 8 | #include <torch/all.h> |
8 | 9 |
|
9 | 10 | namespace vec_op { |
@@ -62,6 +63,10 @@ typedef struct f32x4x4_t { |
62 | 63 | __vector float val[4]; |
63 | 64 | } f32x4x4_t; |
64 | 65 |
|
// Four 128-bit vector registers holding 16 signed 32-bit integers in total;
// integer counterpart to the f32x4x4_t layout declared above.
typedef struct i32x4x4_t {
  __vector int32_t val[4];
} i32x4x4_t;
| 69 | + |
65 | 70 | struct FP32Vec8; |
66 | 71 | struct FP32Vec16; |
67 | 72 |
|
@@ -98,6 +103,28 @@ struct BF16Vec16 : public Vec<BF16Vec16> { |
98 | 103 | vec_xst(reg.val[0], 0, (signed short*)ptr); |
99 | 104 | vec_xst(reg.val[1], 16, (signed short*)ptr); |
100 | 105 | } |
| 106 | + |
| 107 | + void save(void* ptr, const int elem_num) const { |
| 108 | + const int clamped_elem = std::max(0, std::min(elem_num, 16)); |
| 109 | + |
| 110 | + // Calculate elements to store in each 128-bit part (8 elements each) |
| 111 | + const int elements_val0 = std::min(clamped_elem, 8); |
| 112 | + const int elements_val1 = std::max(clamped_elem - 8, 0); |
| 113 | + |
| 114 | + // Convert elements to bytes (2 bytes per element) |
| 115 | + const size_t bytes_val0 = elements_val0 * sizeof(signed short); |
| 116 | + const size_t bytes_val1 = elements_val1 * sizeof(signed short); |
| 117 | + |
| 118 | + signed short* dest = static_cast<signed short*>(ptr); |
| 119 | + // Store the first part using vec_xst_len |
| 120 | + if (bytes_val0 > 0) { |
| 121 | + vec_xst_len(reg.val[0], dest, bytes_val0); |
| 122 | + } |
| 123 | + // Store the second part if needed |
| 124 | + if (bytes_val1 > 0) { |
| 125 | + vec_xst_len(reg.val[1], dest + elements_val0, bytes_val1); |
| 126 | + } |
| 127 | + } |
101 | 128 | }; |
102 | 129 |
|
103 | 130 | const static __vector signed short zero = vec_splats((signed short)0); |
@@ -257,6 +284,64 @@ struct FP32Vec8 : public Vec<FP32Vec8> { |
257 | 284 | } |
258 | 285 | }; |
259 | 286 |
|
// Sixteen signed 32-bit integers spread across four 128-bit registers.
struct INT32Vec16 : public Vec<INT32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  union AliasReg {
    i32x4x4_t reg;
    int32_t values[VEC_ELEM_NUM];
  };

  i32x4x4_t reg;

  // Unaligned load of 16 contiguous int32 values.
  explicit INT32Vec16(const void* data_ptr) {
    const __vector int32_t* src =
        reinterpret_cast<const __vector int32_t*>(data_ptr);
    for (int i = 0; i < 4; ++i) {
      reg.val[i] = vec_xl(i * 16, src);
    }
  }

  // Unaligned store of all 16 elements.
  void save(int32_t* ptr) const {
    __vector int32_t* dst = reinterpret_cast<__vector int32_t*>(ptr);
    for (int i = 0; i < 4; ++i) {
      vec_xst(reg.val[i], i * 16, dst);
    }
  }

  // Partial store: write only the first `elem_num` elements. Each 4-lane
  // chunk stores between 0 and 4 of its values via a length-limited store
  // (a zero-length vec_xst_len writes nothing).
  void save(int32_t* ptr, const int elem_num) const {
    for (int chunk = 0; chunk < 4; ++chunk) {
      const int lane_cnt = std::min(std::max(elem_num - chunk * 4, 0), 4);
      vec_xst_len(reg.val[chunk], ptr + chunk * 4, lane_cnt * sizeof(int32_t));
    }
  }
};
| 344 | + |
260 | 345 | struct FP32Vec16 : public Vec<FP32Vec16> { |
261 | 346 | constexpr static int VEC_ELEM_NUM = 16; |
262 | 347 | union AliasReg { |
@@ -319,6 +404,13 @@ struct FP32Vec16 : public Vec<FP32Vec16> { |
319 | 404 |
|
320 | 405 | explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} |
321 | 406 |
|
| 407 | + explicit FP32Vec16(const INT32Vec16& v) { |
| 408 | + reg.val[0] = vec_ctf(v.reg.val[0], 0); |
| 409 | + reg.val[1] = vec_ctf(v.reg.val[1], 0); |
| 410 | + reg.val[2] = vec_ctf(v.reg.val[2], 0); |
| 411 | + reg.val[3] = vec_ctf(v.reg.val[3], 0); |
| 412 | + } |
| 413 | + |
322 | 414 | FP32Vec16 operator*(const FP32Vec16& b) const { |
323 | 415 | return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), |
324 | 416 | vec_mul(reg.val[1], b.reg.val[1]), |
@@ -347,6 +439,117 @@ struct FP32Vec16 : public Vec<FP32Vec16> { |
347 | 439 | vec_div(reg.val[3], b.reg.val[3])})); |
348 | 440 | } |
349 | 441 |
|
| 442 | + FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const { |
| 443 | + return FP32Vec16(f32x4x4_t( |
| 444 | + {vec_min(max.reg.val[0], vec_max(min.reg.val[0], reg.val[0])), |
| 445 | + vec_min(max.reg.val[1], vec_max(min.reg.val[1], reg.val[1])), |
| 446 | + vec_min(max.reg.val[2], vec_max(min.reg.val[2], reg.val[2])), |
| 447 | + vec_min(max.reg.val[3], vec_max(min.reg.val[3], reg.val[3]))})); |
| 448 | + } |
| 449 | + |
| 450 | + FP32Vec16 max(const FP32Vec16& b) const { |
| 451 | + return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]), |
| 452 | + vec_max(reg.val[1], b.reg.val[1]), |
| 453 | + vec_max(reg.val[2], b.reg.val[2]), |
| 454 | + vec_max(reg.val[3], b.reg.val[3])})); |
| 455 | + } |
| 456 | + |
| 457 | + FP32Vec16 max(const FP32Vec16& b, int elem_num) const { |
| 458 | + FP32Vec16 result; |
| 459 | + |
| 460 | + // Create a vector of element indices for each chunk |
| 461 | + __vector unsigned int indices = {0, 1, 2, 3}; |
| 462 | + __vector unsigned int elem_num_vec = |
| 463 | + vec_splats(static_cast<unsigned int>(elem_num)); |
| 464 | + |
| 465 | + // Compute masks for each chunk |
| 466 | + __vector unsigned int chunk_offset0 = {0, 0, 0, |
| 467 | + 0}; // Chunk 0: Elements 0-3 |
| 468 | + __vector unsigned int chunk_offset1 = {4, 4, 4, |
| 469 | + 4}; // Chunk 1: Elements 4-7 |
| 470 | + __vector unsigned int chunk_offset2 = {8, 8, 8, |
| 471 | + 8}; // Chunk 2: Elements 8-11 |
| 472 | + __vector unsigned int chunk_offset3 = {12, 12, 12, |
| 473 | + 12}; // Chunk 3: Elements 12-15 |
| 474 | + |
| 475 | + // Compute masks for each chunk |
| 476 | + __vector bool int mask0 = vec_cmplt(indices + chunk_offset0, elem_num_vec); |
| 477 | + __vector bool int mask1 = vec_cmplt(indices + chunk_offset1, elem_num_vec); |
| 478 | + __vector bool int mask2 = vec_cmplt(indices + chunk_offset2, elem_num_vec); |
| 479 | + __vector bool int mask3 = vec_cmplt(indices + chunk_offset3, elem_num_vec); |
| 480 | + |
| 481 | + // Apply masks to compute the result for each chunk |
| 482 | + result.reg.val[0] = vec_sel(this->reg.val[0], |
| 483 | + vec_max(this->reg.val[0], b.reg.val[0]), mask0); |
| 484 | + result.reg.val[1] = vec_sel(this->reg.val[1], |
| 485 | + vec_max(this->reg.val[1], b.reg.val[1]), mask1); |
| 486 | + result.reg.val[2] = vec_sel(this->reg.val[2], |
| 487 | + vec_max(this->reg.val[2], b.reg.val[2]), mask2); |
| 488 | + result.reg.val[3] = vec_sel(this->reg.val[3], |
| 489 | + vec_max(this->reg.val[3], b.reg.val[3]), mask3); |
| 490 | + |
| 491 | + return FP32Vec16(result.reg); |
| 492 | + } |
| 493 | + |
| 494 | + FP32Vec16 min(const FP32Vec16& b) const { |
| 495 | + return FP32Vec16(f32x4x4_t({vec_min(reg.val[0], b.reg.val[0]), |
| 496 | + vec_min(reg.val[1], b.reg.val[1]), |
| 497 | + vec_min(reg.val[2], b.reg.val[2]), |
| 498 | + vec_min(reg.val[3], b.reg.val[3])})); |
| 499 | + } |
| 500 | + |
| 501 | + FP32Vec16 min(const FP32Vec16& b, int elem_num) const { |
| 502 | + FP32Vec16 result; |
| 503 | + |
| 504 | + vector unsigned int indices = {0, 1, 2, 3}; |
| 505 | + vector unsigned int elem_num_vec = |
| 506 | + vec_splats(static_cast<unsigned int>(elem_num)); |
| 507 | + |
| 508 | + vector unsigned int chunk_offset0 = {0, 0, 0, 0}; |
| 509 | + vector unsigned int chunk_offset1 = {4, 4, 4, 4}; |
| 510 | + vector unsigned int chunk_offset2 = {8, 8, 8, 8}; |
| 511 | + vector unsigned int chunk_offset3 = {12, 12, 12, 12}; |
| 512 | + |
| 513 | + vector bool int mask0 = vec_cmplt(indices + chunk_offset0, elem_num_vec); |
| 514 | + vector bool int mask1 = vec_cmplt(indices + chunk_offset1, elem_num_vec); |
| 515 | + vector bool int mask2 = vec_cmplt(indices + chunk_offset2, elem_num_vec); |
| 516 | + vector bool int mask3 = vec_cmplt(indices + chunk_offset3, elem_num_vec); |
| 517 | + |
| 518 | + result.reg.val[0] = vec_sel(this->reg.val[0], |
| 519 | + vec_min(this->reg.val[0], b.reg.val[0]), mask0); |
| 520 | + result.reg.val[1] = vec_sel(this->reg.val[1], |
| 521 | + vec_min(this->reg.val[1], b.reg.val[1]), mask1); |
| 522 | + result.reg.val[2] = vec_sel(this->reg.val[2], |
| 523 | + vec_min(this->reg.val[2], b.reg.val[2]), mask2); |
| 524 | + result.reg.val[3] = vec_sel(this->reg.val[3], |
| 525 | + vec_min(this->reg.val[3], b.reg.val[3]), mask3); |
| 526 | + |
| 527 | + return FP32Vec16(result.reg); |
| 528 | + } |
| 529 | + |
| 530 | + FP32Vec16 abs() const { |
| 531 | + return FP32Vec16(f32x4x4_t({vec_abs(reg.val[0]), vec_abs(reg.val[1]), |
| 532 | + vec_abs(reg.val[2]), vec_abs(reg.val[3])})); |
| 533 | + } |
| 534 | + |
| 535 | + float reduce_max() { |
| 536 | + __vector float max01 = vec_max(reg.val[0], reg.val[1]); |
| 537 | + __vector float max23 = vec_max(reg.val[2], reg.val[3]); |
| 538 | + __vector float max_all = vec_max(max01, max23); |
| 539 | + __vector float temp = vec_max(max_all, vec_sld(max_all, max_all, 8)); |
| 540 | + temp = vec_max(temp, vec_sld(temp, temp, 4)); |
| 541 | + return vec_extract(temp, 0); |
| 542 | + } |
| 543 | + |
| 544 | + float reduce_min() { |
| 545 | + __vector float min01 = vec_min(reg.val[0], reg.val[1]); |
| 546 | + __vector float min23 = vec_min(reg.val[2], reg.val[3]); |
| 547 | + __vector float min_all = vec_min(min01, min23); |
| 548 | + __vector float temp = vec_min(min_all, vec_sld(min_all, min_all, 8)); |
| 549 | + temp = vec_min(temp, vec_sld(temp, temp, 4)); |
| 550 | + return vec_extract(temp, 0); |
| 551 | + } |
| 552 | + |
350 | 553 | float reduce_sum() const { |
351 | 554 | AliasReg ar; |
352 | 555 | ar.reg = reg; |
@@ -377,6 +580,68 @@ struct FP32Vec16 : public Vec<FP32Vec16> { |
377 | 580 | vec_xst(reg.val[2], 32, ptr); |
378 | 581 | vec_xst(reg.val[3], 48, ptr); |
379 | 582 | } |
| 583 | + |
| 584 | + void save(float* ptr, const int elem_num) const { |
| 585 | + const int elements_in_chunk1 = |
| 586 | + (elem_num >= 0) ? ((elem_num >= 4) ? 4 : elem_num) : 0; |
| 587 | + const int elements_in_chunk2 = |
| 588 | + (elem_num > 4) ? ((elem_num >= 8) ? 4 : elem_num - 4) : 0; |
| 589 | + const int elements_in_chunk3 = |
| 590 | + (elem_num > 8) ? ((elem_num >= 12) ? 4 : elem_num - 8) : 0; |
| 591 | + const int elements_in_chunk4 = |
| 592 | + (elem_num > 12) ? ((elem_num >= 16) ? 4 : elem_num - 12) : 0; |
| 593 | + |
| 594 | + const size_t bytes_chunk1 = |
| 595 | + static_cast<size_t>(elements_in_chunk1 * sizeof(float)); |
| 596 | + const size_t bytes_chunk2 = |
| 597 | + static_cast<size_t>(elements_in_chunk2 * sizeof(float)); |
| 598 | + const size_t bytes_chunk3 = |
| 599 | + static_cast<size_t>(elements_in_chunk3 * sizeof(float)); |
| 600 | + const size_t bytes_chunk4 = |
| 601 | + static_cast<size_t>(elements_in_chunk4 * sizeof(float)); |
| 602 | + |
| 603 | + vec_xst_len(reg.val[0], ptr, bytes_chunk1); |
| 604 | + vec_xst_len(reg.val[1], |
| 605 | + reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + 16), |
| 606 | + bytes_chunk2); |
| 607 | + vec_xst_len(reg.val[2], |
| 608 | + reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + 32), |
| 609 | + bytes_chunk3); |
| 610 | + vec_xst_len(reg.val[3], |
| 611 | + reinterpret_cast<float*>(reinterpret_cast<char*>(ptr) + 48), |
| 612 | + bytes_chunk4); |
| 613 | + } |
| 614 | +}; |
| 615 | + |
// Sixteen signed 8-bit integers in one 128-bit register.
struct INT8Vec16 : public Vec<INT8Vec16> {
  constexpr static int VEC_NUM_ELEM = 16;  // 128 bits / 8 bits = 16

  union AliasReg {
    __vector signed char reg;
    int8_t values[VEC_NUM_ELEM];
  };

  __vector signed char reg;

  // Narrowing conversion fp32 -> int8: vec_cts converts each float chunk to
  // int32, then the two vec_packs stages pack 32 -> 16 -> 8 bits with
  // signed saturation.
  explicit INT8Vec16(const FP32Vec16& vec) {
    __vector signed int ret[4];
    ret[0] = vec_cts(vec.reg.val[0], 0);
    ret[1] = vec_cts(vec.reg.val[1], 0);
    ret[2] = vec_cts(vec.reg.val[2], 0);
    ret[3] = vec_cts(vec.reg.val[3], 0);

    __vector signed short packed1 = vec_packs(ret[0], ret[1]);
    __vector signed short packed2 = vec_packs(ret[2], ret[3]);

    reg = vec_packs(packed1, packed2);
  }

  // Full store of all 16 bytes. Use vec_xst rather than dereferencing the
  // pointer as a vector: `ptr` is not guaranteed to be 16-byte aligned, and
  // a direct vector assignment assumes alignment (UB on underaligned
  // destinations). This also matches every other save() in this file.
  void save(void* ptr) const {
    vec_xst(reg, 0, static_cast<signed char*>(ptr));
  }

  // Partial store of the first `elem_num` bytes. Clamp to [0, 16] so a
  // negative count cannot wrap to a huge size_t (stxvl would then store a
  // full 16 bytes past a shorter buffer). Const-qualified like the other
  // partial save() methods.
  void save(signed char* ptr, const int elem_num) const {
    const int count = std::min(std::max(elem_num, 0), VEC_NUM_ELEM);
    vec_xst_len(reg, ptr, static_cast<size_t>(count));
  }
};
381 | 646 |
|
382 | 647 | template <typename T> |
|
0 commit comments