Skip to content

Commit

Permalink
Merge pull request OpenMathLib#4456 from kseniyazaytseva/riscv-rvv10
Browse files Browse the repository at this point in the history
Fix BLAS and LAPACK tests for RVV 1.0 target, update to 0.12.0 intrinsics
  • Loading branch information
martin-frbg authored Jan 26, 2024
2 parents 4e2a32f + b193ea3 commit 889c5d0
Show file tree
Hide file tree
Showing 46 changed files with 1,628 additions and 709 deletions.
8 changes: 5 additions & 3 deletions kernel/riscv64/axpby_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
{
FLOAT_V_T vx, vy;

if ( n < 0 ) return(0);
if ( n <= 0 ) return(0);

if ( beta == 0.0 ) {
if ( alpha == 0.0 ) {
Expand All @@ -63,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
BLASLONG stride_y = inc_y * sizeof(FLOAT);
size_t vl = VSETVL(n);
vy = VFMVVF_FLOAT(0.0, vl);
for ( ; n > 0; n -= vl, y += vl*stride_y) {
for ( ; n > 0; n -= vl, y += vl*inc_y) {
vl = VSETVL(n);
VSSEV_FLOAT(y, stride_y, vy, vl);
}
Expand Down Expand Up @@ -126,10 +126,12 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *

} else {
if ((1 == inc_x) && (1 == inc_y)) {
for (size_t vl; n > 0; n -= vl, y += vl) {
for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
vl = VSETVL(n);
vx = VLEV_FLOAT(x, vl);
vy = VLEV_FLOAT(y, vl);
vy = VFMULVF_FLOAT(vy, beta, vl);
vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
VSEV_FLOAT (y, vy, vl);
}
} else if (1 == inc_x) {
Expand Down
2 changes: 1 addition & 1 deletion kernel/riscv64/copy_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
if(n < 0) return(0);
if(n <= 0) return(0);

FLOAT_V_T v0;

Expand Down
51 changes: 42 additions & 9 deletions kernel/riscv64/gemm_ncopy_8_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) __riscv_vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
#define FLOAT_VX2_T vfloat32m1x2_t
#define FLOAT_VX4_T vfloat32m1x4_t
#define FLOAT_VX8_T vfloat32m1x8_t
#define VSET_VX2 __riscv_vset_v_f32m1_f32m1x2
#define VSET_VX4 __riscv_vset_v_f32m1_f32m1x4
#define VSET_VX8 __riscv_vset_v_f32m1_f32m1x8
#define VLEV_FLOAT __riscv_vle32_v_f32m1
#define VSEV_FLOAT __riscv_vse32_v_f32m1
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
#else
#define VSETVL(n) __riscv_vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
#define FLOAT_VX2_T vfloat64m1x2_t
#define FLOAT_VX4_T vfloat64m1x4_t
#define FLOAT_VX8_T vfloat64m1x8_t
#define VSET_VX2 __riscv_vset_v_f64m1_f64m1x2
#define VSET_VX4 __riscv_vset_v_f64m1_f64m1x4
#define VSET_VX8 __riscv_vset_v_f64m1_f64m1x8
#define VLEV_FLOAT __riscv_vle64_v_f64m1
#define VSEV_FLOAT __riscv_vse64_v_f64m1
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
#endif

// Optimizes the implementation in ../generic/gemm_ncopy_8.c
Expand All @@ -57,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
FLOAT *b_offset;

FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8;
FLOAT_VX2_T vx2;
FLOAT_VX4_T vx4;
FLOAT_VX8_T vx8;

size_t vl;

//fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
Expand Down Expand Up @@ -87,7 +103,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
v7 = VLEV_FLOAT(a_offset7, vl);
v8 = VLEV_FLOAT(a_offset8, vl);

VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl);
vx8 = VSET_VX8(vx8, 0, v1);
vx8 = VSET_VX8(vx8, 1, v2);
vx8 = VSET_VX8(vx8, 2, v3);
vx8 = VSET_VX8(vx8, 3, v4);
vx8 = VSET_VX8(vx8, 4, v5);
vx8 = VSET_VX8(vx8, 5, v6);
vx8 = VSET_VX8(vx8, 6, v7);
vx8 = VSET_VX8(vx8, 7, v8);

VSSEG8_FLOAT(b_offset, vx8, vl);

a_offset1 += vl;
a_offset2 += vl;
Expand Down Expand Up @@ -116,7 +141,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
v3 = VLEV_FLOAT(a_offset3, vl);
v4 = VLEV_FLOAT(a_offset4, vl);

VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);
vx4 = VSET_VX4(vx4, 0, v1);
vx4 = VSET_VX4(vx4, 1, v2);
vx4 = VSET_VX4(vx4, 2, v3);
vx4 = VSET_VX4(vx4, 3, v4);

VSSEG4_FLOAT(b_offset, vx4, vl);

a_offset1 += vl;
a_offset2 += vl;
Expand All @@ -137,7 +167,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
v1 = VLEV_FLOAT(a_offset1, vl);
v2 = VLEV_FLOAT(a_offset2, vl);

VSSEG2_FLOAT(b_offset, v1, v2, vl);
vx2 = VSET_VX2(vx2, 0, v1);
vx2 = VSET_VX2(vx2, 1, v2);

VSSEG2_FLOAT(b_offset, vx2, vl);

a_offset1 += vl;
a_offset2 += vl;
Expand Down
71 changes: 40 additions & 31 deletions kernel/riscv64/gemm_tcopy_8_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE)
#define VSETVL(n) __riscv_vsetvl_e32m1(n)
#define FLOAT_V_T vfloat32m1_t
#define FLOAT_VX2_T vfloat32m1x2_t
#define FLOAT_VX4_T vfloat32m1x4_t
#define FLOAT_VX8_T vfloat32m1x8_t
#define VLEV_FLOAT __riscv_vle32_v_f32m1
#define VLSEV_FLOAT __riscv_vlse32_v_f32m1
#define VSEV_FLOAT __riscv_vse32_v_f32m1
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1
#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1
#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2
#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2
#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4
#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4
#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8
#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8
#else
#define VSETVL(n) __riscv_vsetvl_e64m1(n)
#define FLOAT_V_T vfloat64m1_t
#define FLOAT_VX2_T vfloat64m1x2_t
#define FLOAT_VX4_T vfloat64m1x4_t
#define FLOAT_VX8_T vfloat64m1x8_t
#define VLEV_FLOAT __riscv_vle64_v_f64m1
#define VLSEV_FLOAT __riscv_vlse64_v_f64m1
#define VSEV_FLOAT __riscv_vse64_v_f64m1
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1
#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1
#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2
#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2
#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4
#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4
#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8
#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8
#endif

int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
Expand All @@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)

IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;

FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
FLOAT_V_T v0;
FLOAT_VX2_T vx2;
FLOAT_VX4_T vx4;
FLOAT_VX8_T vx8;

// fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);

Expand All @@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
for(i = (n >> 3); i > 0; i--) {
size_t vl = 8;

VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, vx8, vl);

aoffset1 += 8;
boffset1 += m * 8;
Expand All @@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 4) {
size_t vl = 8;

VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, vx4, vl);

aoffset1 += 4;
boffset2 += 32;
Expand All @@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 2) {
size_t vl = 8;

VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, vx2, vl);

aoffset1 += 2;
boffset3 += 16;
Expand Down Expand Up @@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
for(i = (n >> 3); i > 0; i--) {
size_t vl = 4;

VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, vx8, vl);

aoffset1 += 8;
boffset1 += m * 8;
Expand All @@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 4) {
size_t vl = 4;

VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, vx4, vl);

aoffset1 += 4;
boffset2 += 16;
Expand All @@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 2) {
size_t vl = 4;

VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, vx2, vl);

aoffset1 += 2;
boffset3 += 8;
Expand All @@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
for(i = (n >> 3); i > 0; i--) {
size_t vl = 2;

VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG8_FLOAT(boffset1, vx8, vl);

aoffset1 += 8;
boffset1 += m * 8;
Expand All @@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 4) {
size_t vl = 2;

VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG4_FLOAT(boffset2, vx4, vl);

aoffset1 += 4;
boffset2 += 8;
Expand All @@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
if (n & 2) {
size_t vl = 2;

VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, v0, v1, vl);
vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
VSSEG2_FLOAT(boffset3, vx2, vl);

aoffset1 += 2;
boffset3 += 4;
Expand Down
23 changes: 17 additions & 6 deletions kernel/riscv64/izamax_rvv.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX __riscv_vsetvlmax_e64m4()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define FLOAT_VX2_T vfloat64m4x2_t
#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4
#define VLEV_FLOAT __riscv_vle64_v_f64m4
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4
#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4
#define VLSEG_FLOAT __riscv_vlseg2e64_v_f64m4x2
#define VLSSEG_FLOAT __riscv_vlsseg2e64_v_f64m4x2
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1
#define MASK_T vbool16_t
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f64m4_b16
Expand All @@ -61,10 +63,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL_MAX __riscv_vsetvlmax_e32m4()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define FLOAT_VX2_T vfloat32m4x2_t
#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4
#define VLEV_FLOAT __riscv_vle32_v_f32m4
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4
#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4
#define VLSEG_FLOAT __riscv_vlseg2e32_v_f32m4x2
#define VLSSEG_FLOAT __riscv_vlsseg2e32_v_f32m4x2
#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1
#define MASK_T vbool8_t
#define VMFLTVF_FLOAT __riscv_vmflt_vf_f32m4_b8
Expand Down Expand Up @@ -93,6 +97,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if (n <= 0 || inc_x <= 0) return(max_index);

FLOAT_V_T vx0, vx1, v_max;
FLOAT_VX2_T vxx2;
UINT_V_T v_max_index;
MASK_T mask;

Expand All @@ -107,7 +112,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
vl = VSETVL(n);

VLSEG_FLOAT(&vx0, &vx1, x, vl);
vxx2 = VLSEG_FLOAT(x, vl);

vx0 = VGET_VX2(vxx2, 0);
vx1 = VGET_VX2(vxx2, 1);

vx0 = VFABSV_FLOAT(vx0, vl);
vx1 = VFABSV_FLOAT(vx1, vl);
Expand All @@ -129,7 +137,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
vl = VSETVL(n);

VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
vxx2 = VLSSEG_FLOAT(x, stride_x, vl);

vx0 = VGET_VX2(vxx2, 0);
vx1 = VGET_VX2(vxx2, 1);

vx0 = VFABSV_FLOAT(vx0, vl);
vx1 = VFABSV_FLOAT(vx1, vl);
Expand Down
Loading

0 comments on commit 889c5d0

Please sign in to comment.