Skip to content

Commit

Permalink
Simplify gemv_t_sve_v1x3 kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
annop-w committed Jan 21, 2025
1 parent 76db346 commit c0318ce
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 39 deletions.
3 changes: 3 additions & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,3 +232,6 @@ In chronological order:

* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32>
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE

* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel
94 changes: 55 additions & 39 deletions kernel/arm64/gemv_t_sve_v1x3.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2024, The OpenBLAS Project
Copyright (c) 2024, 2025 The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG ix,iy;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *y_ptr;
FLOAT temp;

iy = 0;

if (inc_x == 1) {
BLASLONG width = (n + 3 - 1) / 3;
BLASLONG width = n / 3;
BLASLONG sve_size = SV_COUNT();
svbool_t pg_true = SV_TRUE();
svbool_t pg = SV_WHILE(0, m % sve_size);

FLOAT *a0_ptr = a + lda * width * 0;
FLOAT *a1_ptr = a + lda * width * 1;
Expand All @@ -72,67 +76,79 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
FLOAT *y2_ptr = y + inc_y * width * 2;

for (j = 0; j < width; j++) {
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

SV_TYPE temp00_vec = SV_DUP(0.0);
SV_TYPE temp01_vec = SV_DUP(0.0);
SV_TYPE temp02_vec = SV_DUP(0.0);

i = 0;
BLASLONG sve_size = SV_COUNT();
while ((i + sve_size * 1 - 1) < m) {
SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0);
SV_TYPE x0_vec = svld1(pg_true, x + i);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i);

temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec);

i += sve_size * 1;
}

if (i < m) {
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);

pg00 = svand_z(SV_TRUE(), pg0, pg00);
pg01 = svand_z(SV_TRUE(), pg0, pg01);
pg02 = svand_z(SV_TRUE(), pg0, pg02);
SV_TYPE x0_vec = svld1(pg, x + i);

SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0);
SV_TYPE a00_vec = svld1(pg, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg, a2_ptr + i);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec);
}

if ((j + width * 0) < n) {
temp = svaddv(SV_TRUE(), temp00_vec);
y0_ptr[iy] += alpha * temp;
}
if ((j + width * 1) < n) {
temp = svaddv(SV_TRUE(), temp01_vec);
y1_ptr[iy] += alpha * temp;
}
if ((j + width * 2) < n) {
temp = svaddv(SV_TRUE(), temp02_vec);
y2_ptr[iy] += alpha * temp;
}
y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec);
y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec);
y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec);

iy += inc_y;

a0_ptr += lda;
a1_ptr += lda;
a2_ptr += lda;
}

a_ptr = a2_ptr;
y_ptr = y2_ptr;
for (j = width * 3; j < n; j++) {
SV_TYPE temp_vec = SV_DUP(0.0);

i = 0;
while ((i + sve_size * 1 - 1) < m) {
SV_TYPE x_vec = svld1(pg_true, x + i);

SV_TYPE a_vec = svld1(pg_true, a_ptr + i);

temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec);

i += sve_size * 1;
}

if (i < m) {
SV_TYPE x_vec = svld1(pg, x + i);

SV_TYPE a_vec = svld1(pg, a_ptr + i);

temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec);
}

y_ptr[iy] += alpha * svaddv(pg_true, temp_vec);

iy += inc_y;

a_ptr += lda;
}

return(0);
}

Expand Down

0 comments on commit c0318ce

Please sign in to comment.