Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use .p2align instead of .align for portability on Haswell and Sandy Bridge #1471

Merged
merged 2 commits into from
Feb 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions kernel/x86_64/caxpy_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vmulps (%5), %%ymm0 , %%ymm0 \n\t"
#endif

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x
".align 2 \n\t"
".p2align 1 \n\t"
"vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x
"vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x
"vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x
Expand All @@ -70,7 +70,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part

"vfmadd213ps (%3,%0,4), %%ymm0 , %%ymm5 \n\t"
".align 2 \n\t"
".p2align 1 \n\t"
"vfmadd213ps 32(%3,%0,4), %%ymm0 , %%ymm7 \n\t"
"vfmadd213ps 64(%3,%0,4), %%ymm0 , %%ymm9 \n\t"
"vfmadd213ps 96(%3,%0,4), %%ymm0 , %%ymm11 \n\t"
Expand All @@ -96,7 +96,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vfmadd231ps %%ymm1 , %%ymm10, %%ymm15 \n\t"

"vmovups %%ymm5 , (%3,%0,4) \n\t"
".align 2 \n\t"
".p2align 1 \n\t"
"vmovups %%ymm7 , 32(%3,%0,4) \n\t"
"vmovups %%ymm9 , 64(%3,%0,4) \n\t"
"vmovups %%ymm11, 96(%3,%0,4) \n\t"
Expand Down
6 changes: 3 additions & 3 deletions kernel/x86_64/caxpy_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vmulps (%5), %%ymm0 , %%ymm0 \n\t"
#endif

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x
".align 2 \n\t"
".p2align 1 \n\t"
"vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x
"vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x
"vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x
Expand Down Expand Up @@ -85,7 +85,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vaddps %%ymm10, %%ymm11, %%ymm11 \n\t"

"vmovups %%ymm5 , (%3,%0,4) \n\t"
".align 2 \n\t"
".p2align 1 \n\t"
"vmovups %%ymm7 , 32(%3,%0,4) \n\t"
"vmovups %%ymm9 , 64(%3,%0,4) \n\t"
"vmovups %%ymm11, 96(%3,%0,4) \n\t"
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/cdot_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorps %%ymm6, %%ymm6, %%ymm6 \n\t"
"vxorps %%ymm7, %%ymm7, %%ymm7 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x
"vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/cdot_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorps %%ymm6, %%ymm6, %%ymm6 \n\t"
"vxorps %%ymm7, %%ymm7, %%ymm7 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%ymm8 \n\t" // 2 * x
"vmovups 32(%2,%0,4), %%ymm9 \n\t" // 2 * x
Expand Down
8 changes: 4 additions & 4 deletions kernel/x86_64/cscal_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"subq $16, %0 \n\t"
"jz 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

//"prefetcht0 128(%1) \n\t"
Expand Down Expand Up @@ -156,7 +156,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"subq $16, %0 \n\t"
"jz 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

//"prefetcht0 128(%1) \n\t"
Expand Down Expand Up @@ -245,7 +245,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"subq $16, %0 \n\t"
"jz 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

//"prefetcht0 128(%1) \n\t"
Expand Down Expand Up @@ -312,7 +312,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)

"addq $128, %1 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

//"prefetcht0 128(%1) \n\t"
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/daxpy_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
(
"vbroadcastsd (%4), %%ymm0 \n\t" // alpha

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,8), %%ymm12 \n\t" // 4 * y
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/daxpy_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jz 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmulpd %%ymm4, %%ymm0, %%ymm4 \n\t"
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/ddot_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
"vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 2 * x
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 2 * x
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/ddot_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
"vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vmovups (%2,%0,8), %%ymm12 \n\t" // 2 * x
"vmovups 32(%2,%0,8), %%ymm13 \n\t" // 2 * x
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/dger_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $8, %1 \n\t"
"jz 2f \n\t"

".align 8 \n\t"
".p2align 3 \n\t"
"1: \n\t"

"vmulpd %%xmm4, %%xmm0, %%xmm4 \n\t"
Expand Down
4 changes: 2 additions & 2 deletions kernel/x86_64/dscal_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"subq $1 , %0 \n\t"
"jz 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
// "prefetcht0 640(%1) \n\t"

Expand Down Expand Up @@ -156,7 +156,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"cmpq $0, %0 \n\t"
"je 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups %%xmm0 ,-128(%1) \n\t"
Expand Down
4 changes: 2 additions & 2 deletions kernel/x86_64/dscal_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"subq $1 , %0 \n\t"
"jz 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"prefetcht0 640(%1) \n\t"

Expand Down Expand Up @@ -156,7 +156,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"cmpq $0, %0 \n\t"
"je 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups %%xmm0 ,-128(%1) \n\t"
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/dsymv_L_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[2]
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[3]

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/dsymv_L_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vbroadcastsd 16(%8), %%ymm6 \n\t" // temp1[2]
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[3]

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/dsymv_U_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[3]
"xorq %0,%0 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/dsymv_U_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vbroadcastsd 24(%8), %%ymm7 \n\t" // temp1[3]
"xorq %0,%0 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,8), %%ymm9 \n\t" // 2 * y
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/dtrmm_kernel_4x8_haswell.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA
" cmp $0, %1 \n\t"
" jz 2f \n\t"

" .align 16 \n\t"
" .p2align 4 \n\t"
"1: \n\t"
" vmovups (%2,%0,4) , %%ymm0 \n\t"
" vmovups (%3,%0,8) , %%ymm1 \n\t"
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/dtrsm_kernel_RN_haswell.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" cmpq %1, %0 \n\t"
" je 21f \n\t"

" .align 16 \n\t"
" .p2align 4 \n\t"
"1: \n\t"

" vmovups (%2,%1,4), %%ymm4 \n\t" // read a
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/saxpy_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
(
"vbroadcastss (%4), %%ymm0 \n\t" // alpha

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,4), %%ymm12 \n\t" // 8 * y
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/saxpy_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $32, %1 \n\t"
"jz 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmulps %%ymm4, %%ymm0, %%ymm4 \n\t"
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/sdot_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorps %%ymm6, %%ymm6, %%ymm6 \n\t"
"vxorps %%ymm7, %%ymm7, %%ymm7 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 2 * x
"vmovups 32(%2,%0,4), %%ymm13 \n\t" // 2 * x
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/sdot_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vxorps %%ymm6, %%ymm6, %%ymm6 \n\t"
"vxorps %%ymm7, %%ymm7, %%ymm7 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 2 * x
"vmovups 32(%2,%0,4), %%ymm13 \n\t" // 2 * x
Expand Down
4 changes: 2 additions & 2 deletions kernel/x86_64/sgemv_n_microk_sandy-4.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"je 4f \n\t"


".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
Expand Down Expand Up @@ -299,7 +299,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"je 4f \n\t"


".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/sgemv_t_microk_sandy-4.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"je 4f \n\t"


".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"prefetcht0 384(%2,%0,4) \n\t"
"vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/sger_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jz 2f \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmulps %%xmm4, %%xmm0, %%xmm4 \n\t"
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/ssymv_L_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[2]
"vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3]

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,4), %%xmm9 \n\t" // 2 * y
Expand Down
4 changes: 2 additions & 2 deletions kernel/x86_64/ssymv_L_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vbroadcastss 8(%8), %%xmm6 \n\t" // temp1[2]
"vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3]

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,4), %%xmm9 \n\t" // 2 * y
Expand Down Expand Up @@ -143,7 +143,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vbroadcastss 8(%8), %%ymm6 \n\t" // temp1[2]
"vbroadcastss 12(%8), %%ymm7 \n\t" // temp1[3]

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,4), %%ymm9 \n\t" // 2 * y
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/ssymv_U_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vbroadcastss 12(%8), %%ymm7 \n\t" // temp1[3]
"xorq %0,%0 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,4), %%ymm9 \n\t" // 2 * y
Expand Down
2 changes: 1 addition & 1 deletion kernel/x86_64/ssymv_U_microk_sandy-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vbroadcastss 12(%8), %%ymm7 \n\t" // temp1[3]
"xorq %0,%0 \n\t"

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%3,%0,4), %%ymm9 \n\t" // 2 * y
Expand Down
8 changes: 4 additions & 4 deletions kernel/x86_64/zaxpy_microk_haswell-2.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vmulpd (%5), %%ymm0 , %%ymm0 \n\t"
#endif

".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"

"vmovups (%2,%0,8), %%ymm5 \n\t" // 2 complex values from x
".align 2 \n\t"
".p2align 1 \n\t"
"vmovups 32(%2,%0,8), %%ymm7 \n\t" // 2 complex values from x
"vmovups 64(%2,%0,8), %%ymm9 \n\t" // 2 complex values from x
"vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 complex values from x
Expand All @@ -70,7 +70,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part

"vfmadd213pd (%3,%0,8), %%ymm0 , %%ymm5 \n\t"
".align 2 \n\t"
".p2align 1 \n\t"
"vfmadd213pd 32(%3,%0,8), %%ymm0 , %%ymm7 \n\t"
"vfmadd213pd 64(%3,%0,8), %%ymm0 , %%ymm9 \n\t"
"vfmadd213pd 96(%3,%0,8), %%ymm0 , %%ymm11 \n\t"
Expand All @@ -96,7 +96,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vfmadd231pd %%ymm1 , %%ymm10, %%ymm15 \n\t"

"vmovups %%ymm5 , (%3,%0,8) \n\t"
".align 2 \n\t"
".p2align 1 \n\t"
"vmovups %%ymm7 , 32(%3,%0,8) \n\t"
"vmovups %%ymm9 , 64(%3,%0,8) \n\t"
"vmovups %%ymm11, 96(%3,%0,8) \n\t"
Expand Down
Loading