Skip to content

Commit

Permalink
Adjust costing of emulated vectorized gather/scatter
Browse files Browse the repository at this point in the history
Emulated gather/scatter behave similarly to strided elementwise
accesses in that they need to decompose the offset vector
and construct or decompose the data vector, so handle them
the same way, pessimizing the cases with many elements.

For pr88531-2c.c instead of

.L4:
        leaq    (%r15,%rcx), %rdx
        incl    %edi
        movl    16(%rdx), %r13d
        movl    24(%rdx), %r14d
        movl    (%rdx), %r10d
        movl    4(%rdx), %r9d
        movl    8(%rdx), %ebx
        movl    12(%rdx), %r11d
        movl    20(%rdx), %r12d
        vmovss  (%rax,%r14,4), %xmm2
        movl    28(%rdx), %edx
        vmovss  (%rax,%r13,4), %xmm1
        vmovss  (%rax,%r10,4), %xmm0
        vinsertps       $0x10, (%rax,%rdx,4), %xmm2, %xmm2
        vinsertps       $0x10, (%rax,%r12,4), %xmm1, %xmm1
        vinsertps       $0x10, (%rax,%r9,4), %xmm0, %xmm0
        vmovlhps        %xmm2, %xmm1, %xmm1
        vmovss  (%rax,%rbx,4), %xmm2
        vinsertps       $0x10, (%rax,%r11,4), %xmm2, %xmm2
        vmovlhps        %xmm2, %xmm0, %xmm0
        vinsertf128     $0x1, %xmm1, %ymm0, %ymm0
        vmulps  %ymm3, %ymm0, %ymm0
        vmovups %ymm0, (%r8,%rcx)
        addq    $32, %rcx
        cmpl    %esi, %edi
        jb      .L4

we now prefer

.L4:
        leaq    0(%rbp,%rdx,8), %rcx
        movl    (%rcx), %r10d
        movl    4(%rcx), %ecx
        vmovss  (%rsi,%r10,4), %xmm0
        vinsertps       $0x10, (%rsi,%rcx,4), %xmm0, %xmm0
        vmulps  %xmm1, %xmm0, %xmm0
        vmovlps %xmm0, (%rbx,%rdx,8)
        incq    %rdx
        cmpl    %edi, %edx
        jb      .L4

	* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
	Tame down element extracts and scalar loads for gather/scatter
	similar to elementwise strided accesses.

	* gcc.target/i386/pr89618-2.c: New testcase.
	* gcc.target/i386/pr88531-2b.c: Adjust.
	* gcc.target/i386/pr88531-2c.c: Likewise.
  • Loading branch information
rguenth committed Apr 28, 2023
1 parent 8b84d87 commit 24905a4
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 4 deletions.
6 changes: 4 additions & 2 deletions gcc/config/i386/i386.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23576,8 +23576,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
&& stmt_info
&& (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
|| STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
&& STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
&& TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
&& ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
&& (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
!= INTEGER_CST))
|| STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
{
stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
Expand Down
2 changes: 1 addition & 1 deletion gcc/testsuite/gcc.target/i386/pr88531-2b.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

#include "pr88531-2a.c"

/* { dg-final { scan-assembler-times "vmulps" 2 } } */
/* { dg-final { scan-assembler-times "vmulps" 1 } } */
2 changes: 1 addition & 1 deletion gcc/testsuite/gcc.target/i386/pr88531-2c.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

#include "pr88531-2a.c"

/* { dg-final { scan-assembler-times "vmulps" 2 } } */
/* { dg-final { scan-assembler-times "vmulps" 1 } } */
23 changes: 23 additions & 0 deletions gcc/testsuite/gcc.target/i386/pr89618-2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/* { dg-do compile } */
/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */

/* Kernel under test.  Processes the first (n / m) * m indices in blocks of
   m = 32: for each index i it stores a[i] if that value is negative, and 0
   otherwise, to a[off[i]].  Any trailing n % m indices are not visited.
   The store address depends on a value loaded from OFF, i.e. it is an
   indirect (scatter-style) store.  */
void foo (int n, int *off, double *a)
{
/* Block size; also the constant trip count of the inner loop.  */
const int m = 32;

/* One iteration per complete block of m indices.  */
for (int j = 0; j < n/m; ++j)
{
/* Half-open index range [start, end) covered by block j.  */
int const start = j*m;
int const end = (j+1)*m;

/* Per the GCC manual, ivdep asserts that the following loop has no
   loop-carried dependences that would prevent SIMD execution.
   NOTE(review): presumably required here because the store to a[off[i]]
   could otherwise be assumed to alias the loads from a[i] -- confirm.  */
#pragma GCC ivdep
for (int i = start; i < end; ++i)
{
/* Read a[i] directly; write through the index loaded from off[i].  */
a[off[i]] = a[i] < 0 ? a[i] : 0;
}
}
}

/* Make sure the cost model selects SSE vectors rather than AVX to avoid
too many scalar ops for the address computes in the loop. */
/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */

0 comments on commit 24905a4

Please sign in to comment.