Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AVX2+] Vectorized 1 << u3 in a byte vector should turn into vpshufb #110317

Closed
Validark opened this issue Sep 27, 2024 · 3 comments · Fixed by #112175
Closed

[AVX2+] Vectorized 1 << u3 in a byte vector should turn into vpshufb #110317

Validark opened this issue Sep 27, 2024 · 3 comments · Fixed by #112175
Assignees

Comments

@Validark
Copy link

Validark commented Sep 27, 2024

This code: (Godbolt link)

export fn foo(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    return @as(@TypeOf(chunk), @splat(1)) << @truncate(chunk);
}
define dso_local range(i8 1, -127) <32 x i8> @foo(<32 x i8> %0) local_unnamed_addr {
Entry:
  %1 = and <32 x i8> %0, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %2 = shl nuw <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %1
  ret <32 x i8> %2
}

Compiles like so for Zen 3:

.LCPI0_1:
        .zero   32,16
.LCPI0_2:
        .zero   32,252
.LCPI0_3:
        .zero   32,224
.LCPI0_4:
        .byte   1
foo:
        vpsllw  ymm0, ymm0, 5
        vpbroadcastb    ymm1, byte ptr [rip + .LCPI0_4]
        vpblendvb       ymm1, ymm1, ymmword ptr [rip + .LCPI0_1], ymm0
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_3]
        vpsllw  ymm2, ymm1, 2
        vpand   ymm2, ymm2, ymmword ptr [rip + .LCPI0_2]
        vpaddb  ymm0, ymm0, ymm0
        vpblendvb       ymm1, ymm1, ymm2, ymm0
        vpaddb  ymm0, ymm0, ymm0
        vpaddb  ymm2, ymm1, ymm1
        vpblendvb       ymm0, ymm1, ymm2, ymm0
        ret

However, because the bytes resulting from @truncate(chunk) are in the range [0, 7], we can precompute all 8 possible answers and use vpshufb instead (Godbolt, full code):

export fn foo2(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    const table = comptime foo(std.simd.repeat(@sizeOf(@TypeOf(chunk)), std.simd.iota(u8, 16)));
    return vpshufb(table, @as(@Vector(32, u3), @truncate(chunk)));
}

fn vpshufb(table: anytype, indices: @TypeOf(table)) @TypeOf(table) {
    if (@inComptime()) {
        var result: @TypeOf(indices) = undefined;
        for (0..@bitSizeOf(@TypeOf(indices)) / 8) |i| {
            const index = indices[i];
            result[i] = if (index >= 0x80) 0 else table[index % (@bitSizeOf(@TypeOf(table)) / 8)];
        }

        return result;
    }

    const methods = struct {
        extern fn @"llvm.x86.avx512.pshuf.b.512"(@Vector(64, u8), @Vector(64, u8)) @Vector(64, u8);
        extern fn @"llvm.x86.avx2.pshuf.b"(@Vector(32, u8), @Vector(32, u8)) @Vector(32, u8);
        extern fn @"llvm.x86.ssse3.pshuf.b.128"(@Vector(16, u8), @Vector(16, u8)) @Vector(16, u8);
    };

    return switch (@TypeOf(table)) {
        @Vector(64, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx512bw)) methods.@"llvm.x86.avx512.pshuf.b.512"(table, indices) else @compileError("CPU target lacks support for vpshufb512"),
        @Vector(32, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) methods.@"llvm.x86.avx2.pshuf.b"(table, indices) else @compileError("CPU target lacks support for vpshufb256"),
        @Vector(16, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) methods.@"llvm.x86.ssse3.pshuf.b.128"(table, indices) else @compileError("CPU target lacks support for vpshufb128"),
        else => @compileError(std.fmt.comptimePrint("Invalid argument type passed to vpshufb: {}\n", .{@TypeOf(table)})),
    };
}
.LCPI0_0:
        .zero   32,7
# Removed dead vector data. See https://github.com/llvm/llvm-project/issues/110305
.LCPI0_2:
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
foo2:
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
        vpbroadcastq    ymm1, qword ptr [rip + .LCPI0_2]
        vpshufb ymm0, ymm1, ymm0
        ret
@Validark Validark changed the title [AVX2+] Vectorized 1 << u4 in a byte vector should turn into vpshufb [AVX2+] Vectorized 1 << u3 in a byte vector should turn into vpshufb Sep 27, 2024
@llvmbot
Copy link
Member

llvmbot commented Sep 27, 2024

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)

This code: ([Godbolt link](https://zig.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:14,fontUsePx:'0',j:3,lang:zig,selection:(endColumn:2,endLineNumber:3,positionColumn:2,positionLineNumber:3,selectionStartColumn:2,selectionStartLineNumber:3,startColumn:2,startLineNumber:3),source:'export+fn+foo(chunk:+@Vector(32,+u8))+@TypeOf(chunk)+%7B%0A++++return+@as(@TypeOf(chunk),+@splat(1))+%3C%3C+@truncate(chunk+%3E%3E+@splat(4))%3B%0A%7D'),l:'5',n:'1',o:'Zig+source+%233',t:'0')),header:(),k:50.61449749453956,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:ztrunk,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1',verboseDemangling:'0'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:zig,libs:!(),options:'-O+ReleaseFast+-target+x86_64-linux+-mcpu%3Dznver3',overrides:!(),selection:(endColumn:33,endLineNumber:18,positionColumn:33,positionLineNumber:18,selectionStartColumn:33,selectionStartLineNumber:18,startColumn:33,startLineNumber:18),source:3),l:'5',n:'0',o:'+zig+trunk+(Editor+%233)',t:'0')),header:(),k:49.38550250546045,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4))
export fn foo(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    return @as(@TypeOf(chunk), @splat(1)) << @truncate(chunk >> @splat(4));
}
define dso_local range(i8 1, -127) <32 x i8> @foo(<32 x i8> %0) local_unnamed_addr {
Entry:
  %1 = lshr <32 x i8> %0, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %2 = and <32 x i8> %1, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %3 = shl nuw <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
  ret <32 x i8> %3
}

Compiles like so for Zen 3:

.LCPI0_1:
        .zero   32,16
.LCPI0_2:
        .zero   32,252
.LCPI0_3:
        .zero   32,224
.LCPI0_4:
        .byte   1
foo:
        vpsllw  ymm0, ymm0, 1
        vpbroadcastb    ymm1, byte ptr [rip + .LCPI0_4]
        vpblendvb       ymm1, ymm1, ymmword ptr [rip + .LCPI0_1], ymm0
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_3]
        vpsllw  ymm2, ymm1, 2
        vpand   ymm2, ymm2, ymmword ptr [rip + .LCPI0_2]
        vpaddb  ymm0, ymm0, ymm0
        vpblendvb       ymm1, ymm1, ymm2, ymm0
        vpaddb  ymm0, ymm0, ymm0
        vpaddb  ymm2, ymm1, ymm1
        vpblendvb       ymm0, ymm1, ymm2, ymm0
        ret

However, because the bytes resulting from chunk >> @splat(4) are in the range [0, 15], we can precompute all 16 possible answers and use vpshufb instead (Godbolt, full code):

export fn foo2(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    const table = comptime foo(std.simd.repeat(@sizeOf(@TypeOf(chunk)), std.simd.iota(u8, 16) << @splat(4)));
    return vpshufb(table, chunk >> @splat(4));
}

fn vpshufb(table: anytype, indices: @TypeOf(table)) @TypeOf(table) {
    if (@inComptime()) {
        var result: @TypeOf(indices) = undefined;
        for (0..@bitSizeOf(@TypeOf(indices)) / 8) |i| {
            const index = indices[i];
            result[i] = if (index >= 0x80) 0 else table[index % (@bitSizeOf(@TypeOf(table)) / 8)];
        }

        return result;
    }

    const methods = struct {
        extern fn @"llvm.x86.avx512.pshuf.b.512"(@Vector(64, u8), @Vector(64, u8)) @Vector(64, u8);
        extern fn @"llvm.x86.avx2.pshuf.b"(@Vector(32, u8), @Vector(32, u8)) @Vector(32, u8);
        extern fn @"llvm.x86.ssse3.pshuf.b.128"(@Vector(16, u8), @Vector(16, u8)) @Vector(16, u8);
    };

    return switch (@TypeOf(table)) {
        @Vector(64, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx512bw)) methods.@"llvm.x86.avx512.pshuf.b.512"(table, indices) else @compileError("CPU target lacks support for vpshufb512"),
        @Vector(32, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) methods.@"llvm.x86.avx2.pshuf.b"(table, indices) else @compileError("CPU target lacks support for vpshufb256"),
        @Vector(16, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) methods.@"llvm.x86.ssse3.pshuf.b.128"(table, indices) else @compileError("CPU target lacks support for vpshufb128"),
        else => @compileError(std.fmt.comptimePrint("Invalid argument type passed to vpshufb: {}\n", .{@TypeOf(table)})),
    };
}
.LCPI0_0:
        .zero   32,15
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
.LCPI0_2:
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
foo2:
        vpsrlw  ymm0, ymm0, 4
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
        vpbroadcastq    ymm1, qword ptr [rip + .LCPI0_2]
        vpshufb ymm0, ymm1, ymm0
        ret

@RKSimon RKSimon self-assigned this Sep 28, 2024
@RKSimon
Copy link
Collaborator

RKSimon commented Sep 28, 2024

The AND to clamp the shift amount is irrelevant for the SHL -> PSHUFB lowering as anything out of bounds would be poison anyway. All we need is to be shifting a vXi8 splat constant for this to work.

@Validark
Copy link
Author

The AND to clamp the shift amount is irrelevant for the SHL -> PSHUFB lowering as anything out of bounds would be poison anyway. All we need is to be shifting a vXi8 splat constant for this to work.

I included it because you can't really do an out-of-bounds shift in Zig. You have to do a @truncate which gives you the lower log_2(int) bits or an @intCast which is a promise that it's already truncated, and oftentimes an AND gets inserted anyway.

RKSimon added a commit that referenced this issue Oct 11, 2024
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
RKSimon added a commit to RKSimon/llvm-project that referenced this issue Oct 14, 2024
If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount.

Fixes llvm#110317
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this issue Oct 16, 2024
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this issue Oct 16, 2024
…UFB (llvm#112175)

If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount.

Fixes llvm#110317
bricknerb pushed a commit to bricknerb/llvm-project that referenced this issue Oct 17, 2024
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
bricknerb pushed a commit to bricknerb/llvm-project that referenced this issue Oct 17, 2024
…UFB (llvm#112175)

If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount.

Fixes llvm#110317
EricWF pushed a commit to efcs/llvm-project that referenced this issue Oct 22, 2024
…UFB (llvm#112175)

If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount.

Fixes llvm#110317
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

4 participants