Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AVX2+] Vectorized 1 << u3 in a byte vector should turn into vpshufb #110317

Closed
Validark opened this issue Sep 27, 2024 · 3 comments · Fixed by #112175
Closed

[AVX2+] Vectorized 1 << u3 in a byte vector should turn into vpshufb #110317

Validark opened this issue Sep 27, 2024 · 3 comments · Fixed by #112175
Assignees

Comments

@Validark
Copy link

Validark commented Sep 27, 2024

This code: (Godbolt link)

export fn foo(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    return @as(@TypeOf(chunk), @splat(1)) << @truncate(chunk);
}
define dso_local range(i8 1, -127) <32 x i8> @foo(<32 x i8> %0) local_unnamed_addr {
Entry:
  %1 = and <32 x i8> %0, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %2 = shl nuw <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %1
  ret <32 x i8> %2
}

Compiles like so for Zen 3:

.LCPI0_1:
        .zero   32,16
.LCPI0_2:
        .zero   32,252
.LCPI0_3:
        .zero   32,224
.LCPI0_4:
        .byte   1
foo:
        vpsllw  ymm0, ymm0, 5
        vpbroadcastb    ymm1, byte ptr [rip + .LCPI0_4]
        vpblendvb       ymm1, ymm1, ymmword ptr [rip + .LCPI0_1], ymm0
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_3]
        vpsllw  ymm2, ymm1, 2
        vpand   ymm2, ymm2, ymmword ptr [rip + .LCPI0_2]
        vpaddb  ymm0, ymm0, ymm0
        vpblendvb       ymm1, ymm1, ymm2, ymm0
        vpaddb  ymm0, ymm0, ymm0
        vpaddb  ymm2, ymm1, ymm1
        vpblendvb       ymm0, ymm1, ymm2, ymm0
        ret

However, because the bytes resulting from @truncate(chunk) are in the range [0, 7], we can precompute all 8 possible answers and use vpshufb instead (Godbolt, full code):

export fn foo2(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    const table = comptime foo(std.simd.repeat(@sizeOf(@TypeOf(chunk)), std.simd.iota(u8, 16)));
    return vpshufb(table, @as(@Vector(32, u3), @truncate(chunk)));
}

fn vpshufb(table: anytype, indices: @TypeOf(table)) @TypeOf(table) {
    if (@inComptime()) {
        var result: @TypeOf(indices) = undefined;
        for (0..@bitSizeOf(@TypeOf(indices)) / 8) |i| {
            const index = indices[i];
            result[i] = if (index >= 0x80) 0 else table[index % (@bitSizeOf(@TypeOf(table)) / 8)];
        }

        return result;
    }

    const methods = struct {
        extern fn @"llvm.x86.avx512.pshuf.b.512"(@Vector(64, u8), @Vector(64, u8)) @Vector(64, u8);
        extern fn @"llvm.x86.avx2.pshuf.b"(@Vector(32, u8), @Vector(32, u8)) @Vector(32, u8);
        extern fn @"llvm.x86.ssse3.pshuf.b.128"(@Vector(16, u8), @Vector(16, u8)) @Vector(16, u8);
    };

    return switch (@TypeOf(table)) {
        @Vector(64, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx512bw)) methods.@"llvm.x86.avx512.pshuf.b.512"(table, indices) else @compileError("CPU target lacks support for vpshufb512"),
        @Vector(32, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) methods.@"llvm.x86.avx2.pshuf.b"(table, indices) else @compileError("CPU target lacks support for vpshufb256"),
        @Vector(16, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) methods.@"llvm.x86.ssse3.pshuf.b.128"(table, indices) else @compileError("CPU target lacks support for vpshufb128"),
        else => @compileError(std.fmt.comptimePrint("Invalid argument type passed to vpshufb: {}\n", .{@TypeOf(table)})),
    };
}
.LCPI0_0:
        .zero   32,7
# Removed dead vector data. See https://github.com/llvm/llvm-project/issues/110305
.LCPI0_2:
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
foo2:
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
        vpbroadcastq    ymm1, qword ptr [rip + .LCPI0_2]
        vpshufb ymm0, ymm1, ymm0
        ret
@Validark Validark changed the title [AVX2+] Vectorized 1 << u4 in a byte vector should turn into vpshufb [AVX2+] Vectorized 1 << u3 in a byte vector should turn into vpshufb Sep 27, 2024
@llvmbot
Copy link
Member

llvmbot commented Sep 27, 2024

@llvm/issue-subscribers-backend-x86

Author: Niles Salter (Validark)

This code: ([Godbolt link](https://zig.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:14,fontUsePx:'0',j:3,lang:zig,selection:(endColumn:2,endLineNumber:3,positionColumn:2,positionLineNumber:3,selectionStartColumn:2,selectionStartLineNumber:3,startColumn:2,startLineNumber:3),source:'export+fn+foo(chunk:+@Vector(32,+u8))+@TypeOf(chunk)+%7B%0A++++return+@as(@TypeOf(chunk),+@splat(1))+%3C%3C+@truncate(chunk+%3E%3E+@splat(4))%3B%0A%7D'),l:'5',n:'1',o:'Zig+source+%233',t:'0')),header:(),k:50.61449749453956,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:ztrunk,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1',verboseDemangling:'0'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:zig,libs:!(),options:'-O+ReleaseFast+-target+x86_64-linux+-mcpu%3Dznver3',overrides:!(),selection:(endColumn:33,endLineNumber:18,positionColumn:33,positionLineNumber:18,selectionStartColumn:33,selectionStartLineNumber:18,startColumn:33,startLineNumber:18),source:3),l:'5',n:'0',o:'+zig+trunk+(Editor+%233)',t:'0')),header:(),k:49.38550250546045,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4))
export fn foo(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    return @as(@TypeOf(chunk), @splat(1)) << @truncate(chunk >> @splat(4));
}
define dso_local range(i8 1, -127) <32 x i8> @foo(<32 x i8> %0) local_unnamed_addr {
Entry:
  %1 = lshr <32 x i8> %0, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %2 = and <32 x i8> %1, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %3 = shl nuw <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
  ret <32 x i8> %3
}

Compiles like so for Zen 3:

.LCPI0_1:
        .zero   32,16
.LCPI0_2:
        .zero   32,252
.LCPI0_3:
        .zero   32,224
.LCPI0_4:
        .byte   1
foo:
        vpsllw  ymm0, ymm0, 1
        vpbroadcastb    ymm1, byte ptr [rip + .LCPI0_4]
        vpblendvb       ymm1, ymm1, ymmword ptr [rip + .LCPI0_1], ymm0
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_3]
        vpsllw  ymm2, ymm1, 2
        vpand   ymm2, ymm2, ymmword ptr [rip + .LCPI0_2]
        vpaddb  ymm0, ymm0, ymm0
        vpblendvb       ymm1, ymm1, ymm2, ymm0
        vpaddb  ymm0, ymm0, ymm0
        vpaddb  ymm2, ymm1, ymm1
        vpblendvb       ymm0, ymm1, ymm2, ymm0
        ret

However, because the bytes resulting from chunk >> @splat(4) are in the range [0, 15], we can precompute all 16 possible answers and use vpshufb instead (Godbolt, full code):

export fn foo2(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    const table = comptime foo(std.simd.repeat(@sizeOf(@TypeOf(chunk)), std.simd.iota(u8, 16) << @splat(4)));
    return vpshufb(table, chunk >> @splat(4));
}

fn vpshufb(table: anytype, indices: @TypeOf(table)) @TypeOf(table) {
    if (@inComptime()) {
        var result: @TypeOf(indices) = undefined;
        for (0..@bitSizeOf(@TypeOf(indices)) / 8) |i| {
            const index = indices[i];
            result[i] = if (index >= 0x80) 0 else table[index % (@bitSizeOf(@TypeOf(table)) / 8)];
        }

        return result;
    }

    const methods = struct {
        extern fn @"llvm.x86.avx512.pshuf.b.512"(@Vector(64, u8), @Vector(64, u8)) @Vector(64, u8);
        extern fn @"llvm.x86.avx2.pshuf.b"(@Vector(32, u8), @Vector(32, u8)) @Vector(32, u8);
        extern fn @"llvm.x86.ssse3.pshuf.b.128"(@Vector(16, u8), @Vector(16, u8)) @Vector(16, u8);
    };

    return switch (@TypeOf(table)) {
        @Vector(64, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx512bw)) methods.@"llvm.x86.avx512.pshuf.b.512"(table, indices) else @compileError("CPU target lacks support for vpshufb512"),
        @Vector(32, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) methods.@"llvm.x86.avx2.pshuf.b"(table, indices) else @compileError("CPU target lacks support for vpshufb256"),
        @Vector(16, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) methods.@"llvm.x86.ssse3.pshuf.b.128"(table, indices) else @compileError("CPU target lacks support for vpshufb128"),
        else => @compileError(std.fmt.comptimePrint("Invalid argument type passed to vpshufb: {}\n", .{@TypeOf(table)})),
    };
}
.LCPI0_0:
        .zero   32,15
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
.LCPI0_2:
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
foo2:
        vpsrlw  ymm0, ymm0, 4
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
        vpbroadcastq    ymm1, qword ptr [rip + .LCPI0_2]
        vpshufb ymm0, ymm1, ymm0
        ret

@RKSimon RKSimon self-assigned this Sep 28, 2024
@RKSimon
Copy link
Collaborator

RKSimon commented Sep 28, 2024

The AND to clamp the shift amount is irrelevant for the SHL -> PSHUFB lowering as anything out of bounds would be poison anyway. All we need is to be shifting a vXi8 splat constant for this to work.

@Validark
Copy link
Author

The AND to clamp the shift amount is irrelevant for the SHL -> PSHUFB lowering as anything out of bounds would be poison anyway. All we need is to be shifting a vXi8 splat constant for this to work.

I included it because you can't really do an out-of-bounds shift in Zig. You have to do a @truncate which gives you the lower log_2(int) bits or an @intCast which is a promise that it's already truncated, and oftentimes an AND gets inserted anyway.

RKSimon added a commit that referenced this issue Oct 11, 2024
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
RKSimon added a commit to RKSimon/llvm-project that referenced this issue Oct 14, 2024
If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount.

Fixes llvm#110317
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this issue Oct 16, 2024
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this issue Oct 16, 2024
…UFB (llvm#112175)

If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount.

Fixes llvm#110317
bricknerb pushed a commit to bricknerb/llvm-project that referenced this issue Oct 17, 2024
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
bricknerb pushed a commit to bricknerb/llvm-project that referenced this issue Oct 17, 2024
…UFB (llvm#112175)

If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount.

Fixes llvm#110317
EricWF pushed a commit to efcs/llvm-project that referenced this issue Oct 22, 2024
…UFB (llvm#112175)

If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount.

Fixes llvm#110317
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

4 participants