-
Notifications
You must be signed in to change notification settings - Fork 12.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AVX2+] Vectorized 1 << u3
in a byte vector should turn into vpshufb
#110317
Comments
1 << u4
in a byte vector should turn into vpshufb
1 << u3
in a byte vector should turn into vpshufb
@llvm/issue-subscribers-backend-x86 Author: Niles Salter (Validark)
This code: ([Godbolt link](https://zig.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:14,fontUsePx:'0',j:3,lang:zig,selection:(endColumn:2,endLineNumber:3,positionColumn:2,positionLineNumber:3,selectionStartColumn:2,selectionStartLineNumber:3,startColumn:2,startLineNumber:3),source:'export+fn+foo(chunk:+@Vector(32,+u8))+@TypeOf(chunk)+%7B%0A++++return+@as(@TypeOf(chunk),+@splat(1))+%3C%3C+@truncate(chunk+%3E%3E+@splat(4))%3B%0A%7D'),l:'5',n:'1',o:'Zig+source+%233',t:'0')),header:(),k:50.61449749453956,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:ztrunk,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1',verboseDemangling:'0'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:zig,libs:!(),options:'-O+ReleaseFast+-target+x86_64-linux+-mcpu%3Dznver3',overrides:!(),selection:(endColumn:33,endLineNumber:18,positionColumn:33,positionLineNumber:18,selectionStartColumn:33,selectionStartLineNumber:18,startColumn:33,startLineNumber:18),source:3),l:'5',n:'0',o:'+zig+trunk+(Editor+%233)',t:'0')),header:(),k:49.38550250546045,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4))
export fn foo(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    return @as(@TypeOf(chunk), @splat(1)) << @truncate(chunk >> @splat(4));
}
define dso_local range(i8 1, -127) <32 x i8> @foo(<32 x i8> %0) local_unnamed_addr {
Entry:
%1 = lshr <32 x i8> %0, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%2 = and <32 x i8> %1, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
%3 = shl nuw <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
ret <32 x i8> %3
}
Compiles like so for Zen 3:
.LCPI0_1:
.zero 32,16
.LCPI0_2:
.zero 32,252
.LCPI0_3:
.zero 32,224
.LCPI0_4:
.byte 1
foo:
vpsllw ymm0, ymm0, 1
vpbroadcastb ymm1, byte ptr [rip + .LCPI0_4]
vpblendvb ymm1, ymm1, ymmword ptr [rip + .LCPI0_1], ymm0
vpand ymm0, ymm0, ymmword ptr [rip + .LCPI0_3]
vpsllw ymm2, ymm1, 2
vpand ymm2, ymm2, ymmword ptr [rip + .LCPI0_2]
vpaddb ymm0, ymm0, ymm0
vpblendvb ymm1, ymm1, ymm2, ymm0
vpaddb ymm0, ymm0, ymm0
vpaddb ymm2, ymm1, ymm1
vpblendvb ymm0, ymm1, ymm2, ymm0
ret
However, because the bytes resulting from `@truncate(chunk)` are in the range [0, 7], we can precompute all 8 possible answers and use vpshufb instead (Godbolt, full code):
export fn foo2(chunk: @Vector(32, u8)) @TypeOf(chunk) {
    const table = comptime foo(std.simd.repeat(@sizeOf(@TypeOf(chunk)), std.simd.iota(u8, 16) << @splat(4)));
    return vpshufb(table, chunk >> @splat(4));
}
fn vpshufb(table: anytype, indices: @TypeOf(table)) @TypeOf(table) {
    if (@inComptime()) {
        var result: @TypeOf(indices) = undefined;
        for (0..@bitSizeOf(@TypeOf(indices)) / 8) |i| {
            const index = indices[i];
            result[i] = if (index >= 0x80) 0 else table[index % (@bitSizeOf(@TypeOf(table)) / 8)];
        }
        return result;
    }
    const methods = struct {
        extern fn @"llvm.x86.avx512.pshuf.b.512"(@Vector(64, u8), @Vector(64, u8)) @Vector(64, u8);
        extern fn @"llvm.x86.avx2.pshuf.b"(@Vector(32, u8), @Vector(32, u8)) @Vector(32, u8);
        extern fn @"llvm.x86.ssse3.pshuf.b.128"(@Vector(16, u8), @Vector(16, u8)) @Vector(16, u8);
    };
    return switch (@TypeOf(table)) {
        @Vector(64, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx512bw)) methods.@"llvm.x86.avx512.pshuf.b.512"(table, indices) else @compileError("CPU target lacks support for vpshufb512"),
        @Vector(32, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) methods.@"llvm.x86.avx2.pshuf.b"(table, indices) else @compileError("CPU target lacks support for vpshufb256"),
        @Vector(16, u8) => if (comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) methods.@"llvm.x86.ssse3.pshuf.b.128"(table, indices) else @compileError("CPU target lacks support for vpshufb128"),
        else => @compileError(std.fmt.comptimePrint("Invalid argument type passed to vpshufb: {}\n", .{@TypeOf(table)})),
    };
}
.LCPI0_0:
.zero 32,15
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
.LCPI0_2:
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
foo2:
vpsrlw ymm0, ymm0, 4
vpand ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
vpbroadcastq ymm1, qword ptr [rip + .LCPI0_2]
vpshufb ymm0, ymm1, ymm0
ret
The AND to clamp the shift amount is irrelevant for the SHL -> PSHUFB lowering as anything out of bounds would be poison anyway. All we need is to be shifting a vXi8 splat constant for this to work.
I included it because you can't really do an out-of-bounds shift in Zig. You have to do a |
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount. Fixes llvm#110317
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
…UFB (llvm#112175) If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount. Fixes llvm#110317
Add tests showing potential to use PSHUFB for shifts of constant uniform values by using a pre-computed LUT of all legal shift amounts
…UFB (llvm#112175) If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount. Fixes llvm#110317
…UFB (llvm#112175) If each 128-bit vXi8 lane is shifting the same constant value, we can pre-compute the 8 valid shift results and use PSHUFB to act as a LUT with the shift amount. Fixes llvm#110317
This code: (Godbolt link)
Compiles like so for Zen 3:
However, because the bytes resulting from
@truncate(chunk)
are in the range [0, 7], we can precompute all 8 possible answers and use vpshufb instead (Godbolt, full code):
The text was updated successfully, but these errors were encountered: