Closed
Description
This Zig code:
/// Thin wrapper around the LLVM intrinsic `llvm.masked.compressstore.v64i8`.
/// `vec` holds the 64 source bytes, `ptr` is the destination, and `bitstr` is
/// a 64-bit lane mask that is reinterpreted below as a `@Vector(64, u1)`.
/// NOTE(review): per the LLVM LangRef the intrinsic is expected to write the
/// mask-selected lanes contiguously starting at `ptr` and leave the rest of
/// the destination untouched — confirm against the LangRef.
fn compressstore(vec: @Vector(64, u8), ptr: *@Vector(64, u8), bitstr: u64) void {
// The extern declaration is wrapped in an anonymous struct and called
// directly through it, so the intrinsic is only named inside this function.
return struct {
extern fn @"llvm.masked.compressstore.v64i8"(@Vector(64, u8), *@Vector(64, u8), @Vector(64, u1)) callconv(.Unspecified) void;
// @bitCast turns the u64 mask into the @Vector(64, u1) the intrinsic takes.
}.@"llvm.masked.compressstore.v64i8"(vec, ptr, @bitCast(bitstr));
}
/// Exported reproducer: compress-stores `vec` under the mask `bitstr` into a
/// 64-byte-aligned stack buffer, then returns that buffer minus `vec2` using
/// wrapping subtraction (`-%`).
export fn compress(vec: @Vector(64, u8), bitstr: u64, vec2: @Vector(64, u8)) @Vector(64, u8) {
// The buffer starts out undefined; only what compressstore writes is set.
var buffer: [64]u8 align(64) = undefined;
compressstore(vec, &buffer, bitstr);
// NOTE(review): bytes of `buffer` that compressstore did not write are
// presumably still undefined when read here — confirm this is intended.
return buffer -% vec2;
}
Gives us this optimized LLVM code:
; Optimized IR for @compress. The vector result is returned through the sret
; pointer %0; %1 and %3 point to the two input vectors; %2 is the 64-bit mask.
define dso_local void @compress(ptr noalias nocapture nonnull writeonly sret(<64 x i8>) %0, ptr nocapture noundef readonly %1, i64 %2, ptr nocapture noundef readonly %3) local_unnamed_addr {
Entry:
; %4 is the 64-byte-aligned stack buffer; %5 and %6 are the loaded inputs.
%4 = alloca [64 x i8], align 64
%5 = load <64 x i8>, ptr %1, align 64
%6 = load <64 x i8>, ptr %3, align 64
#dbg_value(<64 x i8> %5, !138, !DIExpression(), !139)
#dbg_value(i64 %2, !140, !DIExpression(), !139)
#dbg_value(<64 x i8> %6, !141, !DIExpression(), !139)
#dbg_declare(ptr %4, !142, !DIExpression(), !144)
#dbg_value(<64 x i8> %5, !145, !DIExpression(), !149)
#dbg_value(ptr %4, !151, !DIExpression(), !149)
#dbg_value(i64 %2, !152, !DIExpression(), !149)
; Reinterpret the i64 mask as <64 x i1> and compress-store %5 into the buffer.
%7 = bitcast i64 %2 to <64 x i1>
call fastcc void @llvm.masked.compressstore.v64i8(<64 x i8> %5, ptr nonnull align 64 %4, <64 x i1> %7)
; Reload the whole 64-byte buffer, subtract %6, and store the result via %0.
%8 = load <64 x i8>, ptr %4, align 64
%9 = sub <64 x i8> %8, %6
store <64 x i8> %9, ptr %0, align 64
ret void
}
declare fastcc void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr nocapture, <64 x i1>) #1
Which gets emitted like so for spacemit_x60:
# a0 = pointer for the result, a1 = pointer to the first input vector,
# a2 = 64-bit mask, a3 = pointer to the second input vector.
compress:
# Prologue: reserve 128 bytes, save ra/s0, set s0 to the incoming sp, then
# round sp down to a 64-byte boundary for the aligned buffer.
addi sp, sp, -128
sd ra, 120(sp)
sd s0, 112(sp)
addi s0, sp, 128
andi sp, sp, -64
# Load both 64-byte inputs with vl = 64 (a4), e8, m2.
li a4, 64
vsetvli zero, a4, e8, m2, ta, ma
vle8.v v8, (a1)
vle8.v v10, (a3)
# Move the 64-bit mask from a2 into element 0 of v12.
vsetivli zero, 1, e64, m1, ta, ma
vmv.s.x v12, a2
# Compress the bytes of v8 selected by mask v12 into v14, and count the set
# mask bits into a1.
vsetvli zero, a4, e8, m2, ta, ma
vcompress.vm v14, v8, v12
vcpop.m a1, v12
# Store only the first a1 compressed bytes to the stack buffer at sp.
mv a2, sp
vsetvli zero, a1, e8, m2, ta, ma
vse8.v v14, (a2)
# Reload all 64 bytes of the buffer (vl = 64 again), subtract the second
# input, and store the result through a0.
vsetvli zero, a4, e8, m2, ta, ma
vle8.v v8, (a2)
vsub.vv v8, v8, v10
vse8.v v8, (a0)
# Epilogue: restore sp from the frame pointer, reload ra/s0, and return.
addi sp, s0, -128
ld ra, 120(sp)
ld s0, 112(sp)
addi sp, sp, 128
ret
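Is it necessary to have this section of the assembly, which stores the compressed vector to the stack buffer and then immediately reloads the buffer from the same address?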
vse8.v v14, (a2)
vsetvli zero, a4, e8, m2, ta, ma
vle8.v v8, (a2)
I haven't read that much RISC-V Vector assembly yet, but my hunch is this could be done better.