Skip to content

[RISC-V] Remove round-trip to memory when using compressstore #113242

Closed
@Validark

Description

@Validark

This Zig code:

/// Thin wrapper that calls the LLVM intrinsic `llvm.masked.compressstore.v64i8`
/// with `vec` as the value operand, `ptr` as the destination, and `bitstr`
/// reinterpreted as the 64-lane `u1` mask operand.
/// Per the LLVM LangRef, the intrinsic stores the mask-selected elements
/// contiguously starting at `ptr`.
fn compressstore(vec: @Vector(64, u8), ptr: *@Vector(64, u8), bitstr: u64) void {
    return struct {
        // Declared `extern` inside an anonymous struct so the intrinsic can be
        // referenced by its exact (dotted) LLVM symbol name.
        extern fn @"llvm.masked.compressstore.v64i8"(@Vector(64, u8), *@Vector(64, u8), @Vector(64, u1)) callconv(.Unspecified) void;
    // `@bitCast` turns the 64-bit integer into the `@Vector(64, u1)` mask.
    }.@"llvm.masked.compressstore.v64i8"(vec, ptr, @bitCast(bitstr));
}

/// Compress-stores `vec` under the mask `bitstr` into a 64-byte stack buffer,
/// then returns the full buffer contents minus `vec2` (lane-wise, wrapping).
export fn compress(vec: @Vector(64, u8), bitstr: u64, vec2: @Vector(64, u8)) @Vector(64, u8) {
    // Starts out `undefined`; any bytes the call below does not write stay undefined.
    var buffer: [64]u8 align(64) = undefined;
    compressstore(vec, &buffer, bitstr);
    // `-%` is Zig's wrapping subtraction operator.
    return buffer -% vec2;
}

Gives us this optimized LLVM IR:

; Optimized IR for `compress`. The <64 x i8> result is returned through the
; sret pointer %0; %1 and %3 point to the two vector arguments (loaded below
; as %5 and %6) and %2 is the 64-bit mask.
define dso_local void @compress(ptr noalias nocapture nonnull writeonly sret(<64 x i8>) %0, ptr nocapture noundef readonly %1, i64 %2, ptr nocapture noundef readonly %3) local_unnamed_addr {
Entry:
  ; %4 is the 64-byte, 64-byte-aligned stack buffer.
  %4 = alloca [64 x i8], align 64
  %5 = load <64 x i8>, ptr %1, align 64
  %6 = load <64 x i8>, ptr %3, align 64
    #dbg_value(<64 x i8> %5, !138, !DIExpression(), !139)
    #dbg_value(i64 %2, !140, !DIExpression(), !139)
    #dbg_value(<64 x i8> %6, !141, !DIExpression(), !139)
    #dbg_declare(ptr %4, !142, !DIExpression(), !144)
    #dbg_value(<64 x i8> %5, !145, !DIExpression(), !149)
    #dbg_value(ptr %4, !151, !DIExpression(), !149)
    #dbg_value(i64 %2, !152, !DIExpression(), !149)
  ; Reinterpret the i64 mask as one i1 per lane.
  %7 = bitcast i64 %2 to <64 x i1>
  ; Compress-store %5 into the stack buffer under mask %7 ...
  call fastcc void @llvm.masked.compressstore.v64i8(<64 x i8> %5, ptr nonnull align 64 %4, <64 x i1> %7)
  ; ... then immediately reload all 64 bytes of that buffer.
  %8 = load <64 x i8>, ptr %4, align 64
  %9 = sub <64 x i8> %8, %6
  store <64 x i8> %9, ptr %0, align 64
  ret void
}

declare fastcc void @llvm.masked.compressstore.v64i8(<64 x i8>, ptr nocapture, <64 x i1>) #1

Which is compiled to the following assembly for the spacemit_x60 CPU:

# RISC-V (RVV) code generated for `compress`.
# On entry: a0 = result pointer, a1 / a3 = pointers to the two input vectors,
# a2 = 64-bit mask (matches the IR signature this was compiled from).
compress:
        # Prologue: reserve a 128-byte frame, save ra/s0, then round sp down
        # to a 64-byte boundary for the aligned stack buffer.
        addi    sp, sp, -128
        sd      ra, 120(sp)
        sd      s0, 112(sp)
        addi    s0, sp, 128
        andi    sp, sp, -64
        # Load both 64-byte input vectors (vl = 64, e8, LMUL = 2).
        li      a4, 64
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a1)
        vle8.v  v10, (a3)
        # Move the 64-bit mask from a2 into element 0 of v12.
        vsetivli        zero, 1, e64, m1, ta, ma
        vmv.s.x v12, a2
        # Compress v8 under mask v12 into v14; a1 = number of set mask bits.
        vsetvli zero, a4, e8, m2, ta, ma
        vcompress.vm    v14, v8, v12
        vcpop.m a1, v12
        # Store only the first a1 (popcount) bytes of v14 to the stack buffer.
        mv      a2, sp
        vsetvli zero, a1, e8, m2, ta, ma
        vse8.v  v14, (a2)
        # Reload all 64 bytes of the same buffer into v8 -- this store/reload
        # pair is the round trip to memory the issue asks about.
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a2)
        # Subtract the second input and store the result through a0.
        vsub.vv v8, v8, v10
        vse8.v  v8, (a0)
        # Epilogue: restore sp from the frame pointer, reload ra/s0, return.
        addi    sp, s0, -128
        ld      ra, 120(sp)
        ld      s0, 112(sp)
        addi    sp, sp, 128
        ret

Is this part of the assembly, which stores the compressed vector to the stack and then immediately loads it back, really necessary?

        vse8.v  v14, (a2)
        vsetvli zero, a4, e8, m2, ta, ma
        vle8.v  v8, (a2)

I haven't read that much RISC-V Vector assembly yet, but my hunch is this could be done better.

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions