x64: Add AVX support for some more float-related instructions #6092

Merged · 3 commits · Mar 29, 2023
32 changes: 28 additions & 4 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -1299,6 +1299,12 @@
Vpmovmskb
Vcvtsi2ss
Vcvtsi2sd
Vcvtss2sd
Vcvtsd2ss
Vsqrtss
Vsqrtsd
Vroundss
Vroundsd
))

(type Avx512Opcode extern
@@ -3348,11 +3354,17 @@
(decl x64_roundss (XmmMem RoundImm) Xmm)
(rule (x64_roundss src1 round)
(xmm_unary_rm_r_imm (SseOpcode.Roundss) src1 (encode_round_imm round)))
(rule 1 (x64_roundss src1 round)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundss) src1 (encode_round_imm round)))

;; Helper for creating `roundsd` instructions.
(decl x64_roundsd (XmmMem RoundImm) Xmm)
(rule (x64_roundsd src1 round)
(xmm_unary_rm_r_imm (SseOpcode.Roundsd) src1 (encode_round_imm round)))
(rule 1 (x64_roundsd src1 round)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundsd) src1 (encode_round_imm round)))

;; Helper for creating `roundps` instructions.
(decl x64_roundps (XmmMem RoundImm) Xmm)
@@ -3985,10 +3997,16 @@
;; Helper for creating `sqrtss` instructions.
(decl x64_sqrtss (XmmMem) Xmm)
(rule (x64_sqrtss x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtss) x))
(rule 1 (x64_sqrtss x)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_vex (AvxOpcode.Vsqrtss) x))

;; Helper for creating `sqrtsd` instructions.
(decl x64_sqrtsd (XmmMem) Xmm)
(rule (x64_sqrtsd x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtsd) x))
(rule 1 (x64_sqrtsd x)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_vex (AvxOpcode.Vsqrtsd) x))

;; Helper for creating `sqrtps` instructions.
(decl x64_sqrtps (XmmMem) Xmm)
@@ -4005,12 +4023,18 @@
(xmm_unary_rm_r_vex (AvxOpcode.Vsqrtpd) x))

;; Helper for creating `cvtss2sd` instructions.
(decl x64_cvtss2sd (Xmm) Xmm)
(rule (x64_cvtss2sd x) (xmm_unary_rm_r (SseOpcode.Cvtss2sd) x))
(decl x64_cvtss2sd (XmmMem) Xmm)
(rule (x64_cvtss2sd x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtss2sd) x))
(rule 1 (x64_cvtss2sd x)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_vex (AvxOpcode.Vcvtss2sd) x))

;; Helper for creating `cvtsd2ss` instructions.
(decl x64_cvtsd2ss (Xmm) Xmm)
(rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x))
(decl x64_cvtsd2ss (XmmMem) Xmm)
(rule (x64_cvtsd2ss x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtsd2ss) x))
(rule 1 (x64_cvtsd2ss x)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_vex (AvxOpcode.Vcvtsd2ss) x))

;; Helper for creating `cvtdq2ps` instructions.
(decl x64_cvtdq2ps (XmmMem) Xmm)
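
The new ISLE helpers above all follow the same pattern already used in this file: the unnumbered rule emits the legacy SSE form, while a higher-priority `rule 1` guarded by `use_avx_simd` selects the VEX-encoded form when AVX is available. A minimal Rust sketch of that dispatch follows; it is purely illustrative, not Cranelift code, and the exact condition behind `use_avx_simd` is an assumption here.

// Illustrative stand-in for the ISA flags the ISLE rules consult.
#[derive(Clone, Copy)]
struct IsaFlags {
    has_avx: bool,
    enable_simd: bool,
}

// Assumed condition for `use_avx_simd`; the real predicate lives in the
// x64 backend and may differ.
fn use_avx_simd(flags: IsaFlags) -> bool {
    flags.has_avx && flags.enable_simd
}

// Which mnemonic an `x64_sqrtss`-style helper would pick: the priority-1
// rule (VEX) shadows the base rule (SSE) whenever its guard holds.
fn x64_sqrtss_opcode(flags: IsaFlags) -> &'static str {
    if use_avx_simd(flags) { "vsqrtss" } else { "sqrtss" }
}

fn main() {
    let avx = IsaFlags { has_avx: true, enable_simd: true };
    let sse_only = IsaFlags { has_avx: false, enable_simd: true };
    assert_eq!(x64_sqrtss_opcode(avx), "vsqrtss");
    assert_eq!(x64_sqrtss_opcode(sse_only), "sqrtss");
}
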
8 changes: 7 additions & 1 deletion cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1722,7 +1722,13 @@ impl AvxOpcode {
| AvxOpcode::Vmovmskpd
| AvxOpcode::Vpmovmskb
| AvxOpcode::Vcvtsi2ss
| AvxOpcode::Vcvtsi2sd => {
| AvxOpcode::Vcvtsi2sd
| AvxOpcode::Vcvtss2sd
| AvxOpcode::Vcvtsd2ss
| AvxOpcode::Vsqrtss
| AvxOpcode::Vsqrtsd
| AvxOpcode::Vroundss
| AvxOpcode::Vroundsd => {
smallvec![InstructionSet::AVX]
}

42 changes: 36 additions & 6 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2405,17 +2405,36 @@ pub(crate) fn emit(
AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),

AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A),
AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A),
AvxOpcode::Vsqrtss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x51),
AvxOpcode::Vsqrtsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x51),

_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};

VexInstruction::new()
let vex = VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.encode(sink);
.rm(src);

// These opcodes take a second operand through `vvvv` which copies
// the upper bits into the destination register. That's not
// reflected in the CLIF instruction, however, since the SSE version
// doesn't have this functionality. Instead just copy whatever
// happens to already be in the destination, which at least is what
// LLVM seems to do.
let vex = match op {
AvxOpcode::Vcvtss2sd
| AvxOpcode::Vcvtsd2ss
| AvxOpcode::Vsqrtss
| AvxOpcode::Vsqrtsd => vex.vvvv(dst.to_real_reg().unwrap().hw_enc()),
_ => vex,
};
vex.encode(sink);
}

Inst::XmmUnaryRmRImmVex { op, src, dst, imm } => {
@@ -2433,18 +2452,29 @@
AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70),
AvxOpcode::Vroundss => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0A),
AvxOpcode::Vroundsd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0B),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};

VexInstruction::new()
let vex = VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.imm(*imm)
.encode(sink);
.imm(*imm);

// See comments in similar block above in `XmmUnaryRmRVex` for what
// this is doing.
let vex = match op {
AvxOpcode::Vroundss | AvxOpcode::Vroundsd => {
vex.vvvv(dst.to_real_reg().unwrap().hw_enc())
}
_ => vex,
};
vex.encode(sink);
}

Inst::XmmMovRMVex { op, src, dst } => {
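
The `vvvv` handling added in `emit.rs` above decides where the untouched upper lanes of a scalar result come from: the AVX scalar forms take a second source register in `vvvv` and copy its upper bits into the destination, and the lowering passes the destination register itself so those bits are simply preserved, which the in-code comment notes appears to match LLVM's choice. Below is a small, self-contained Rust model of that merge behavior for `vsqrtss`; the function name is illustrative and this is not Cranelift code. In the disassembly shown in the new filetests, this is why the VEX forms carry one more register operand than the corresponding VCode line.

// Models `vsqrtss dst, vvvv, src` on a 128-bit register: lane 0 is computed
// from `src`, lanes 1..3 are copied from the `vvvv` source.
fn vsqrtss_model(vvvv: [f32; 4], src: [f32; 4]) -> [f32; 4] {
    [src[0].sqrt(), vvvv[1], vvvv[2], vvvv[3]]
}

fn main() {
    let dst_before = [9.0_f32, 1.0, 2.0, 3.0];
    let src = [4.0_f32, 0.0, 0.0, 0.0];
    // Passing the destination as `vvvv`, as the emit code does, leaves the
    // destination's upper lanes unchanged.
    assert_eq!(vsqrtss_model(dst_before, src), [2.0, 1.0, 2.0, 3.0]);
}
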
104 changes: 104 additions & 0 deletions cranelift/filetests/filetests/isa/x64/ceil-avx.clif
@@ -0,0 +1,104 @@
test compile precise-output
set enable_simd
target x86_64 has_avx

function %f1(f32) -> f32 {
block0(v0: f32):
v1 = ceil v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundss $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundss $2, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f2(f64) -> f64 {
block0(v0: f64):
v1 = ceil v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundsd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundsd $2, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f4(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = ceil v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundps $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundps $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f4(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = ceil v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundpd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundpd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

130 changes: 130 additions & 0 deletions cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif
@@ -0,0 +1,130 @@
test compile precise-output
set enable_simd
target x86_64 has_avx

function %fpromote(f32) -> f64 {
block0(v0: f32):
v1 = fpromote.f64 v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vcvtss2sd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vcvtss2sd %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %fpromote_load(i64, f32) -> f64 {
ss0 = explicit_slot 16

block0(v1: i64, v2: f32):
v3 = stack_addr.i64 ss0
store.f32 v2, v3
v4 = load.f32 v3
v5 = fpromote.f64 v4
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; block0:
; lea rsp(0 + virtual offset), %rdx
; vmovss %xmm0, 0(%rdx)
; vcvtss2sd 0(%rdx), %xmm0
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; block1: ; offset 0x8
; leaq (%rsp), %rdx
; vmovss %xmm0, (%rdx) ; trap: heap_oob
; vcvtss2sd (%rdx), %xmm0, %xmm0 ; trap: heap_oob
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

function %fdemote(f64) -> f32 {
block0(v0: f64):
v1 = fdemote.f32 v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vcvtsd2ss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vcvtsd2ss %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %fdemote_load(i64, f64) -> f32 {
ss0 = explicit_slot 16

block0(v1: i64, v2: f64):
v3 = stack_addr.i64 ss0
store.f64 v2, v3
v4 = load.f64 v3
v5 = fdemote.f32 v4
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; block0:
; lea rsp(0 + virtual offset), %rdx
; vmovsd %xmm0, 0(%rdx)
; vcvtsd2ss 0(%rdx), %xmm0
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; block1: ; offset 0x8
; leaq (%rsp), %rdx
; vmovsd %xmm0, (%rdx) ; trap: heap_oob
; vcvtsd2ss (%rdx), %xmm0, %xmm0 ; trap: heap_oob
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
