
Commit 96df79a (committed Feb 16, 2023; parent: 7e6e636)

[X86] Support load/store for bf16 in avx

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D144163

2 files changed (+60, -0 lines)

llvm/lib/Target/X86/X86InstrSSE.td (+17 lines)
@@ -577,20 +577,37 @@ let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(alignedloadv8f16 addr:$src),
             (VMOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv8bf16 addr:$src),
+            (VMOVAPSrm addr:$src)>;
   def : Pat<(loadv8f16 addr:$src),
             (VMOVUPSrm addr:$src)>;
+  def : Pat<(loadv8bf16 addr:$src),
+            (VMOVUPSrm addr:$src)>;
   def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
             (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
   def : Pat<(store (v8f16 VR128:$src), addr:$dst),
             (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8bf16 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+
   def : Pat<(alignedloadv16f16 addr:$src),
             (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv16bf16 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
   def : Pat<(loadv16f16 addr:$src),
             (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv16bf16 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
   def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
             (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(store (v16f16 VR256:$src), addr:$dst),
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v16bf16 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
 }

 // Use movaps / movups for SSE integer load / store (one byte shorter).
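These Pat<> records only attach new selection rules to existing instructions; the node fragments they match (loadv8bf16, alignedloadv8bf16, and so on) are declared alongside the backend's other vector load fragments and are not part of this diff. A minimal sketch of the usual PatFrag idiom they are assumed to follow, using the backend's alignedload helper:

    // Sketch (assumed, not from this commit): fragments matching a plain or
    // alignment-checked load and giving the result a bf16 vector type.
    def loadv8bf16        : PatFrag<(ops node:$ptr), (v8bf16 (load node:$ptr))>;
    def alignedloadv8bf16 : PatFrag<(ops node:$ptr),
                                    (v8bf16 (alignedload node:$ptr))>;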
New test file (+43 lines)

@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bf16 | FileCheck %s --check-prefix=X86
+
+define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) {
+; X64-LABEL: funbf16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vmovups (%rdi), %xmm0
+; X64-NEXT:    vmovups %xmm0, (%rsi)
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vmovaps %xmm0, (%rsi)
+; X64-NEXT:    vmovups (%rdi), %ymm0
+; X64-NEXT:    vmovups %ymm0, (%rsi)
+; X64-NEXT:    vmovaps (%rdi), %ymm0
+; X64-NEXT:    vmovaps %ymm0, (%rsi)
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
+;
+; X86-LABEL: funbf16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovups (%ecx), %xmm0
+; X86-NEXT:    vmovups %xmm0, (%eax)
+; X86-NEXT:    vmovaps (%ecx), %xmm0
+; X86-NEXT:    vmovaps %xmm0, (%eax)
+; X86-NEXT:    vmovups (%ecx), %ymm0
+; X86-NEXT:    vmovups %ymm0, (%eax)
+; X86-NEXT:    vmovaps (%ecx), %ymm0
+; X86-NEXT:    vmovaps %ymm0, (%eax)
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
+entry:
+  %0 = load <8 x bfloat>, ptr %src, align 1
+  store <8 x bfloat> %0, ptr %dst, align 1
+  %1 = load <8 x bfloat>, ptr %src, align 32
+  store <8 x bfloat> %1, ptr %dst, align 32
+  %2 = load <16 x bfloat>, ptr %src, align 1
+  store <16 x bfloat> %2, ptr %dst, align 1
+  %3 = load <16 x bfloat>, ptr %src, align 32
+  store <16 x bfloat> %3, ptr %dst, align 32
+  ret void
+}
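Together the IR body and the autogenerated CHECK lines exercise all eight new patterns: align 1 accesses must select the unaligned vmovups forms and align 32 accesses the aligned vmovaps forms, at both 128-bit (xmm) and 256-bit (ymm) width, on both the x86_64 and i686 triples. Since the assertions come from utils/update_llc_test_checks.py (per the NOTE line), they can be regenerated mechanically if the patterns change.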
