Skip to content

[X86] AMD Znver2 (Rome) Scheduler enablement #90

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86.td
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ include "X86SchedHaswell.td"
include "X86SchedBroadwell.td"
include "X86ScheduleSLM.td"
include "X86ScheduleZnver1.td"
+include "X86ScheduleZnver2.td"
include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
Expand Down Expand Up @@ -1204,7 +1205,7 @@ def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>;
def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>;

def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
-def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>;
+def : ProcessorModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features>;

def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
Feature3DNowA, FeatureInsertVZEROUPPER]>;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2851,7 +2851,7 @@ let SchedRW = [WriteStore], Defs = [EFLAGS] in {
//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
-let SchedRW = [WriteSystem] in {
+let SchedRW = [WriteLoad] in {
let Uses = [EAX] in
def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
TB, Requires<[HasCLZERO, Not64BitMode]>;
Expand Down
1,548 changes: 1,548 additions & 0 deletions llvm/lib/Target/X86/X86ScheduleZnver2.td

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions llvm/test/MC/X86/x86_long_nop.s
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=btver2 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver2 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
+# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver2 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15

# Ensure alignment directives also emit sequences of 10, 11 and 15-byte NOPs on processors
# capable of using long NOPs.
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/X86/Generic/resources-clzero.s
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ clzero
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 1 100 0.33 U clzero
+# CHECK-NEXT: 1 5 0.50 U clzero

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
Expand All @@ -26,8 +26,8 @@ clzero

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
-# CHECK-NEXT: - - 0.33 0.33 - 0.33 - -
+# CHECK-NEXT: - - - - - - 0.50 0.50

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
-# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - clzero
+# CHECK-NEXT: - - - - - - 0.50 0.50 clzero
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/X86/Znver1/resources-clzero.s
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ clzero
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 1 100 0.25 U clzero
+# CHECK-NEXT: 1 8 0.50 U clzero

# CHECK: Resources:
# CHECK-NEXT: [0] - ZnAGU0
Expand All @@ -30,8 +30,8 @@ clzero

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
-# CHECK-NEXT: - - - - - - - - - - - -
+# CHECK-NEXT: 0.50 0.50 - - - - - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
-# CHECK-NEXT: - - - - - - - - - - - - clzero
+# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - clzero
47 changes: 47 additions & 0 deletions llvm/test/tools/llvm-mca/X86/Znver2/partial-reg-update-2.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s

imul %rax, %rbx
lzcnt %ax, %bx
add %ecx, %ebx

# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 3
# CHECK-NEXT: Total Cycles: 9
# CHECK-NEXT: Total uOps: 4

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.44
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 4 1.00 imulq %rax, %rbx
# CHECK-NEXT: 1 1 0.25 lzcntw %ax, %bx
# CHECK-NEXT: 1 1 0.25 addl %ecx, %ebx

# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678

# CHECK: [0,0] DeeeeER . imulq %rax, %rbx
# CHECK-NEXT: [0,1] D====eER. lzcntw %ax, %bx
# CHECK-NEXT: [0,2] D=====eER addl %ecx, %ebx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rbx
# CHECK-NEXT: 1. 1 5.0 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 1 6.0 0.0 0.0 addl %ecx, %ebx
91 changes: 91 additions & 0 deletions llvm/test/tools/llvm-mca/X86/Znver2/partial-reg-update-3.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2 -iterations=1500 -timeline -timeline-max-iterations=6 < %s | FileCheck %s

# The ILP is limited by the false dependency on %dx. So, the mov cannot execute
# in parallel with the add.

add %cx, %dx
mov %ax, %dx
xor %bx, %dx

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
# CHECK-NEXT: Total Cycles: 4503
# CHECK-NEXT: Total uOps: 4500

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 0.8

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.25 addw %cx, %dx
# CHECK-NEXT: 1 1 0.25 movw %ax, %dx
# CHECK-NEXT: 1 1 0.25 xorw %bx, %dx

# CHECK: Resources:
# CHECK-NEXT: [0] - Zn2AGU0
# CHECK-NEXT: [1] - Zn2AGU1
# CHECK-NEXT: [2] - Zn2AGU2
# CHECK-NEXT: [3] - Zn2ALU0
# CHECK-NEXT: [4] - Zn2ALU1
# CHECK-NEXT: [5] - Zn2ALU2
# CHECK-NEXT: [6] - Zn2ALU3
# CHECK-NEXT: [7] - Zn2Divider
# CHECK-NEXT: [8] - Zn2FPU0
# CHECK-NEXT: [9] - Zn2FPU1
# CHECK-NEXT: [10] - Zn2FPU2
# CHECK-NEXT: [11] - Zn2FPU3
# CHECK-NEXT: [12] - Zn2Multiplier

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12]
# CHECK-NEXT: - - - 0.75 0.75 0.75 0.75 - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions:
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - addw %cx, %dx
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - movw %ax, %dx
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - xorw %bx, %dx

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 0

# CHECK: [0,0] DeER . . . . addw %cx, %dx
# CHECK-NEXT: [0,1] D=eER. . . . movw %ax, %dx
# CHECK-NEXT: [0,2] D==eER . . . xorw %bx, %dx
# CHECK-NEXT: [1,0] D===eER . . . addw %cx, %dx
# CHECK-NEXT: [1,1] .D===eER . . . movw %ax, %dx
# CHECK-NEXT: [1,2] .D====eER . . . xorw %bx, %dx
# CHECK-NEXT: [2,0] .D=====eER. . . addw %cx, %dx
# CHECK-NEXT: [2,1] .D======eER . . movw %ax, %dx
# CHECK-NEXT: [2,2] . D======eER . . xorw %bx, %dx
# CHECK-NEXT: [3,0] . D=======eER . . addw %cx, %dx
# CHECK-NEXT: [3,1] . D========eER . . movw %ax, %dx
# CHECK-NEXT: [3,2] . D=========eER. . xorw %bx, %dx
# CHECK-NEXT: [4,0] . D=========eER . addw %cx, %dx
# CHECK-NEXT: [4,1] . D==========eER . movw %ax, %dx
# CHECK-NEXT: [4,2] . D===========eER . xorw %bx, %dx
# CHECK-NEXT: [5,0] . D============eER . addw %cx, %dx
# CHECK-NEXT: [5,1] . D============eER. movw %ax, %dx
# CHECK-NEXT: [5,2] . D=============eER xorw %bx, %dx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 6 7.0 0.2 0.0 addw %cx, %dx
# CHECK-NEXT: 1. 6 7.7 0.0 0.0 movw %ax, %dx
# CHECK-NEXT: 2. 6 8.5 0.0 0.0 xorw %bx, %dx
94 changes: 94 additions & 0 deletions llvm/test/tools/llvm-mca/X86/Znver2/partial-reg-update-4.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2 -iterations=1500 -timeline -timeline-max-iterations=7 < %s | FileCheck %s

# The lzcnt cannot execute in parallel with the imul because there is a false
# dependency on %bx.

imul %ax, %bx
lzcnt %ax, %bx
add %cx, %bx

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
# CHECK-NEXT: Total Cycles: 7503
# CHECK-NEXT: Total uOps: 4500

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.60
# CHECK-NEXT: IPC: 0.60
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 3 1.00 imulw %ax, %bx
# CHECK-NEXT: 1 1 0.25 lzcntw %ax, %bx
# CHECK-NEXT: 1 1 0.25 addw %cx, %bx

# CHECK: Resources:
# CHECK-NEXT: [0] - Zn2AGU0
# CHECK-NEXT: [1] - Zn2AGU1
# CHECK-NEXT: [2] - Zn2AGU2
# CHECK-NEXT: [3] - Zn2ALU0
# CHECK-NEXT: [4] - Zn2ALU1
# CHECK-NEXT: [5] - Zn2ALU2
# CHECK-NEXT: [6] - Zn2ALU3
# CHECK-NEXT: [7] - Zn2Divider
# CHECK-NEXT: [8] - Zn2FPU0
# CHECK-NEXT: [9] - Zn2FPU1
# CHECK-NEXT: [10] - Zn2FPU2
# CHECK-NEXT: [11] - Zn2FPU3
# CHECK-NEXT: [12] - Zn2Multiplier

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12]
# CHECK-NEXT: - - - 0.67 1.00 0.67 0.67 - - - - - 1.00

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions:
# CHECK-NEXT: - - - - 1.00 - - - - - - - 1.00 imulw %ax, %bx
# CHECK-NEXT: - - - 0.33 - 0.33 0.33 - - - - - - lzcntw %ax, %bx
# CHECK-NEXT: - - - 0.33 - 0.33 0.33 - - - - - - addw %cx, %bx

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 01234567
# CHECK-NEXT: Index 0123456789 0123456789

# CHECK: [0,0] DeeeER . . . . . . . imulw %ax, %bx
# CHECK-NEXT: [0,1] D===eER . . . . . . . lzcntw %ax, %bx
# CHECK-NEXT: [0,2] D====eER . . . . . . . addw %cx, %bx
# CHECK-NEXT: [1,0] D=====eeeER . . . . . . imulw %ax, %bx
# CHECK-NEXT: [1,1] .D=======eER . . . . . . lzcntw %ax, %bx
# CHECK-NEXT: [1,2] .D========eER . . . . . . addw %cx, %bx
# CHECK-NEXT: [2,0] .D=========eeeER . . . . . imulw %ax, %bx
# CHECK-NEXT: [2,1] .D============eER . . . . . lzcntw %ax, %bx
# CHECK-NEXT: [2,2] . D============eER . . . . . addw %cx, %bx
# CHECK-NEXT: [3,0] . D=============eeeER . . . . imulw %ax, %bx
# CHECK-NEXT: [3,1] . D================eER . . . . lzcntw %ax, %bx
# CHECK-NEXT: [3,2] . D=================eER . . . . addw %cx, %bx
# CHECK-NEXT: [4,0] . D=================eeeER . . . imulw %ax, %bx
# CHECK-NEXT: [4,1] . D====================eER . . . lzcntw %ax, %bx
# CHECK-NEXT: [4,2] . D=====================eER . . . addw %cx, %bx
# CHECK-NEXT: [5,0] . D======================eeeER . . imulw %ax, %bx
# CHECK-NEXT: [5,1] . D========================eER . . lzcntw %ax, %bx
# CHECK-NEXT: [5,2] . D=========================eER . . addw %cx, %bx
# CHECK-NEXT: [6,0] . D==========================eeeER . imulw %ax, %bx
# CHECK-NEXT: [6,1] . D=============================eER. lzcntw %ax, %bx
# CHECK-NEXT: [6,2] . D=============================eER addw %cx, %bx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 7 14.1 0.1 0.0 imulw %ax, %bx
# CHECK-NEXT: 1. 7 16.9 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 7 17.6 0.0 0.0 addw %cx, %bx
70 changes: 70 additions & 0 deletions llvm/test/tools/llvm-mca/X86/Znver2/partial-reg-update-5.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2 -iterations=1500 -timeline -timeline-max-iterations=8 < %s | FileCheck %s

lzcnt %ax, %bx ## partial register stall.

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 1500
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 1500

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 0.3

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.25 lzcntw %ax, %bx

# CHECK: Resources:
# CHECK-NEXT: [0] - Zn2AGU0
# CHECK-NEXT: [1] - Zn2AGU1
# CHECK-NEXT: [2] - Zn2AGU2
# CHECK-NEXT: [3] - Zn2ALU0
# CHECK-NEXT: [4] - Zn2ALU1
# CHECK-NEXT: [5] - Zn2ALU2
# CHECK-NEXT: [6] - Zn2ALU3
# CHECK-NEXT: [7] - Zn2Divider
# CHECK-NEXT: [8] - Zn2FPU0
# CHECK-NEXT: [9] - Zn2FPU1
# CHECK-NEXT: [10] - Zn2FPU2
# CHECK-NEXT: [11] - Zn2FPU3
# CHECK-NEXT: [12] - Zn2Multiplier

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12]
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions:
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - lzcntw %ax, %bx

# CHECK: Timeline view:
# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeER . . lzcntw %ax, %bx
# CHECK-NEXT: [1,0] D=eER. . lzcntw %ax, %bx
# CHECK-NEXT: [2,0] D==eER . lzcntw %ax, %bx
# CHECK-NEXT: [3,0] D===eER . lzcntw %ax, %bx
# CHECK-NEXT: [4,0] .D===eER . lzcntw %ax, %bx
# CHECK-NEXT: [5,0] .D====eER . lzcntw %ax, %bx
# CHECK-NEXT: [6,0] .D=====eER. lzcntw %ax, %bx
# CHECK-NEXT: [7,0] .D======eER lzcntw %ax, %bx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 8 4.0 0.1 0.0 lzcntw %ax, %bx
Loading