From b5f288fc7711e5b69486d3d2c0078973889b361a Mon Sep 17 00:00:00 2001 From: BradleyWood Date: Tue, 27 Sep 2022 17:05:40 -0400 Subject: [PATCH] Revert "Preserve ymm/zmm registers on x" This reverts PR #14632. A 10%+ performance regression (issue #15716) reported on JDK 8 was isolated to #14632. Reverting this PR should not cause any functional change; it was only intended to support the Vector API JEPs, which have not yet been enabled on x86 in OpenJ9. A previous attempt at disabling just the extended vector register preservation did not recoup the performance; only completely reverting this PR resolved the regression. This suggests that something in the way the helpers were restructured is causing the performance degradation. This PR will be reverted while the cause is investigated. --- runtime/codert_vm/xnathelp.m4 | 9 +- runtime/jilgen/jilconsts.c | 2 - runtime/oti/j9nonbuilder.h | 9 +- runtime/oti/xhelpers.m4 | 291 +++------------------------------- 4 files changed, 26 insertions(+), 285 deletions(-) diff --git a/runtime/codert_vm/xnathelp.m4 b/runtime/codert_vm/xnathelp.m4 index 581a756c9dd..db4b16c0644 100644 --- a/runtime/codert_vm/xnathelp.m4 +++ b/runtime/codert_vm/xnathelp.m4 @@ -1,4 +1,4 @@ -dnl Copyright (c) 2017, 2022 IBM Corp. and others +dnl Copyright (c) 2017, 2021 IBM Corp. and others dnl dnl This program and the accompanying materials are made available under dnl the terms of the Eclipse Public License 2.0 which accompanies this @@ -1051,13 +1051,10 @@ START_PROC(jitReferenceArrayCopy) mov PARM_REG(2),_rcx mov PARM_REG(1),_rbp call FASTCALL_SYMBOL(impl_jitReferenceArrayCopy,2) - dnl Save return value to check later. - dnl We don't check it now because restoring the register clobbers flags. - mov dword ptr J9TR_VMThread_floatTemp3[_rbp],eax + dnl set ZF if succeed + test _rax,_rax RESTORE_C_VOLATILE_REGS SWITCH_TO_JAVA_STACK - dnl Set ZF on success. 
- test dword ptr J9TR_VMThread_floatTemp3[_rbp], -1 push uword ptr J9TR_VMThread_jitReturnAddress[_rbp] ret END_PROC(jitReferenceArrayCopy) diff --git a/runtime/jilgen/jilconsts.c b/runtime/jilgen/jilconsts.c index a9fe19a0f12..e9169559b7a 100644 --- a/runtime/jilgen/jilconsts.c +++ b/runtime/jilgen/jilconsts.c @@ -393,7 +393,6 @@ writeConstants(OMRPortLibrary *OMRPORTLIB, IDATA fd) writeConstant(OMRPORTLIB, fd, "J9TR_cframe_machineBP", offsetof(J9CInterpreterStackFrame, machineBP)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_jitGPRs", offsetof(J9CInterpreterStackFrame, jitGPRs)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_jitFPRs", offsetof(J9CInterpreterStackFrame, jitFPRs)) | - writeConstant(OMRPORTLIB, fd, "J9TR_cframe_maskRegisters", offsetof(J9CInterpreterStackFrame, maskRegisters)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_rax", offsetof(J9CInterpreterStackFrame, jitGPRs.jitGPRs.named.rax)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_rbx", offsetof(J9CInterpreterStackFrame, jitGPRs.jitGPRs.named.rbx)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_rcx", offsetof(J9CInterpreterStackFrame, jitGPRs.jitGPRs.named.rcx)) | @@ -759,7 +758,6 @@ writeConstants(OMRPortLibrary *OMRPORTLIB, IDATA fd) writeConstant(OMRPORTLIB, fd, "J9TR_ELSSize", sizeof(J9VMEntryLocalStorage)) | writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME_DEBUG_MODE", J9_EXTENDED_RUNTIME_DEBUG_MODE) | writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS", J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS) | - writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS", J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS) | writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME2_COMPRESS_OBJECT_REFERENCES", J9_EXTENDED_RUNTIME2_COMPRESS_OBJECT_REFERENCES) | writeConstant(OMRPORTLIB, fd, "J9TR_J9_INLINE_JNI_MAX_ARG_COUNT", J9_INLINE_JNI_MAX_ARG_COUNT) | diff --git a/runtime/oti/j9nonbuilder.h b/runtime/oti/j9nonbuilder.h index 
5bc90d6b9e9..2b4cbab53b3 100644 --- a/runtime/oti/j9nonbuilder.h +++ b/runtime/oti/j9nonbuilder.h @@ -6134,9 +6134,8 @@ typedef struct J9CInterpreterStackFrame { * * Stack must be 16-byte aligned. */ - U_8 jitFPRs[6 * 64]; /* zmm0-5 512-bit OR xmm0-7 64-bit */ + U_8 jitFPRs[6 * 16]; /* xmm0-5 128-bit OR xmm0-7 64-bit */ U_8 preservedFPRs[10 * 16]; /* xmm6-15 128-bit */ - U_8 maskRegisters[8 * 8]; /* k0-k7 */ UDATA align[1]; /* r15,r14,r13,r12,rdi,rsi,rbx,rbp,return address * RSP is 16-byte aligned at this point @@ -6146,8 +6145,7 @@ typedef struct J9CInterpreterStackFrame { * * Stack must be 16-byte aligned. */ - U_8 jitFPRs[32 * 64]; /* zmm0-31 512-bit OR xmm0-7 64-bit */ - U_8 maskRegisters[8 * 8]; /* k0-k7 */ + U_8 jitFPRs[16 * 16]; /* xmm0-15 128-bit OR xmm0-7 64-bit */ UDATA align[1]; /* r15,r14,r13,r12,rbx,rbp,return address * RSP is 16-byte aligned at this point @@ -6160,8 +6158,7 @@ typedef struct J9CInterpreterStackFrame { */ J9JITGPRSpillArea jitGPRs; UDATA align1[2]; - U_8 jitFPRs[8 * 64]; /* zmm0-7 512-bit */ - U_8 maskRegisters[8 * 8]; /* k0-k7 */ + U_8 jitFPRs[8 * 16]; /* xmm0-7 128-bit */ UDATA align2[1]; /* ebx,edi,esi * ESP is forcibly 16-byte aligned at this point diff --git a/runtime/oti/xhelpers.m4 b/runtime/oti/xhelpers.m4 index f92d0a43144..04477cbf2ae 100644 --- a/runtime/oti/xhelpers.m4 +++ b/runtime/oti/xhelpers.m4 @@ -20,76 +20,8 @@ dnl SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0 WITH Classpath-exc include(jilvalues.m4) -dnl for(=; < ; ++) { } -dnl $1 = symbol name -dnl $2 = starting value -dnl $3 = ending value -dnl $4 = expression -define({forloop}, - {define({$1}, {$2})$4 - ifelse({$2}, {$3}, {},{$0({$1}, incr({$2}), {$3}, {$4})})}) -define({SYM_COUNT},0) -define({INC_SYM_COUNT},{define({SYM_COUNT},incr(SYM_COUNT))}) - J9CONST({CINTERP_STACK_SIZE},J9TR_cframe_sizeof) -dnl Work arround for older versions of MASM which don't support AVX-512 -ifdef({WIN32},{ - - dnl Generate instruction of format OP , [ + rsp] - dnl $1 - 
prefix - dnl $2 - opcode - dnl $3 - reg number - dnl $4 - offset - dnl low 3 bits of register number are stored in modR/M[5:3] - define({INSTRUCTION}, { - dnl prefix - $1 - dnl opcode - BYTE $2 - dnl modR/M byte - BYTE 084h OR (($3 AND 7) SHL 3) - dnl SIB byte - BYTE 024h - dnl displacement - DWORD $4 - }) - - dnl 2 byte VEX prefix - define({VEX2},{BYTE 0c5h, 0f8h}) - - dnl 3 byte VEX prefix with W bit set - define({VEX3},{BYTE 0c4h, 0e1h, 0f8h}) - - dnl EVEX prefix - dnl $1 - register number - dnl bits 3 and 4 of the register number are stored inverted in bits 7 and 4 in the second byte of the EVEX prefix - define({EVEX},{ - BYTE 062h - BYTE 061h OR ((NOT $1 AND 8) SHL 4) OR (NOT $1 AND 010h) - BYTE 0feh, 048h - }) - - dnl $1 = register number - dnl $2 = stack displacment - define({SAVE_MASK_16}, {INSTRUCTION({VEX2}, 090h, {$1}, {$2})}) - define({RESTORE_MASK_16}, {INSTRUCTION({VEX2}, 091h, {$1}, {$2})}) - define({SAVE_MASK_64}, {INSTRUCTION({VEX3}, 090h, {$1}, {$2})}) - define({RESTORE_MASK_64}, {INSTRUCTION({VEX3}, 091h, {$1}, {$2})}) - define({SAVE_ZMM_REG}, {INSTRUCTION({EVEX({$1})}, 07fh, {$1}, {$2})}) - define({RESTORE_ZMM_REG}, {INSTRUCTION({EVEX({$1})}, 06fh, {$1}, {$2})}) - -},{ dnl WIN32 - dnl $1 = register number - dnl $2 = stack displacment - define({SAVE_MASK_16}, {kmovw word ptr $2[_rsp],k{}$1}) - define({RESTORE_MASK_16}, {kmovw k{}$1,word ptr $2[_rsp]}) - define({SAVE_MASK_64}, {kmovq qword ptr $2[_rsp],k{}$1}) - define({RESTORE_MASK_64}, {kmovq k{}$1,qword ptr $2[_rsp]}) - define({SAVE_ZMM_REG}, {vmovdqu64 zmmword ptr $2[_rsp],zmm{}$1}) - define({RESTORE_ZMM_REG}, {vmovdqu64 zmm{}$1,zmmword ptr $2[_rsp]}) - -}) dnl WIN32 ifdef({WIN32},{ define({SHORT_JMP},{short}) @@ -161,6 +93,7 @@ define({SHORT_JMP},{short}) define({FILE_START},{ .intel_syntax noprefix + .arch pentium4 .text }) @@ -383,45 +316,23 @@ ifdef({METHOD_INVOCATION},{ movq qword ptr J9TR_cframe_jitFPRs+(4*8)[_rsp],xmm4 movq qword ptr J9TR_cframe_jitFPRs+(5*8)[_rsp],xmm5 },{ dnl 
METHOD_INVOCATION - mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] - mov r8d,J9TR_JavaVM_extendedRuntimeFlags[r8] - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS - jnz LABEL(L_zmm_save{}SYM_COUNT) - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jz LABEL(L_xmm_save{}SYM_COUNT) - - dnl save YMM registers - forloop({REG_CTR}, 0, 5, {vmovdqu ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp],ymm{}REG_CTR}) - jmp LABEL(L_save_volatile_done{}SYM_COUNT) - - dnl save ZMM registers - LABEL(L_zmm_save{}SYM_COUNT): - forloop({REG_CTR}, 0, 5, {SAVE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jnz LABEL(L_avx_512bw_save{}SYM_COUNT) - - forloop({REG_CTR}, 0, 7, {SAVE_MASK_16(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*2))}) - jmp LABEL(L_save_volatile_done{}SYM_COUNT) - - LABEL(L_avx_512bw_save{}SYM_COUNT): - forloop({REG_CTR}, 0, 7, {SAVE_MASK_64(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*8))}) - jmp LABEL(L_save_volatile_done{}SYM_COUNT) - - dnl save XMM registers - LABEL(L_xmm_save{}SYM_COUNT): movdqa J9TR_cframe_jitFPRs+(0*16)[_rsp],xmm0 movdqa J9TR_cframe_jitFPRs+(1*16)[_rsp],xmm1 movdqa J9TR_cframe_jitFPRs+(2*16)[_rsp],xmm2 movdqa J9TR_cframe_jitFPRs+(3*16)[_rsp],xmm3 movdqa J9TR_cframe_jitFPRs+(4*16)[_rsp],xmm4 movdqa J9TR_cframe_jitFPRs+(5*16)[_rsp],xmm5 - - LABEL(L_save_volatile_done{}SYM_COUNT): - INC_SYM_COUNT() }) dnl METHOD_INVOCATION }) define({RESTORE_C_VOLATILE_REGS},{ + mov rax,qword ptr J9TR_cframe_rax[_rsp] + mov rcx,qword ptr J9TR_cframe_rcx[_rsp] + mov rdx,qword ptr J9TR_cframe_rdx[_rsp] + mov r8,qword ptr J9TR_cframe_r8[_rsp] + mov r9,qword ptr J9TR_cframe_r9[_rsp] + mov r10,qword ptr J9TR_cframe_r10[_rsp] + mov r11,qword ptr J9TR_cframe_r11[_rsp] ifdef({METHOD_INVOCATION},{ movq xmm0,qword ptr J9TR_cframe_jitFPRs+(0*8)[_rsp] movq xmm1,qword ptr J9TR_cframe_jitFPRs+(1*8)[_rsp] @@ -430,51 +341,13 @@ ifdef({METHOD_INVOCATION},{ movq xmm4,qword ptr 
J9TR_cframe_jitFPRs+(4*8)[_rsp] movq xmm5,qword ptr J9TR_cframe_jitFPRs+(5*8)[_rsp] },{ dnl METHOD_INVOCATION - dnl J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS marks if we are using AVX-2 (eg YMM) - dnl J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS marks if we are using AVX-512 (eg ZMM) - dnl No flags means normal SSE registers (XMM) - mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] - mov r8d,J9TR_JavaVM_extendedRuntimeFlags[r8] - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS - jnz LABEL(L_zmm_restore{}SYM_COUNT) - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jz LABEL(L_xmm_restore{}SYM_COUNT) - - dnl restore YMM registers - forloop({REG_CTR}, 0, 5, {vmovdqu ymm{}REG_CTR,ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp]}) - jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - dnl restore ZMM registers - LABEL(L_zmm_restore{}SYM_COUNT): - forloop({REG_CTR}, 0, 5, {RESTORE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jnz LABEL(L_avx_512bw_restore{}SYM_COUNT) - - forloop({REG_CTR}, 0, 7, {RESTORE_MASK_16(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*2))}) - jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - LABEL(L_avx_512bw_restore{}SYM_COUNT): - forloop({REG_CTR}, 0, 7, {RESTORE_MASK_64(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*8))}) - jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - dnl restore XMM registers - LABEL(L_xmm_restore{}SYM_COUNT): movdqa xmm0,J9TR_cframe_jitFPRs+(0*16)[_rsp] movdqa xmm1,J9TR_cframe_jitFPRs+(1*16)[_rsp] movdqa xmm2,J9TR_cframe_jitFPRs+(2*16)[_rsp] movdqa xmm3,J9TR_cframe_jitFPRs+(3*16)[_rsp] movdqa xmm4,J9TR_cframe_jitFPRs+(4*16)[_rsp] movdqa xmm5,J9TR_cframe_jitFPRs+(5*16)[_rsp] - LABEL(L_restore_volatile_done{}SYM_COUNT): - INC_SYM_COUNT() }) dnl METHOD_INVOCATION - mov rax,qword ptr J9TR_cframe_rax[_rsp] - mov rcx,qword ptr J9TR_cframe_rcx[_rsp] - mov rdx,qword ptr J9TR_cframe_rdx[_rsp] - mov r8,qword ptr J9TR_cframe_r8[_rsp] - mov 
r9,qword ptr J9TR_cframe_r9[_rsp] - mov r10,qword ptr J9TR_cframe_r10[_rsp] - mov r11,qword ptr J9TR_cframe_r11[_rsp] }) dnl No need to save/restore xmm8-15 - the stack walker will never need to read @@ -534,35 +407,6 @@ ifdef({METHOD_INVOCATION},{ movq qword ptr J9TR_cframe_jitFPRs+(6*8)[_rsp],xmm6 movq qword ptr J9TR_cframe_jitFPRs+(7*8)[_rsp],xmm7 },{ dnl METHOD_INVOCATION - dnl J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS marks if we are using AVX-2 (eg YMM) - dnl J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS marks if we are using AVX-512 (eg ZMM) - dnl No flags means normal SSE registers (XMM) - mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] - mov r8d,J9TR_JavaVM_extendedRuntimeFlags[r8] - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS - jnz LABEL(L_zmm_save{}SYM_COUNT) - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jz LABEL(L_xmm_save{}SYM_COUNT) - - dnl save YMM registers - forloop({REG_CTR}, 0, 15, {vmovdqu ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp],ymm{}REG_CTR}) - jmp LABEL(L_save_volatile_done{}SYM_COUNT) - - dnl save ZMM registers - LABEL(L_zmm_save{}SYM_COUNT): - forloop({REG_CTR}, 0, 31, {SAVE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jnz LABEL(L_avx_512bw_save{}SYM_COUNT) - - forloop({REG_CTR}, 0, 7, {SAVE_MASK_16(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*2))}) - jmp LABEL(L_save_volatile_done{}SYM_COUNT) - - LABEL(L_avx_512bw_save{}SYM_COUNT): - forloop({REG_CTR}, 0, 7, {SAVE_MASK_64(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*8))}) - jmp LABEL(L_save_volatile_done{}SYM_COUNT) - - dnl save XMM registers - LABEL(L_xmm_save{}SYM_COUNT): movdqa J9TR_cframe_jitFPRs+(0*16)[_rsp],xmm0 movdqa J9TR_cframe_jitFPRs+(1*16)[_rsp],xmm1 movdqa J9TR_cframe_jitFPRs+(2*16)[_rsp],xmm2 @@ -579,13 +423,19 @@ ifdef({METHOD_INVOCATION},{ movdqa J9TR_cframe_jitFPRs+(13*16)[_rsp],xmm13 movdqa J9TR_cframe_jitFPRs+(14*16)[_rsp],xmm14 movdqa 
J9TR_cframe_jitFPRs+(15*16)[_rsp],xmm15 - - LABEL(L_save_volatile_done{}SYM_COUNT): - INC_SYM_COUNT() }) dnl METHOD_INVOCATION }) define({RESTORE_C_VOLATILE_REGS},{ + mov rax,qword ptr J9TR_cframe_rax[_rsp] + mov rcx,qword ptr J9TR_cframe_rcx[_rsp] + mov rdx,qword ptr J9TR_cframe_rdx[_rsp] + mov rdi,qword ptr J9TR_cframe_rdi[_rsp] + mov rsi,qword ptr J9TR_cframe_rsi[_rsp] + mov r8,qword ptr J9TR_cframe_r8[_rsp] + mov r9,qword ptr J9TR_cframe_r9[_rsp] + mov r10,qword ptr J9TR_cframe_r10[_rsp] + mov r11,qword ptr J9TR_cframe_r11[_rsp] ifdef({METHOD_INVOCATION},{ movq xmm0,qword ptr J9TR_cframe_jitFPRs+(0*8)[_rsp] movq xmm1,qword ptr J9TR_cframe_jitFPRs+(1*8)[_rsp] @@ -596,36 +446,6 @@ ifdef({METHOD_INVOCATION},{ movq xmm6,qword ptr J9TR_cframe_jitFPRs+(6*8)[_rsp] movq xmm7,qword ptr J9TR_cframe_jitFPRs+(7*8)[_rsp] },{ dnl METHOD_INVOCATION - - dnl J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS marks if we are using AVX-2 (eg YMM) - dnl J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS marks if we are using AVX-512 (eg ZMM) - dnl No flags means normal SSE registers (XMM) - mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] - mov r8d,J9TR_JavaVM_extendedRuntimeFlags[r8] - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS - jnz LABEL(L_zmm_restore{}SYM_COUNT) - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jz LABEL(L_xmm_restore{}SYM_COUNT) - - dnl restore YMM registers - forloop({REG_CTR}, 0, 15, {vmovdqu ymm{}REG_CTR,ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp]}) - jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - dnl restore ZMM registers - LABEL(L_zmm_restore{}SYM_COUNT): - forloop({REG_CTR}, 0, 31, {RESTORE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) - test r8d,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jnz LABEL(L_avx_512bw_restore{}SYM_COUNT) - - forloop({REG_CTR}, 0, 7, {RESTORE_MASK_16(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*2))}) - jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - LABEL(L_avx_512bw_restore{}SYM_COUNT): 
- forloop({REG_CTR}, 0, 7, {RESTORE_MASK_64(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*8))}) - jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - dnl restore XMM registers - LABEL(L_xmm_restore{}SYM_COUNT): movdqa xmm0,J9TR_cframe_jitFPRs+(0*16)[_rsp] movdqa xmm1,J9TR_cframe_jitFPRs+(1*16)[_rsp] movdqa xmm2,J9TR_cframe_jitFPRs+(2*16)[_rsp] @@ -642,19 +462,7 @@ ifdef({METHOD_INVOCATION},{ movdqa xmm13,J9TR_cframe_jitFPRs+(13*16)[_rsp] movdqa xmm14,J9TR_cframe_jitFPRs+(14*16)[_rsp] movdqa xmm15,J9TR_cframe_jitFPRs+(15*16)[_rsp] - - LABEL(L_restore_volatile_done{}SYM_COUNT): - INC_SYM_COUNT() }) dnl METHOD_INVOCATION - mov rax,qword ptr J9TR_cframe_rax[_rsp] - mov rcx,qword ptr J9TR_cframe_rcx[_rsp] - mov rdx,qword ptr J9TR_cframe_rdx[_rsp] - mov rdi,qword ptr J9TR_cframe_rdi[_rsp] - mov rsi,qword ptr J9TR_cframe_rsi[_rsp] - mov r8,qword ptr J9TR_cframe_r8[_rsp] - mov r9,qword ptr J9TR_cframe_r9[_rsp] - mov r10,qword ptr J9TR_cframe_r10[_rsp] - mov r11,qword ptr J9TR_cframe_r11[_rsp] }) define({SAVE_C_NONVOLATILE_REGS},{ @@ -718,32 +526,6 @@ define({SAVE_C_VOLATILE_REGS},{ ifdef({METHOD_INVOCATION},{ dnl No FP parameter registers },{ dnl METHOD_INVOCATION - mov eax,dword ptr J9TR_VMThread_javaVM[J9VMTHREAD] - mov eax,dword ptr J9TR_JavaVM_extendedRuntimeFlags[eax] - test eax,J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS - jnz LABEL(L_zmm_save{}SYM_COUNT) - test eax,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jz LABEL(L_xmm_save{}SYM_COUNT) - - dnl save YMM registers - forloop({REG_CTR}, 0, 7, {vmovdqu ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp],ymm{}REG_CTR}) - jmp LABEL(L_save_volatile_done{}SYM_COUNT) - - dnl save ZMM registers - LABEL(L_zmm_save{}SYM_COUNT): - forloop({REG_CTR}, 0, 7, {SAVE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) - test eax,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jnz LABEL(L_avx_512bw_save{}SYM_COUNT) - - forloop({REG_CTR}, 0, 7, {SAVE_MASK_16(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*2))}) - jmp 
LABEL(L_save_volatile_done{}SYM_COUNT) - - LABEL(L_avx_512bw_save{}SYM_COUNT): - forloop({REG_CTR}, 0, 7, {SAVE_MASK_64(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*8))}) - jmp LABEL(L_save_volatile_done{}SYM_COUNT) - - dnl save XMM registers - LABEL(L_xmm_save{}SYM_COUNT): movdqa J9TR_cframe_jitFPRs+(0*16)[_rsp],xmm0 movdqa J9TR_cframe_jitFPRs+(1*16)[_rsp],xmm1 movdqa J9TR_cframe_jitFPRs+(2*16)[_rsp],xmm2 @@ -752,43 +534,16 @@ dnl No FP parameter registers movdqa J9TR_cframe_jitFPRs+(5*16)[_rsp],xmm5 movdqa J9TR_cframe_jitFPRs+(6*16)[_rsp],xmm6 movdqa J9TR_cframe_jitFPRs+(7*16)[_rsp],xmm7 - - LABEL(L_save_volatile_done{}SYM_COUNT): - INC_SYM_COUNT() - - mov eax,dword ptr J9TR_cframe_rax[_rsp] }) dnl METHOD_INVOCATION }) define({RESTORE_C_VOLATILE_REGS},{ + mov eax,dword ptr J9TR_cframe_rax[_rsp] + mov ecx,dword ptr J9TR_cframe_rcx[_rsp] + mov edx,dword ptr J9TR_cframe_rdx[_rsp] ifdef({METHOD_INVOCATION},{ dnl No FP parameter registers },{ dnl METHOD_INVOCATION - mov eax,dword ptr J9TR_VMThread_javaVM[J9VMTHREAD] - mov eax,dword ptr J9TR_JavaVM_extendedRuntimeFlags[eax] - test eax,J9TR_J9_EXTENDED_RUNTIME_USE_EXTENDED_VECTOR_REGISTERS - jnz LABEL(L_zmm_restore{}SYM_COUNT) - test eax,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jz LABEL(L_xmm_restore{}SYM_COUNT) - - dnl restore YMM registers - forloop({REG_CTR}, 0, 7, {vmovdqu ymm{}REG_CTR,ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp]}) - jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - dnl restore ZMM registers - LABEL(L_zmm_restore{}SYM_COUNT): - forloop({REG_CTR}, 0, 7, {RESTORE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) - test eax,J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS - jnz LABEL(L_avx_512bw_restore{}SYM_COUNT) - - forloop({REG_CTR}, 0, 7, {RESTORE_MASK_16(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*2))}) - jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - LABEL(L_avx_512bw_restore{}SYM_COUNT): - forloop({REG_CTR}, 0, 7, {RESTORE_MASK_64(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*8))}) 
- jmp LABEL(L_restore_volatile_done{}SYM_COUNT) - - LABEL(L_xmm_restore{}SYM_COUNT): movdqa xmm0,J9TR_cframe_jitFPRs+(0*16)[_rsp] movdqa xmm1,J9TR_cframe_jitFPRs+(1*16)[_rsp] movdqa xmm2,J9TR_cframe_jitFPRs+(2*16)[_rsp] @@ -797,13 +552,7 @@ dnl No FP parameter registers movdqa xmm5,J9TR_cframe_jitFPRs+(5*16)[_rsp] movdqa xmm6,J9TR_cframe_jitFPRs+(6*16)[_rsp] movdqa xmm7,J9TR_cframe_jitFPRs+(7*16)[_rsp] - - LABEL(L_restore_volatile_done{}SYM_COUNT): - INC_SYM_COUNT() }) dnl METHOD_INVOCATION - mov eax,dword ptr J9TR_cframe_rax[_rsp] - mov ecx,dword ptr J9TR_cframe_rcx[_rsp] - mov edx,dword ptr J9TR_cframe_rdx[_rsp] }) define({SAVE_C_NONVOLATILE_REGS},{