From d5f8e45c621eea122cf558c366054951261fd356 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Mon, 29 Jul 2024 05:24:26 +0100 Subject: [PATCH] Update Keccak example to use stack spilling --- example.py | 37 +- .../aarch64/keccak_f1600_x1_scalar_slothy.s | 488 +++++------ ...eccak_f1600_x1_scalar_slothy_a55_opt_a55.s | 825 +++++++++--------- 3 files changed, 674 insertions(+), 676 deletions(-) diff --git a/example.py b/example.py index 681347d1..6c9bedf2 100644 --- a/example.py +++ b/example.py @@ -1395,30 +1395,39 @@ def core(self, slothy): slothy.optimize(start="loop_0", end="end_loop_0") slothy.optimize(start="loop_1", end="end_loop_1") -class neon_keccak_x1(Example): +class neon_keccak_x1_no_symbolic(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): - name = "keccak_f1600_x1_scalar_slothy" + name = "keccak_f1600_x1_scalar_slothy_no_symbolic" infile = "keccak_f1600_x1_scalar_slothy" + outfile = "keccak_f1600_x1_scalar_no_symbolic" - if var != "": - name += f"_{var}" - infile += f"_{var}" - name += f"_{target_label_dict[target]}" - - super().__init__(infile, name, outfile=name, rename=True, arch=arch, target=target) + super().__init__(infile, name, outfile=outfile, rename=True, arch=arch, target=target) def core(self, slothy): + slothy.config.reserved_regs = ["x18", "sp"] + slothy.config.inputs_are_outputs = True - slothy.config.variable_size = True - slothy.config.visualize_expected_performance = True - slothy.config.timeout = 3600*24 + slothy.config.variable_size = False + slothy.config.visualize_expected_performance = False + slothy.config.timeout = 3600 - slothy.config.outputs = ["x27"] + slothy.config.selfcheck_failure_logfile = "selfcheck_fail.log" + + slothy.config.outputs = ["flags"] + slothy.config.constraints.stalls_first_attempt = 0 + slothy.config.ignore_objective = True + slothy.config.constraints.minimize_spills = True slothy.config.constraints.functional_only = True - slothy.config.constraints.stalls_first_attempt = 32 + slothy.config.constraints.allow_reordering = False + slothy.config.constraints.allow_spills = True + slothy.config.visualize_expected_performance = True +# slothy.config.visualize_show_old_code = True slothy.optimize(start="loop", end="end_loop") + slothy.config.outputs = ["hint_STACK_OFFSET_COUNT"] + slothy.optimize(start="initial_round_start", end="initial_round_end") + ############################################################################################# @@ -1563,7 +1572,7 @@ def main(): fft_fixedpoint_radix4(), # Keccak neon_keccak_x4(), - neon_keccak_x1(), + neon_keccak_x1_no_symbolic(), ] all_example_names = [e.name for e in examples] diff --git a/examples/naive/aarch64/keccak_f1600_x1_scalar_slothy.s b/examples/naive/aarch64/keccak_f1600_x1_scalar_slothy.s index a1fd2927..e07cf1a6 100644 --- a/examples/naive/aarch64/keccak_f1600_x1_scalar_slothy.s +++ b/examples/naive/aarch64/keccak_f1600_x1_scalar_slothy.s @@ -63,8 +63,6 @@ round_constants: input_addr .req x0 const_addr .req x26 - cur_const .req x26 - count .req w27 /* Mapping of Kecck-f1600 state to scalar registers * at the beginning and end of each round. */ @@ -81,7 +79,7 @@ round_constants: Aka .req x3 Ake .req x8 Aki .req x13 - Ako .req x18 + Ako .req x28 Aku .req x23 Ama .req x4 Ame .req x9 @@ -94,60 +92,58 @@ round_constants: Aso .req x20 Asu .req x25 - /* A_[y,2*x+3*y] = rot(A[x,y]) */ - Aba_ .req x30 - Abe_ .req x28 - Abi_ .req x11 - Abo_ .req x16 - Abu_ .req x21 - Aga_ .req x3 - Age_ .req x8 - Agi_ .req x12 - Ago_ .req x17 - Agu_ .req x22 - Aka_ .req x4 - Ake_ .req x9 - Aki_ .req x13 - Ako_ .req x18 - Aku_ .req x23 - Ama_ .req x5 - Ame_ .req x10 - Ami_ .req x14 - Amo_ .req x19 - Amu_ .req x24 - Asa_ .req x1 - Ase_ .req x6 - Asi_ .req x15 - Aso_ .req x20 - Asu_ .req x25 - - /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ - /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ - C0 .req x30 - E0 .req x29 - C1 .req x26 - E1 .req x0 - C2 .req x27 - E2 .req x26 - C3 .req x28 - E3 .req x27 - C4 .req x29 - E4 .req x28 - - tmp .req x0 - - - tmp0 .req x0 - tmp1 .req x29 - /************************ MACROS ****************************/ -#define STACK_SIZE (16*6 + 3*8 + 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) +#define STACK_LOCS 40 + +#define STACK_SIZE (16*6 + 3*8 + 8 + (STACK_LOCS) * 8) // GPRs (16*6), count (8), const (8), input (8), padding (8) #define STACK_BASE_GPRS (3*8+8) #define STACK_OFFSET_INPUT (0*8) #define STACK_OFFSET_CONST (1*8) #define STACK_OFFSET_COUNT (2*8) +#define STACK_OFFSET_LOCS (16*6 + 4*8) +#define STACK_LOC_0 ((STACK_OFFSET_LOCS) + 0*8) +#define STACK_LOC_1 ((STACK_OFFSET_LOCS) + 1*8) +#define STACK_LOC_2 ((STACK_OFFSET_LOCS) + 2*8) +#define STACK_LOC_3 ((STACK_OFFSET_LOCS) + 3*8) +#define STACK_LOC_4 ((STACK_OFFSET_LOCS) + 4*8) +#define STACK_LOC_5 ((STACK_OFFSET_LOCS) + 5*8) +#define STACK_LOC_6 ((STACK_OFFSET_LOCS) + 6*8) +#define STACK_LOC_7 ((STACK_OFFSET_LOCS) + 7*8) +#define STACK_LOC_8 ((STACK_OFFSET_LOCS) + 8*8) +#define STACK_LOC_9 ((STACK_OFFSET_LOCS) + 9*8) +#define STACK_LOC_10 ((STACK_OFFSET_LOCS) + 10*8) +#define STACK_LOC_11 ((STACK_OFFSET_LOCS) + 11*8) +#define STACK_LOC_12 ((STACK_OFFSET_LOCS) + 12*8) +#define STACK_LOC_13 ((STACK_OFFSET_LOCS) + 13*8) +#define STACK_LOC_14 ((STACK_OFFSET_LOCS) + 14*8) +#define STACK_LOC_15 ((STACK_OFFSET_LOCS) + 15*8) +#define STACK_LOC_16 ((STACK_OFFSET_LOCS) + 16*8) +#define STACK_LOC_17 ((STACK_OFFSET_LOCS) + 17*8) +#define STACK_LOC_18 ((STACK_OFFSET_LOCS) + 18*8) +#define STACK_LOC_19 ((STACK_OFFSET_LOCS) + 19*8) +#define STACK_LOC_20 ((STACK_OFFSET_LOCS) + 20*8) +#define STACK_LOC_21 ((STACK_OFFSET_LOCS) + 21*8) +#define STACK_LOC_22 ((STACK_OFFSET_LOCS) + 22*8) +#define STACK_LOC_23 ((STACK_OFFSET_LOCS) + 23*8) +#define STACK_LOC_24 ((STACK_OFFSET_LOCS) + 24*8) +#define STACK_LOC_25 ((STACK_OFFSET_LOCS) + 25*8) +#define STACK_LOC_26 ((STACK_OFFSET_LOCS) + 26*8) +#define STACK_LOC_27 ((STACK_OFFSET_LOCS) + 27*8) +#define STACK_LOC_28 ((STACK_OFFSET_LOCS) + 28*8) +#define STACK_LOC_29 ((STACK_OFFSET_LOCS) + 29*8) +#define STACK_LOC_30 ((STACK_OFFSET_LOCS) + 30*8) +#define STACK_LOC_31 ((STACK_OFFSET_LOCS) + 31*8) +#define STACK_LOC_32 ((STACK_OFFSET_LOCS) + 32*8) +#define STACK_LOC_33 ((STACK_OFFSET_LOCS) + 33*8) +#define STACK_LOC_34 ((STACK_OFFSET_LOCS) + 34*8) +#define STACK_LOC_35 ((STACK_OFFSET_LOCS) + 35*8) +#define STACK_LOC_36 ((STACK_OFFSET_LOCS) + 36*8) +#define STACK_LOC_37 ((STACK_OFFSET_LOCS) + 37*8) +#define STACK_LOC_38 ((STACK_OFFSET_LOCS) + 38*8) +#define STACK_LOC_39 ((STACK_OFFSET_LOCS) + 39*8) + .macro alloc_stack sub sp, sp, #(STACK_SIZE) .endm @@ -181,220 +177,186 @@ round_constants: eor \dst, \dst, \src4 .endm - - -.macro addparity prty, dst0, src0, dst1, src1, dst2, src2, dst3, src3, dst4, src4 - eor \dst0, \src0, \prty - eor \dst1, \src1, \prty - eor \dst2, \src2, \prty - eor \dst3, \src3, \prty - eor \dst4, \src4, \prty +.macro chi_step_ror out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), X, \a\(), ror #\r2 .endm - - - -.macro keccak_f1600_round_initial - eor5 C0, Ama, Asa, Aba, Aga, Aka - eor5 C1, Ame, Ase, Abe, Age, Ake - eor5 C2, Ami, Asi, Abi, Agi, Aki - eor5 C3, Amo, Aso, Abo, Ago, Ako - eor5 C4, Amu, Asu, Abu, Agu, Aku - - eor E1, C0, C2, ror #63 - eor E3, C2, C4, ror #63 - eor E0, C4, C1, ror #63 - eor E2, C1, C3, ror #63 - eor E4, C3, C0, ror #63 - - eor Aba_, Aba, E0 - eor Asa_, Abi, E2 - eor Abi_, Aki, E2 - eor Aki_, Ako, E3 - eor Ako_, Amu, E4 - eor Amu_, Aso, E3 - eor Aso_, Ama, E0 - eor Aka_, Abe, E1 - eor Ase_, Ago, E3 - eor Ago_, Ame, E1 - eor Ake_, Agi, E2 - eor Agi_, Aka, E0 - eor Aga_, Abo, E3 - eor Abo_, Amo, E3 - eor Amo_, Ami, E2 - eor Ami_, Ake, E1 - eor Age_, Agu, E4 - eor Agu_, Asi, E2 - eor Asi_, Aku, E4 - eor Aku_, Asa, E0 - eor Ama_, Abu, E4 - eor Abu_, Asu, E4 - eor Asu_, Ase, E1 - eor Ame_, Aga, E0 - eor Abe_, Age, E1 - - load_constant_ptr - - bic tmp0, Agi_, Age_, ror #47 - bic tmp1, Ago_, Agi_, ror #42 - eor Aga, tmp0, Aga_, ror #39 - bic tmp0, Agu_, Ago_, ror #16 - eor Age, tmp1, Age_, ror #25 - bic tmp1, Aga_, Agu_, ror #31 - eor Agi, tmp0, Agi_, ror #58 - bic tmp0, Age_, Aga_, ror #56 - eor Ago, tmp1, Ago_, ror #47 - bic tmp1, Aki_, Ake_, ror #19 - eor Agu, tmp0, Agu_, ror #23 - bic tmp0, Ako_, Aki_, ror #47 - eor Aka, tmp1, Aka_, ror #24 - bic tmp1, Aku_, Ako_, ror #10 - eor Ake, tmp0, Ake_, ror #2 - bic tmp0, Aka_, Aku_, ror #47 - eor Aki, tmp1, Aki_, ror #57 - bic tmp1, Ake_, Aka_, ror #5 - eor Ako, tmp0, Ako_, ror #57 - bic tmp0, Ami_, Ame_, ror #38 - eor Aku, tmp1, Aku_, ror #52 - bic tmp1, Amo_, Ami_, ror #5 - eor Ama, tmp0, Ama_, ror #47 - bic tmp0, Amu_, Amo_, ror #41 - eor Ame, tmp1, Ame_, ror #43 - bic tmp1, Ama_, Amu_, ror #35 - eor Ami, tmp0, Ami_, ror #46 - bic tmp0, Ame_, Ama_, ror #9 - - str const_addr, [sp, #(STACK_OFFSET_CONST)] - ldr cur_const, [const_addr] - - eor Amo, tmp1, Amo_, ror #12 - bic tmp1, Asi_, Ase_, ror #48 - eor Amu, tmp0, Amu_, ror #44 - bic tmp0, Aso_, Asi_, ror #2 - eor Asa, tmp1, Asa_, ror #41 - bic tmp1, Asu_, Aso_, ror #25 - eor Ase, tmp0, Ase_, ror #50 - bic tmp0, Asa_, Asu_, ror #60 - eor Asi, tmp1, Asi_, ror #27 - bic tmp1, Ase_, Asa_, ror #57 - eor Aso, tmp0, Aso_, ror #21 - - mov count, #1 - - bic tmp0, Abi_, Abe_, ror #63 - eor Asu, tmp1, Asu_, ror #53 - bic tmp1, Abo_, Abi_, ror #42 - eor Aba, Aba_, tmp0, ror #21 - bic tmp0, Abu_, Abo_, ror #57 - eor Abe, tmp1, Abe_, ror #41 - bic tmp1, Aba_, Abu_, ror #50 - eor Abi, tmp0, Abi_, ror #35 - bic tmp0, Abe_, Aba_, ror #44 - eor Abo, tmp1, Abo_, ror #43 - eor Abu, tmp0, Abu_, ror #30 - - eor Aba, Aba, cur_const - str count, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT - +.macro chi_step_ror2 out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), \a\(), X, ror #\r2 .endm -.macro eor5ror dst, src0, src1, rot1, src2, rot2, src3, rot3, src4, rot4 - eor \dst, \src0, \src1, ror \rot1 - eor \dst, \dst, \src2, ror \rot2 - eor \dst, \dst, \src3, ror \rot3 - eor \dst, \dst, \src4, ror \rot4 -.endm +.macro keccak_f1600_round_initial + eor5 X, Ama, Asa, Aba, Aga, Aka + eor5 X, Ame, Ase, Abe, Age, Ake + eor5 X, Ami, Asi, Abi, Agi, Aki + eor5 X, Amo, Aso, Abo, Ago, Ako + eor5 X, Amu, Asu, Abu, Agu, Aku + + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + + eor X, Aba, X + eor X, Abi, X + eor X, Aki, X + eor X, Ako, X + eor X, Amu, X + eor X, Aso, X + eor X, Ama, X + eor X, Abe, X + eor X, Ago, X + eor X, Ame, X + eor X, Agi, X + eor X, Aka, X + eor X, Abo, X + eor X, Amo, X + eor X, Ami, X + eor X, Ake, X + eor X, Agu, X + eor X, Asi, X + eor X, Aku, X + eor X, Asa, X + eor X, Abu, X + eor X, Asu, X + eor X, Ase, X + eor X, Aga, X + eor X, Age, X + + ldr X, [sp, #STACK_OFFSET_CONST] + ldr X, [X] + mov X, #1 + str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + chi_step_ror Aga, X, X, X, 47, 39 + chi_step_ror Age, X, X, X, 42, 25 + chi_step_ror Agi, X, X, X, 16, 58 + chi_step_ror Ago, X, X, X, 31, 47 + chi_step_ror Agu, X, X, X, 56, 23 + chi_step_ror Aka, X, X, X, 19, 24 + chi_step_ror Ake, X, X, X, 47, 2 + chi_step_ror Aki, X, X, X, 10, 57 + chi_step_ror Ako, X, X, X, 47, 57 + chi_step_ror Aku, X, X, X, 5, 52 + chi_step_ror Ama, X, X, X, 38, 47 + chi_step_ror Ame, X, X, X, 5, 43 + chi_step_ror Ami, X, X, X, 41, 46 + chi_step_ror Amo, X, X, X, 35, 12 + chi_step_ror Amu, X, X, X, 9, 44 + chi_step_ror Asa, X, X, X, 48, 41 + chi_step_ror Ase, X, X, X, 2, 50 + chi_step_ror Asi, X, X, X, 25, 27 + chi_step_ror Aso, X, X, X, 60, 21 + chi_step_ror Asu, X, X, X, 57, 53 + chi_step_ror2 Aba, X, X, X, 63, 21 + chi_step_ror Abe, X, X, X, 42, 41 + chi_step_ror Abi, X, X, X, 57, 35 + chi_step_ror Abo, X, X, X, 50, 43 + chi_step_ror Abu, X, X, X, 44, 30 + + eor Aba, Aba, X -.macro addparityror prty, dst0, src0, rot0, dst1, src1, rot1, dst2, src2, rot2, dst3, src3, rot3, dst4, src4, rot4 - eor \dst0, \prty, \src0, ror \rot0 - eor \dst1, \prty, \src1, ror \rot1 - eor \dst2, \prty, \src2, ror \rot2 - eor \dst3, \prty, \src3, ror \rot3 - eor \dst4, \prty, \src4, ror \rot4 .endm .macro keccak_f1600_round_noninitial - eor5ror C0, Aba, Aga, #61, Ama, #54, Aka, #39, Asa, #25 - eor5ror C1, Ake, Ame, #57, Abe, #51, Ase, #31, Age, #27 - eor5ror C2, Asi, Abi, #52, Aki, #48, Ami, #10, Agi, #5 - eor5ror C3, Abo, Ako, #63, Amo, #37, Ago, #36, Aso, #2 - eor5ror C4, Aku, Agu, #50, Amu, #34, Abu, #26, Asu, #15 - - eor E1, C0, C2, ror #61 - ror C2, C2, #62 - eor E3, C2, C4, ror #57 - ror C4, C4, #58 - eor E0, C4, C1, ror #55 - ror C1, C1, #56 - eor E2, C1, C3, ror #63 - eor E4, C3, C0, ror #63 - - addparityror E0, X, Aba, #0, X, Ama, #54, X, Aka, #39, X, Asa, #25, X, Aga, #61 - addparityror E1, X, Abe, #43, X, Ame, #49, X, Ake, #56, X, Ase, #23, X, Age, #19 - addparityror E2, X, Abi, #50, X, Aki, #46, X, Agi, #3, X, Ami, #8, X, Asi, #62 - addparityror E3, X, Ako, #63, X, Aso, #2, X, Ago, #36, X, Abo, #0, X, Amo, #37 - addparityror E4, X, Amu, #28, X, Agu, #44, X, Aku, #58, X, Abu, #20, X, Asu, #9 - - load_constant_ptr_stack - ldr count, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT - - bic tmp0, X, X, ror #47 - bic tmp1, X, X, ror #42 - eor Aga, tmp0, X, ror #39 - bic tmp0, X, X, ror #16 - eor Age, tmp1, X, ror #25 - bic tmp1, X, X, ror #31 - eor Agi, tmp0, X, ror #58 - bic tmp0, X, X, ror #56 - eor Ago, tmp1, X, ror #47 - bic tmp1, X, X, ror #19 - eor Agu, tmp0, X, ror #23 - bic tmp0, X, X, ror #47 - eor Aka, tmp1, X, ror #24 - bic tmp1, X, X, ror #10 - eor Ake, tmp0, X, ror #2 - bic tmp0, X, X, ror #47 - eor Aki, tmp1, X, ror #57 - bic tmp1, X, X, ror #5 - eor Ako, tmp0, X, ror #57 - bic tmp0, X, X, ror #38 - eor Aku, tmp1, X, ror #52 - bic tmp1, X, X, ror #5 - eor Ama, tmp0, X, ror #47 - bic tmp0, X, X, ror #41 - eor Ame, tmp1, X, ror #43 - bic tmp1, X, X, ror #35 - eor Ami, tmp0, X, ror #46 - bic tmp0, X, X, ror #9 - - ldr cur_const, [const_addr, count, UXTW #3] - - eor Amo, tmp1, X, ror #12 - bic tmp1, X, X, ror #48 - eor Amu, tmp0, X, ror #44 - bic tmp0, X, X, ror #2 - eor Asa, tmp1, X, ror #41 - bic tmp1, X, X, ror #25 - eor Ase, tmp0, X, ror #50 - bic tmp0, X, X, ror #60 - eor Asi, tmp1, X, ror #27 - bic tmp1, X, X, ror #57 - eor Aso, tmp0, X, ror #21 - bic tmp0, X, X, ror #63 - add count, count, #1 - str count, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT - eor Asu, tmp1, X, ror #53 - bic tmp1, X, X, ror #42 - eor Aba, X, tmp0, ror #21 - bic tmp0, X, X, ror #57 - eor Abe, tmp1, X, ror #41 - bic tmp1, X, X, ror #50 - eor Abi, tmp0, X, ror #35 - bic tmp0, X, X, ror #44 - - eor Abo, tmp1, X, ror #43 - eor Abu, tmp0, X, ror #30 - eor Aba, Aba, cur_const + + eor X, Aba, Aga, ror #61 + eor X, X, Ama, ror #54 + eor X, X, Aka, ror #39 + eor X, X, Asa, ror #25 + + eor X, Ake, Ame, ror #57 + eor X, X, Abe, ror #51 + eor X, X, Ase, ror #31 + eor X, X, Age, ror #27 + + eor X, Asi, Abi, ror #52 + eor X, X, Aki, ror #48 + eor X, X, Ami, ror #10 + eor X, X, Agi, ror #5 + + eor X, Abo, Ako, ror #63 + eor X, X, Amo, ror #37 + eor X, X, Ago, ror #36 + eor X, X, Aso, ror #2 + + eor X, Aku, Agu, ror #50 + eor X, X, Amu, ror #34 + eor X, X, Abu, ror #26 + eor X, X, Asu, ror #15 + + eor X, X, X, ror #61 + ror X, X, #62 + eor X, X, X, ror #57 + ror X, X, #58 + eor X, X, X, ror #55 + ror X, X, #56 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + + eor X, X, Aba + eor X, X, Abi, ror #50 + eor X, X, Aki, ror #46 + eor X, X, Ako, ror #63 + eor X, X, Amu, ror #28 + eor X, X, Aso, ror #2 + eor X, X, Ama, ror #54 + eor X, X, Abe, ror #43 + eor X, X, Ago, ror #36 + eor X, X, Ame, ror #49 + eor X, X, Agi, ror #3 + eor X, X, Aka, ror #39 + eor X, X, Abo + eor X, X, Amo, ror #37 + eor X, X, Ami, ror #8 + eor X, X, Ake, ror #56 + eor X, X, Agu, ror #44 + eor X, X, Asi, ror #62 + eor X, X, Aku, ror #58 + eor X, X, Asa, ror #25 + eor X, X, Abu, ror #20 + eor X, X, Asu, ror #9 + eor X, X, Ase, ror #23 + eor X, X, Aga, ror #61 + eor X, X, Age, ror #19 + + ldr X, [sp, #STACK_OFFSET_CONST] + ldr X, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT + ldr X, [X, W, UXTW #3] + add X, X, #1 + cmp X, #(KECCAK_F1600_ROUNDS-1) + str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + chi_step_ror Aga, X, X, X, 47, 39 + chi_step_ror Age, X, X, X, 42, 25 + chi_step_ror Agi, X, X, X, 16, 58 + chi_step_ror Ago, X, X, X, 31, 47 + chi_step_ror Agu, X, X, X, 56, 23 + chi_step_ror Aka, X, X, X, 19, 24 + chi_step_ror Ake, X, X, X, 47, 2 + chi_step_ror Aki, X, X, X, 10, 57 + chi_step_ror Ako, X, X, X, 47, 57 + chi_step_ror Aku, X, X, X, 5, 52 + chi_step_ror Ama, X, X, X, 38, 47 + chi_step_ror Ame, X, X, X, 5, 43 + chi_step_ror Ami, X, X, X, 41, 46 + chi_step_ror Amo, X, X, X, 35, 12 + chi_step_ror Amu, X, X, X, 9, 44 + chi_step_ror Asa, X, X, X, 48, 41 + chi_step_ror Ase, X, X, X, 2, 50 + chi_step_ror Asi, X, X, X, 25, 27 + chi_step_ror Aso, X, X, X, 60, 21 + chi_step_ror Asu, X, X, X, 57, 53 + chi_step_ror2 Aba, X, X, X, 63, 21 + chi_step_ror Abe, X, X, X, 42, 41 + chi_step_ror Abi, X, X, X, 57, 35 + chi_step_ror Abo, X, X, X, 50, 43 + chi_step_ror Abu, X, X, X, 44, 30 + + eor Aba, Aba, X .endm .macro load_state @@ -462,28 +424,32 @@ round_constants: .global keccak_f1600_x1_scalar_slothy .global _keccak_f1600_x1_scalar_slothy -.macro load_constant_ptr_stack - ldr const_addr, [sp, #(STACK_OFFSET_CONST)] -.endm keccak_f1600_x1_scalar_slothy: _keccak_f1600_x1_scalar_slothy: alloc_stack save_gprs initial: + load_constant_ptr + str const_addr, [sp, #STACK_OFFSET_CONST] load_state str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT - keccak_f1600_round_initial + +initial_round_start: + keccak_f1600_round_initial +initial_round_end: + loop: keccak_f1600_round_noninitial end_loop: - cmp count, #(KECCAK_F1600_ROUNDS-1) ble loop + final: final_rotate ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT store_state end_final: + restore_gprs free_stack ret diff --git a/examples/opt/aarch64/keccak_f1600_x1_scalar_slothy_a55_opt_a55.s b/examples/opt/aarch64/keccak_f1600_x1_scalar_slothy_a55_opt_a55.s index 3c053b85..0a1bf7d7 100644 --- a/examples/opt/aarch64/keccak_f1600_x1_scalar_slothy_a55_opt_a55.s +++ b/examples/opt/aarch64/keccak_f1600_x1_scalar_slothy_a55_opt_a55.s @@ -136,7 +136,6 @@ round_constants: tmp .req x0 - tmp0 .req x0 tmp1 .req x29 @@ -174,41 +173,32 @@ round_constants: ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] .endm +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + + + +.macro addparity prty, dst0, src0, dst1, src1, dst2, src2, dst3, src3, dst4, src4 + eor \dst0, \src0, \prty + eor \dst1, \src1, \prty + eor \dst2, \src2, \prty + eor \dst3, \src3, \prty + eor \dst4, \src4, \prty +.endm + + + + .macro keccak_f1600_round_initial - ldp Aku, Ama, [input_addr, #(1*8*14)] - ldp Asa, Ase, [input_addr, #(1*8*20)] - eor C0, Ama, Asa - ldp Ame, Ami, [input_addr, #(1*8*16)] - eor C1, Ame, Ase - ldp Asi, Aso, [input_addr, #(1*8*22)] - eor C2, Ami, Asi - ldp Amo, Amu, [input_addr, #(1*8*18)] - eor C3, Amo, Aso - ldr Asu, [input_addr, #(1*8*24)] - eor C4, Amu, Asu - ldp Aka, Ake, [input_addr, #(1*8*10)] - eor C0, Aka, C0 - eor C1, Ake, C1 - ldp Aki, Ako, [input_addr, #(1*8*12)] - eor C2, Aki, C2 - ldp Abu, Aga, [input_addr, #(1*8*4)] - eor C3, Ako, C3 - eor C4, Aku, C4 - ldp Age, Agi, [input_addr, #(1*8*6)] - eor C0, Aga, C0 - ldp Ago, Agu, [input_addr, #(1*8*8)] - eor C1, Age, C1 - ldp Aba, Abe, [input_addr, #(1*8*0)] - eor C2, Agi, C2 - ldp Abi, Abo, [input_addr, #(1*8*2)] - eor C3, Ago, C3 - str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT - eor C4, Agu, C4 - eor C0, Aba, C0 - eor C1, Abe, C1 - eor C2, Abi, C2 - eor C3, Abo, C3 - eor C4, Abu, C4 + eor5 C0, Ama, Asa, Aba, Aga, Aka + eor5 C1, Ame, Ase, Abe, Age, Ake + eor5 C2, Ami, Asi, Abi, Agi, Aki + eor5 C3, Amo, Aso, Abo, Ago, Ako + eor5 C4, Amu, Asu, Abu, Agu, Aku eor E1, C0, C2, ror #63 eor E3, C2, C4, ror #63 @@ -307,167 +297,180 @@ round_constants: .endm +.macro eor5ror dst, src0, src1, rot1, src2, rot2, src3, rot3, src4, rot4 + eor \dst, \src0, \src1, ror \rot1 + eor \dst, \dst, \src2, ror \rot2 + eor \dst, \dst, \src3, ror \rot3 + eor \dst, \dst, \src4, ror \rot4 +.endm -.macro keccak_f1600_round_noninitial +.macro addparityror prty, dst0, src0, rot0, dst1, src1, rot1, dst2, src2, rot2, dst3, src3, rot3, dst4, src4, rot4 + eor \dst0, \prty, \src0, ror \rot0 + eor \dst1, \prty, \src1, ror \rot1 + eor \dst2, \prty, \src2, ror \rot2 + eor \dst3, \prty, \src3, ror \rot3 + eor \dst4, \prty, \src4, ror \rot4 +.endm - eor C2, Asi, Abi, ror #52 - eor C0, Aba, Aga, ror #61 - eor C4, Aku, Agu, ror #50 - eor C1, Ake, Ame, ror #57 - eor C3, Abo, Ako, ror #63 - eor C2, C2, Aki, ror #48 - eor C0, C0, Ama, ror #54 - eor C4, C4, Amu, ror #34 - eor C1, C1, Abe, ror #51 - eor C3, C3, Amo, ror #37 - eor C2, C2, Ami, ror #10 - eor C0, C0, Aka, ror #39 - eor C4, C4, Abu, ror #26 - eor C1, C1, Ase, ror #31 - eor C3, C3, Ago, ror #36 - eor C2, C2, Agi, ror #5 - eor C0, C0, Asa, ror #25 - eor C4, C4, Asu, ror #15 - eor C1, C1, Age, ror #27 - eor C3, C3, Aso, ror #2 - - eor E1, C0, C2, ror #61 - ror C2, C2, #62 - eor E3, C2, C4, ror #57 - ror C4, C4, #58 - eor E0, C4, C1, ror #55 - ror C1, C1, #56 - eor E2, C1, C3, ror #63 - eor E4, C3, C0, ror #63 +.macro chi_step_ror out, a, b, c, r1, r2 + bic X, \c\(), \b\(), ror #\r1 + eor \out\(), X, \a\(), ror #\r2 +.endm - eor Aba_, E0, Aba - eor Asa_, E2, Abi, ror #50 - eor Abi_, E2, Aki, ror #46 - eor Aki_, E3, Ako, ror #63 - eor Ako_, E4, Amu, ror #28 - eor Amu_, E3, Aso, ror #2 - eor Aso_, E0, Ama, ror #54 - eor Aka_, E1, Abe, ror #43 - eor Ase_, E3, Ago, ror #36 - eor Ago_, E1, Ame, ror #49 - eor Ake_, E2, Agi, ror #3 - eor Agi_, E0, Aka, ror #39 - eor Aga_, E3, Abo - eor Abo_, E3, Amo, ror #37 - eor Amo_, E2, Ami, ror #8 - eor Ami_, E1, Ake, ror #56 - eor Age_, E4, Agu, ror #44 - eor Agu_, E2, Asi, ror #62 - eor Asi_, E4, Aku, ror #58 - eor Aku_, E0, Asa, ror #25 - eor Ama_, E4, Abu, ror #20 - eor Abu_, E4, Asu, ror #9 - eor Asu_, E1, Ase, ror #23 - eor Ame_, E0, Aga, ror #61 - eor Abe_, E1, Age, ror #19 +.macro keccak_f1600_round_noninitial + + eor X, Aba, Aga, ror #61 + eor X, X, Ama, ror #54 + eor X, X, Aka, ror #39 + eor X, X, Asa, ror #25 + eor X, Ake, Ame, ror #57 + eor X, X, Abe, ror #51 + eor X, X, Ase, ror #31 + eor X, X, Age, ror #27 + eor X, Asi, Abi, ror #52 + eor X, X, Aki, ror #48 + eor X, X, Ami, ror #10 + eor X, X, Agi, ror #5 + eor X, Abo, Ako, ror #63 + eor X, X, Amo, ror #37 + eor X, X, Ago, ror #36 + eor X, X, Aso, ror #2 + eor X, Aku, Agu, ror #50 + eor X, X, Amu, ror #34 + eor X, X, Abu, ror #26 + eor X, X, Asu, ror #15 + + eor X, X, X, ror #61 + ror X, X, #62 + eor X, X, X, ror #57 + ror X, X, #58 + eor X, X, X, ror #55 + ror X, X, #56 + eor X, X, X, ror #63 + eor X, X, X, ror #63 + + str Age, [sp, #16] // @slothy:writes=Age + str Aga, [sp, #24] // @slothy:writes=Aga + ldr Aga, [sp, #24] // @slothy:reads=Aga + ldr Age, [sp, #16] // @slothy:reads=Age + + eor Aba_, X, Aba + eor Asa_, X, Abi, ror #50 + eor Abi_, X, Aki, ror #46 + eor Aki_, X, Ako, ror #63 + eor Ako_, X, Amu, ror #28 + eor Amu_, X, Aso, ror #2 + eor Aso_, X, Ama, ror #54 + eor Aka_, X, Abe, ror #43 + eor Ase_, X, Ago, ror #36 + eor Ago_, X, Ame, ror #49 + eor Ake_, X, Agi, ror #3 + eor Agi_, X, Aka, ror #39 + eor Aga_, X, Abo + eor Abo_, X, Amo, ror #37 + eor Amo_, X, Ami, ror #8 + eor Ami_, X, Ake, ror #56 + eor Age_, X, Agu, ror #44 + eor Agu_, X, Asi, ror #62 + eor Asi_, X, Aku, ror #58 + eor Aku_, X, Asa, ror #25 + eor Ama_, X, Abu, ror #20 + eor Abu_, X, Asu, ror #9 + eor Asu_, X, Ase, ror #23 + eor Ame_, X, Aga, ror #61 + eor Abe_, X, Age, ror #19 load_constant_ptr_stack ldr count, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT - - bic tmp0, Agi_, Age_, ror #47 - bic tmp1, Ago_, Agi_, ror #42 - eor Aga, tmp0, Aga_, ror #39 - bic tmp0, Agu_, Ago_, ror #16 - eor Age, tmp1, Age_, ror #25 - bic tmp1, Aga_, Agu_, ror #31 - eor Agi, tmp0, Agi_, ror #58 - bic tmp0, Age_, Aga_, ror #56 - eor Ago, tmp1, Ago_, ror #47 - bic tmp1, Aki_, Ake_, ror #19 - eor Agu, tmp0, Agu_, ror #23 - bic tmp0, Ako_, Aki_, ror #47 - eor Aka, tmp1, Aka_, ror #24 - bic tmp1, Aku_, Ako_, ror #10 - eor Ake, tmp0, Ake_, ror #2 - bic tmp0, Aka_, Aku_, ror #47 - eor Aki, tmp1, Aki_, ror #57 - bic tmp1, Ake_, Aka_, ror #5 - eor Ako, tmp0, Ako_, ror #57 - bic tmp0, Ami_, Ame_, ror #38 - eor Aku, tmp1, Aku_, ror #52 - bic tmp1, Amo_, Ami_, ror #5 - eor Ama, tmp0, Ama_, ror #47 - bic tmp0, Amu_, Amo_, ror #41 - eor Ame, tmp1, Ame_, ror #43 - bic tmp1, Ama_, Amu_, ror #35 - eor Ami, tmp0, Ami_, ror #46 - bic tmp0, Ame_, Ama_, ror #9 - ldr cur_const, [const_addr, count, UXTW #3] - - eor Amo, tmp1, Amo_, ror #12 - bic tmp1, Asi_, Ase_, ror #48 - eor Amu, tmp0, Amu_, ror #44 - bic tmp0, Aso_, Asi_, ror #2 - eor Asa, tmp1, Asa_, ror #41 - bic tmp1, Asu_, Aso_, ror #25 - eor Ase, tmp0, Ase_, ror #50 - bic tmp0, Asa_, Asu_, ror #60 - eor Asi, tmp1, Asi_, ror #27 - bic tmp1, Ase_, Asa_, ror #57 - eor Aso, tmp0, Aso_, ror #21 - bic tmp0, Abi_, Abe_, ror #63 add count, count, #1 str count, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT - eor Asu, tmp1, Asu_, ror #53 - bic tmp1, Abo_, Abi_, ror #42 - eor Aba, Aba_, tmp0, ror #21 - bic tmp0, Abu_, Abo_, ror #57 - eor Abe, tmp1, Abe_, ror #41 - bic tmp1, Aba_, Abu_, ror #50 - eor Abi, tmp0, Abi_, ror #35 - bic tmp0, Abe_, Aba_, ror #44 - eor Abo, tmp1, Abo_, ror #43 - eor Abu, tmp0, Abu_, ror #30 - eor Aba, Aba, cur_const + chi_step_ror Aga, Aga_, Agi_, Age_, 47, 39 + chi_step_ror Age, Age_, Ago_, Agi_, 42, 25 + chi_step_ror Agi, Agi_, Agu_, Ago_, 16, 58 + chi_step_ror Ago, Ago_, Aga_, Agu_, 31, 47 + chi_step_ror Agu, Agu_, Age_, Aga_, 56, 23 + chi_step_ror Aka, Aka_, Aki_, Ake_, 19, 24 + chi_step_ror Ake, Ake_, Ako_, Aki_, 47, 2 + chi_step_ror Aki, Aki_, Aku_, Ako_, 10, 57 + chi_step_ror Ako, Ako_, Aka_, Aku_, 47, 57 + chi_step_ror Aku, Aku_, Ake_, Aka_, 5, 52 + chi_step_ror Ama, Ama_, Ami_, Ame_, 38, 47 + chi_step_ror Ame, Ame_, Amo_, Ami_, 5, 43 + chi_step_ror Ami, Ami_, Amu_, Amo_, 41, 46 + chi_step_ror Amo, Amo_, Ama_, Amu_, 35, 12 + chi_step_ror Amu, Amu_, Ame_, Ama_, 9, 44 + chi_step_ror Asa, Asa_, Asi_, Ase_, 48, 41 + chi_step_ror Ase, Ase_, Aso_, Asi_, 2, 50 + chi_step_ror Asi, Asi_, Asu_, Aso_, 25, 27 + chi_step_ror Aso, Aso_, Asa_, Asu_, 60, 21 + chi_step_ror Asu, Asu_, Ase_, Asa_, 57, 53 + chi_step_ror Aba, Aba_, Abi_, Abe_, 63, 21 + chi_step_ror Abe, Abe_, Abo_, Abi_, 42, 41 + chi_step_ror Abi, Abi_, Abu_, Abo_, 57, 35 + chi_step_ror Abo, Abo_, Aba_, Abu_, 50, 43 + chi_step_ror Abu, Abu_, Abe_, Aba_, 44, 30 + eor Aba, Aba, cur_const .endm -.macro final_rotate_store - ror Aga, Aga,#(64-3) - ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT +.macro load_state + ldp Aba, Abe, [input_addr, #(1*8*0)] + ldp Abi, Abo, [input_addr, #(1*8*2)] + ldp Abu, Aga, [input_addr, #(1*8*4)] + ldp Age, Agi, [input_addr, #(1*8*6)] + ldp Ago, Agu, [input_addr, #(1*8*8)] + ldp Aka, Ake, [input_addr, #(1*8*10)] + ldp Aki, Ako, [input_addr, #(1*8*12)] + ldp Aku, Ama, [input_addr, #(1*8*14)] + ldp Ame, Ami, [input_addr, #(1*8*16)] + ldp Amo, Amu, [input_addr, #(1*8*18)] + ldp Asa, Ase, [input_addr, #(1*8*20)] + ldp Asi, Aso, [input_addr, #(1*8*22)] + ldr Asu, [input_addr, #(1*8*24)] +.endm - ror Abu, Abu,#(64-44) - ror Aka, Aka,#(64-25) - ror Ake, Ake,#(64-8) +.macro store_state + stp Aba, Abe, [input_addr, #(1*8*0)] + stp Abi, Abo, [input_addr, #(1*8*2)] stp Abu, Aga, [input_addr, #(1*8*4)] - ror Ama, Ama,#(64-10) - ror Aku, Aku,#(64-6) + stp Age, Agi, [input_addr, #(1*8*6)] + stp Ago, Agu, [input_addr, #(1*8*8)] stp Aka, Ake, [input_addr, #(1*8*10)] - ror Asa, Asa,#(64-39) - ror Ase, Ase,#(64-41) + stp Aki, Ako, [input_addr, #(1*8*12)] stp Aku, Ama, [input_addr, #(1*8*14)] + stp Ame, Ami, [input_addr, #(1*8*16)] + stp Amo, Amu, [input_addr, #(1*8*18)] + stp Asa, Ase, [input_addr, #(1*8*20)] + stp Asi, Aso, [input_addr, #(1*8*22)] + str Asu, [input_addr, #(1*8*24)] +.endm + +.macro final_rotate ror Abe, Abe,#(64-21) + ror Abi, Abi,#(64-14) + ror Abu, Abu,#(64-44) + ror Aga, Aga,#(64-3) ror Age, Age,#(64-45) - stp Asa, Ase, [input_addr, #(1*8*20)] ror Agi, Agi,#(64-61) - stp Aba, Abe, [input_addr, #(1*8*0)] - ror Ame, Ame,#(64-15) - ror Ami, Ami,#(64-56) - stp Age, Agi, [input_addr, #(1*8*6)] - ror Abi, Abi,#(64-14) - ror Aki, Aki,#(64-18) - stp Ame, Ami, [input_addr, #(1*8*16)] - ror Ako, Ako,#(64-1) - stp Abi, Abo, [input_addr, #(1*8*2)] - ror Asi, Asi,#(64-2) - ror Aso, Aso,#(64-62) - stp Aki, Ako, [input_addr, #(1*8*12)] ror Ago, Ago,#(64-28) ror Agu, Agu,#(64-20) - stp Asi, Aso, [input_addr, #(1*8*22)] + ror Aka, Aka,#(64-25) + ror Ake, Ake,#(64-8) + ror Aki, Aki,#(64-18) + ror Ako, Ako,#(64-1) + ror Aku, Aku,#(64-6) + ror Ama, Ama,#(64-10) + ror Ame, Ame,#(64-15) + ror Ami, Ami,#(64-56) ror Amo, Amo,#(64-27) ror Amu, Amu,#(64-36) - stp Ago, Agu, [input_addr, #(1*8*8)] + ror Asa, Asa,#(64-39) + ror Ase, Ase,#(64-41) + ror Asi, Asi,#(64-2) + ror Aso, Aso,#(64-62) ror Asu, Asu,#(64-55) - stp Amo, Amu, [input_addr, #(1*8*18)] - str Asu, [input_addr, #(1*8*24)] .endm #define KECCAK_F1600_ROUNDS 24 @@ -485,244 +488,264 @@ _keccak_f1600_x1_scalar_slothy_opt_a55: alloc_stack save_gprs +initial: + load_state + str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT keccak_f1600_round_initial loop: - // Instructions: 109 - // Expected cycles: 54 - // Expected IPC: 2.02 + // Instructions: 113 + // Expected cycles: 57 + // Expected IPC: 1.98 // - // -------------------------------------------- original position ---------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|-------- - eor x30, x15, x11, ror #52 // *............................................................................................................ - eor x28, x1, x2, ror #61 // .*........................................................................................................... - eor x29, x23, x22, ror #50 // ..*.......................................................................................................... - eor x0, x8, x9, ror #57 // ...*......................................................................................................... - eor x27, x16, x18, ror #63 // ....*........................................................................................................ - eor x30, x30, x13, ror #48 // .....*....................................................................................................... - eor x26, x29, x24, ror #34 // .......*..................................................................................................... - eor x0, x0, x6, ror #51 // ........*.................................................................................................... - eor x29, x27, x19, ror #37 // .........*................................................................................................... - eor x27, x30, x14, ror #10 // ..........*.................................................................................................. - eor x26, x26, x21, ror #26 // ............*................................................................................................ - eor x0, x0, x10, ror #31 // .............*............................................................................................... - eor x30, x29, x17, ror #36 // ..............*.............................................................................................. - eor x28, x28, x4, ror #54 // ......*...................................................................................................... - eor x27, x27, x12, ror #5 // ...............*............................................................................................. - eor x26, x26, x25, ror #15 // .................*........................................................................................... - eor x29, x0, x7, ror #27 // ..................*.......................................................................................... - eor x0, x28, x3, ror #39 // ...........*................................................................................................. - ror x28, x27, #62 // .....................*....................................................................................... - eor x30, x30, x20, ror #2 // ...................*......................................................................................... - eor x28, x28, x26, ror #57 // ......................*...................................................................................... - eor x0, x0, x5, ror #25 // ................*............................................................................................ - ror x26, x26, #58 // .......................*..................................................................................... - eor x26, x26, x29, ror #55 // ........................*.................................................................................... - ror x29, x29, #56 // .........................*................................................................................... - eor x27, x0, x27, ror #61 // ....................*........................................................................................ - eor x29, x29, x30, ror #63 // ..........................*.................................................................................. - eor x0, x30, x0, ror #63 // ...........................*................................................................................. - eor x30, x26, x1 // ............................*................................................................................ - eor x1, x29, x11, ror #50 // .............................*............................................................................... - eor x11, x29, x13, ror #46 // ..............................*.............................................................................. - eor x13, x28, x18, ror #63 // ...............................*............................................................................. - eor x18, x0, x24, ror #28 // ................................*............................................................................ - eor x24, x28, x20, ror #2 // .................................*........................................................................... - eor x20, x26, x4, ror #54 // ..................................*.......................................................................... - eor x4, x27, x6, ror #43 // ...................................*......................................................................... - eor x6, x28, x17, ror #36 // ....................................*........................................................................ - eor x17, x27, x9, ror #49 // .....................................*....................................................................... - eor x9, x29, x12, ror #3 // ......................................*...................................................................... - eor x12, x26, x3, ror #39 // .......................................*..................................................................... - eor x3, x28, x16 // ........................................*.................................................................... - eor x16, x28, x19, ror #37 // .........................................*................................................................... - eor x19, x29, x14, ror #8 // ..........................................*.................................................................. - eor x14, x27, x8, ror #56 // ...........................................*................................................................. - eor x8, x0, x22, ror #44 // ............................................*................................................................ - eor x22, x29, x15, ror #62 // .............................................*............................................................... - eor x15, x0, x23, ror #58 // ..............................................*.............................................................. - eor x23, x26, x5, ror #25 // ...............................................*............................................................. - eor x29, x0, x21, ror #20 // ................................................*............................................................ - eor x21, x0, x25, ror #9 // .................................................*........................................................... - ldr x5, [sp, #STACK_OFFSET_CONST] // .....................................................*....................................................... - eor x25, x27, x10, ror #23 // ..................................................*.......................................................... - eor x26, x26, x2, ror #61 // ...................................................*......................................................... - eor x28, x27, x7, ror #19 // ....................................................*........................................................ - bic x7, x12, x8, ror #47 // .......................................................*..................................................... - bic x10, x22, x17, ror #16 // ..........................................................*.................................................. - bic x0, x17, x12, ror #42 // ........................................................*.................................................... - ldr w27, [sp, #STACK_OFFSET_COUNT] // ......................................................*...................................................... // @slothy:reads=STACK_OFFSET_COUNT - eor x2, x7, x3, ror #39 // .........................................................*................................................... - eor x7, x0, x8, ror #25 // ...........................................................*................................................. - bic x0, x3, x22, ror #31 // ............................................................*................................................ - bic x8, x8, x3, ror #56 // ..............................................................*.............................................. - eor x12, x10, x12, ror #58 // .............................................................*............................................... - eor x17, x0, x17, ror #47 // ...............................................................*............................................. - bic x3, x13, x9, ror #19 // ................................................................*............................................ - eor x22, x8, x22, ror #23 // .................................................................*........................................... - bic x8, x18, x13, ror #47 // ..................................................................*.......................................... - bic x0, x23, x18, ror #10 // ....................................................................*........................................ - bic x10, x4, x23, ror #47 // ......................................................................*...................................... - eor x3, x3, x4, ror #24 // ...................................................................*......................................... - eor x8, x8, x9, ror #2 // .....................................................................*....................................... - eor x13, x0, x13, ror #57 // .......................................................................*..................................... - bic x0, x9, x4, ror #5 // ........................................................................*.................................... - eor x18, x10, x18, ror #57 // .........................................................................*................................... - bic x4, x14, x26, ror #38 // ..........................................................................*.................................. - eor x23, x0, x23, ror #52 // ...........................................................................*................................. - bic x9, x19, x14, ror #5 // ............................................................................*................................ - eor x4, x4, x29, ror #47 // .............................................................................*............................... - bic x0, x24, x19, ror #41 // ..............................................................................*.............................. - eor x9, x9, x26, ror #43 // ...............................................................................*............................. - bic x10, x29, x24, ror #35 // ................................................................................*............................ - eor x14, x0, x14, ror #46 // .................................................................................*........................... - bic x0, x26, x29, ror #9 // ..................................................................................*.......................... - ldr x26, [x5, w27, UXTW #3] // ...................................................................................*......................... - bic x29, x25, x20, ror #25 // .........................................................................................*................... - bic x5, x15, x6, ror #48 // .....................................................................................*....................... - eor x19, x10, x19, ror #12 // ....................................................................................*........................ - eor x24, x0, x24, ror #44 // ......................................................................................*...................... - bic x10, x20, x15, ror #2 // .......................................................................................*..................... - bic x0, x1, x25, ror #60 // ...........................................................................................*................. - eor x5, x5, x1, ror #41 // ........................................................................................*.................... - eor x10, x10, x6, ror #50 // ..........................................................................................*.................. - eor x15, x29, x15, ror #27 // ............................................................................................*................ - bic x29, x6, x1, ror #57 // .............................................................................................*............... - eor x20, x0, x20, ror #21 // ..............................................................................................*.............. - bic x0, x11, x28, ror #63 // ...............................................................................................*............. - add w27, w27, #1 // ................................................................................................*............ - str w27, [sp, #STACK_OFFSET_COUNT] // .................................................................................................*........... // @slothy:writes=STACK_OFFSET_COUNT - eor x25, x29, x25, ror #53 // ..................................................................................................*.......... - bic x6, x16, x11, ror #42 // ...................................................................................................*......... - eor x1, x30, x0, ror #21 // ....................................................................................................*........ - bic x29, x21, x16, ror #57 // .....................................................................................................*....... - bic x0, x30, x21, ror #50 // .......................................................................................................*..... - bic x30, x28, x30, ror #44 // .........................................................................................................*... - eor x6, x6, x28, ror #41 // ......................................................................................................*...... - eor x16, x0, x16, ror #43 // ..........................................................................................................*.. - eor x11, x29, x11, ror #35 // ........................................................................................................*.... - eor x21, x30, x21, ror #30 // ...........................................................................................................*. - eor x1, x1, x26 // ............................................................................................................* - - // ----------------------------------------------- new position -----------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|-------- - // eor x27, x15, x11, ror #52 // *............................................................................................................ - // eor x30, x1, x2, ror #61 // .*........................................................................................................... - // eor x29, x23, x22, ror #50 // ..*.......................................................................................................... - // eor x26, x8, x9, ror #57 // ...*......................................................................................................... - // eor x28, x16, x18, ror #63 // ....*........................................................................................................ - // eor x27, x27, x13, ror #48 // .....*....................................................................................................... - // eor x30, x30, x4, ror #54 // .............*............................................................................................... - // eor x29, x29, x24, ror #34 // ......*...................................................................................................... - // eor x26, x26, x6, ror #51 // .......*..................................................................................................... - // eor x28, x28, x19, ror #37 // ........*.................................................................................................... - // eor x27, x27, x14, ror #10 // .........*................................................................................................... - // eor x30, x30, x3, ror #39 // .................*........................................................................................... - // eor x29, x29, x21, ror #26 // ..........*.................................................................................................. - // eor x26, x26, x10, ror #31 // ...........*................................................................................................. - // eor x28, x28, x17, ror #36 // ............*................................................................................................ - // eor x27, x27, x12, ror #5 // ..............*.............................................................................................. - // eor x30, x30, x5, ror #25 // .....................*....................................................................................... - // eor x29, x29, x25, ror #15 // ...............*............................................................................................. - // eor x26, x26, x7, ror #27 // ................*............................................................................................ - // eor x28, x28, x20, ror #2 // ...................*......................................................................................... - // eor x0, x30, x27, ror #61 // .........................*................................................................................... - // ror x27, x27, #62 // ..................*.......................................................................................... - // eor x27, x27, x29, ror #57 // ....................*........................................................................................ - // ror x29, x29, #58 // ......................*...................................................................................... - // eor x29, x29, x26, ror #55 // .......................*..................................................................................... - // ror x26, x26, #56 // ........................*.................................................................................... - // eor x26, x26, x28, ror #63 // ..........................*.................................................................................. - // eor x28, x28, x30, ror #63 // ...........................*................................................................................. - // eor x30, x29, x1 // ............................*................................................................................ - // eor x1, x26, x11, ror #50 // .............................*............................................................................... - // eor x11, x26, x13, ror #46 // ..............................*.............................................................................. - // eor x13, x27, x18, ror #63 // ...............................*............................................................................. - // eor x18, x28, x24, ror #28 // ................................*............................................................................ - // eor x24, x27, x20, ror #2 // .................................*........................................................................... - // eor x20, x29, x4, ror #54 // ..................................*.......................................................................... - // eor x4, x0, x6, ror #43 // ...................................*......................................................................... - // eor x6, x27, x17, ror #36 // ....................................*........................................................................ - // eor x17, x0, x9, ror #49 // .....................................*....................................................................... - // eor x9, x26, x12, ror #3 // ......................................*...................................................................... - // eor x12, x29, x3, ror #39 // .......................................*..................................................................... - // eor x3, x27, x16 // ........................................*.................................................................... - // eor x16, x27, x19, ror #37 // .........................................*................................................................... - // eor x19, x26, x14, ror #8 // ..........................................*.................................................................. - // eor x14, x0, x8, ror #56 // ...........................................*................................................................. - // eor x8, x28, x22, ror #44 // ............................................*................................................................ - // eor x22, x26, x15, ror #62 // .............................................*............................................................... - // eor x15, x28, x23, ror #58 // ..............................................*.............................................................. - // eor x23, x29, x5, ror #25 // ...............................................*............................................................. - // eor x5, x28, x21, ror #20 // ................................................*............................................................ - // eor x21, x28, x25, ror #9 // .................................................*........................................................... - // eor x25, x0, x10, ror #23 // ...................................................*......................................................... - // eor x10, x29, x2, ror #61 // ....................................................*........................................................ - // eor x28, x0, x7, ror #19 // .....................................................*....................................................... - // ldr x26, [sp, #(STACK_OFFSET_CONST)] // ..................................................*.......................................................... - // ldr w27, [sp, #STACK_OFFSET_COUNT] // .........................................................*................................................... - // bic x0, x12, x8, ror #47 // ......................................................*...................................................... - // bic x29, x17, x12, ror #42 // ........................................................*.................................................... - // eor x2, x0, x3, ror #39 // ..........................................................*.................................................. - // bic x0, x22, x17, ror #16 // .......................................................*..................................................... - // eor x7, x29, x8, ror #25 // ...........................................................*................................................. - // bic x29, x3, x22, ror #31 // ............................................................*................................................ - // eor x12, x0, x12, ror #58 // ..............................................................*.............................................. - // bic x0, x8, x3, ror #56 // .............................................................*............................................... - // eor x17, x29, x17, ror #47 // ...............................................................*............................................. - // bic x29, x13, x9, ror #19 // ................................................................*............................................ - // eor x22, x0, x22, ror #23 // .................................................................*........................................... - // bic x0, x18, x13, ror #47 // ..................................................................*.......................................... - // eor x3, x29, x4, ror #24 // .....................................................................*....................................... - // bic x29, x23, x18, ror #10 // ...................................................................*......................................... - // eor x8, x0, x9, ror #2 // ......................................................................*...................................... - // bic x0, x4, x23, ror #47 // ....................................................................*........................................ - // eor x13, x29, x13, ror #57 // .......................................................................*..................................... - // bic x29, x9, x4, ror #5 // ........................................................................*.................................... - // eor x18, x0, x18, ror #57 // .........................................................................*................................... - // bic x0, x14, x10, ror #38 // ..........................................................................*.................................. - // eor x23, x29, x23, ror #52 // ...........................................................................*................................. - // bic x29, x19, x14, ror #5 // ............................................................................*................................ - // eor x4, x0, x5, ror #47 // .............................................................................*............................... - // bic x0, x24, x19, ror #41 // ..............................................................................*.............................. - // eor x9, x29, x10, ror #43 // ...............................................................................*............................. - // bic x29, x5, x24, ror #35 // ................................................................................*............................ - // eor x14, x0, x14, ror #46 // .................................................................................*........................... - // bic x0, x10, x5, ror #9 // ..................................................................................*.......................... - // ldr x26, [x26, w27, UXTW #3] // ...................................................................................*......................... - // eor x19, x29, x19, ror #12 // ......................................................................................*...................... - // bic x29, x15, x6, ror #48 // .....................................................................................*....................... - // eor x24, x0, x24, ror #44 // .......................................................................................*..................... - // bic x0, x20, x15, ror #2 // ........................................................................................*.................... - // eor x5, x29, x1, ror #41 // ..........................................................................................*.................. - // bic x29, x25, x20, ror #25 // ....................................................................................*........................ - // eor x10, x0, x6, ror #50 // ...........................................................................................*................. - // bic x0, x1, x25, ror #60 // .........................................................................................*................... - // eor x15, x29, x15, ror #27 // ............................................................................................*................ - // bic x29, x6, x1, ror #57 // .............................................................................................*............... - // eor x20, x0, x20, ror #21 // ..............................................................................................*.............. - // bic x0, x11, x28, ror #63 // ...............................................................................................*............. - // add w27, w27, #1 // ................................................................................................*............ - // str w27, [sp, #STACK_OFFSET_COUNT] // .................................................................................................*........... - // eor x25, x29, x25, ror #53 // ..................................................................................................*.......... - // bic x29, x16, x11, ror #42 // ...................................................................................................*......... - // eor x1, x30, x0, ror #21 // ....................................................................................................*........ - // bic x0, x21, x16, ror #57 // .....................................................................................................*....... - // eor x6, x29, x28, ror #41 // ........................................................................................................*.... - // bic x29, x30, x21, ror #50 // ......................................................................................................*...... - // eor x11, x0, x11, ror #35 // ..........................................................................................................*.. - // bic x0, x28, x30, ror #44 // .......................................................................................................*..... - // eor x16, x29, x16, ror #43 // .........................................................................................................*... - // eor x21, x0, x21, ror #30 // ...........................................................................................................*. - // eor x1, x1, x26 // ............................................................................................................* + // Cycle bound: 57.0 + // IPC bound: 1.98 + // + // Wall time: 15.98s + // User time: 15.98s + // + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + eor x27, x1, x2, ror #61 // *........................................................ + eor x28, x27, x4, ror #54 // *........................................................ + eor x28, x28, x3, ror #39 // .*....................................................... + eor x0, x28, x5, ror #25 // .*....................................................... + eor x28, x8, x9, ror #57 // ..*...................................................... + eor x26, x28, x6, ror #51 // ...*..................................................... + eor x28, x26, x10, ror #31 // ...*..................................................... + eor x28, x28, x7, ror #27 // ....*.................................................... + eor x26, x15, x11, ror #52 // ....*.................................................... + eor x26, x26, x13, ror #48 // .....*................................................... + eor x26, x26, x14, ror #10 // .....*................................................... + eor x30, x26, x12, ror #5 // ......*.................................................. + eor x26, x16, x18, ror #63 // ......*.................................................. + eor x26, x26, x19, ror #37 // .......*................................................. + eor x26, x26, x17, ror #36 // .......*................................................. + eor x29, x26, x20, ror #2 // ........*................................................ + eor x26, x23, x22, ror #50 // ........*................................................ + eor x26, x26, x24, ror #34 // .........*............................................... + eor x26, x26, x21, ror #26 // .........*............................................... + eor x27, x26, x25, ror #15 // ..........*.............................................. + eor x26, x0, x30, ror #61 // ..........*.............................................. + ror x30, x30, #62 // ...........*............................................. + eor x30, x30, x27, ror #57 // ...........*............................................. + ror x27, x27, #58 // ............*............................................ + eor x27, x27, x28, ror #55 // ............*............................................ + ror x28, x28, #56 // .............*........................................... + eor x28, x28, x29, ror #63 // .............*........................................... + eor x0, x29, x0, ror #63 // ..............*.......................................... + str x7, [sp, #16] // ..............*.......................................... // @slothy:writes=Age + str x2, [sp, #24] // ...............*......................................... // @slothy:writes=Aga + ldr x2, [sp, #24] // ...............*......................................... // @slothy:reads=Aga + ldr x7, [sp, #16] // ................*........................................ // @slothy:reads=Age + eor x1, x27, x1 // ................*........................................ + eor x11, x28, x11, ror #50 // .................*....................................... + eor x29, x28, x13, ror #46 // .................*....................................... + eor x13, x30, x18, ror #63 // ..................*...................................... + eor x18, x0, x24, ror #28 // ..................*...................................... + eor x24, x30, x20, ror #2 // ...................*..................................... + eor x20, x27, x4, ror #54 // ...................*..................................... + eor x4, x26, x6, ror #43 // ....................*.................................... + eor x6, x30, x17, ror #36 // ....................*.................................... + eor x17, x26, x9, ror #49 // .....................*................................... + eor x9, x28, x12, ror #3 // .....................*................................... + eor x12, x27, x3, ror #39 // ......................*.................................. + eor x3, x30, x16 // ......................*.................................. + eor x16, x30, x19, ror #37 // .......................*................................. + eor x19, x28, x14, ror #8 // .......................*................................. + eor x14, x26, x8, ror #56 // ........................*................................ + eor x8, x0, x22, ror #44 // ........................*................................ + eor x28, x28, x15, ror #62 // .........................*............................... + eor x15, x0, x23, ror #58 // .........................*............................... + eor x23, x27, x5, ror #25 // ..........................*.............................. + eor x21, x0, x21, ror #20 // ..........................*.............................. + eor x30, x0, x25, ror #9 // ...........................*............................. + eor x25, x26, x10, ror #23 // ...........................*............................. + eor x10, x27, x2, ror #61 // ............................*............................ + eor x26, x26, x7, ror #19 // ............................*............................ + ldr x7, [sp, #STACK_OFFSET_CONST] // .............................*........................... + ldr w5, [sp, #STACK_OFFSET_COUNT] // .............................*........................... // @slothy:reads=STACK_OFFSET_COUNT + ldr x0, [x7, w5, UXTW #3] // ..............................*.......................... + add w27, w5, #1 // ..............................*.......................... + str w27, [sp, #STACK_OFFSET_COUNT] // ...............................*......................... // @slothy:writes=STACK_OFFSET_COUNT + bic x5, x8, x12, ror #47 // ...............................*......................... + eor x2, x5, x3, ror #39 // ................................*........................ + bic x5, x12, x17, ror #42 // ................................*........................ + eor x7, x5, x8, ror #25 // .................................*....................... + bic x5, x17, x28, ror #16 // .................................*....................... + eor x12, x5, x12, ror #58 // ..................................*...................... + bic x5, x28, x3, ror #31 // ..................................*...................... + eor x17, x5, x17, ror #47 // ...................................*..................... + bic x5, x3, x8, ror #56 // ...................................*..................... + eor x22, x5, x28, ror #23 // ....................................*.................... + bic x28, x9, x13, ror #19 // ....................................*.................... + eor x3, x28, x4, ror #24 // .....................................*................... + bic x5, x13, x18, ror #47 // .....................................*................... + eor x8, x5, x9, ror #2 // ......................................*.................. + bic x5, x18, x23, ror #10 // ......................................*.................. + eor x13, x5, x13, ror #57 // .......................................*................. + bic x5, x23, x4, ror #47 // .......................................*................. + eor x18, x5, x18, ror #57 // ........................................*................ + bic x5, x4, x9, ror #5 // ........................................*................ + eor x23, x5, x23, ror #52 // .........................................*............... + bic x5, x10, x14, ror #38 // .........................................*............... + eor x4, x5, x21, ror #47 // ..........................................*.............. + bic x5, x14, x19, ror #5 // ..........................................*.............. + eor x9, x5, x10, ror #43 // ...........................................*............. + bic x5, x19, x24, ror #41 // ...........................................*............. + eor x14, x5, x14, ror #46 // ............................................*............ + bic x5, x24, x21, ror #35 // ............................................*............ + eor x19, x5, x19, ror #12 // .............................................*........... + bic x5, x21, x10, ror #9 // .............................................*........... + eor x24, x5, x24, ror #44 // ..............................................*.......... + bic x5, x6, x15, ror #48 // ..............................................*.......... + eor x5, x5, x11, ror #41 // ...............................................*......... + bic x28, x15, x20, ror #2 // ...............................................*......... + eor x10, x28, x6, ror #50 // ................................................*........ + bic x28, x20, x25, ror #25 // ................................................*........ + eor x15, x28, x15, ror #27 // .................................................*....... + bic x28, x25, x11, ror #60 // .................................................*....... + eor x20, x28, x20, ror #21 // ..................................................*...... + bic x28, x11, x6, ror #57 // ..................................................*...... + eor x25, x28, x25, ror #53 // ...................................................*..... + bic x28, x26, x29, ror #63 // ...................................................*..... + eor x21, x28, x1, ror #21 // ....................................................*.... + bic x28, x29, x16, ror #42 // ....................................................*.... + eor x6, x28, x26, ror #41 // .....................................................*... + bic x11, x16, x30, ror #57 // .....................................................*... + bic x28, x30, x1, ror #50 // ......................................................*.. + eor x11, x11, x29, ror #35 // ......................................................*.. + eor x16, x28, x16, ror #43 // .......................................................*. + bic x28, x1, x26, ror #44 // .......................................................*. + eor x1, x21, x0 // ........................................................* + eor x21, x28, x30, ror #30 // ........................................................* + + // ------------------- cycle (expected) -------------------> + // 0 25 50 + // |------------------------|------------------------|------ + // eor X, x1, x2, ror #61 // *........................................................ + // eor X, X, x4, ror #54 // *........................................................ + // eor X, X, x3, ror #39 // .*....................................................... + // eor X, X, x5, ror #25 // .*....................................................... + // eor X, x8, x9, ror #57 // ..*...................................................... + // eor X, X, x6, ror #51 // ...*..................................................... + // eor X, X, x10, ror #31 // ...*..................................................... + // eor X, X, x7, ror #27 // ....*.................................................... + // eor X, x15, x11, ror #52 // ....*.................................................... + // eor X, X, x13, ror #48 // .....*................................................... + // eor X, X, x14, ror #10 // .....*................................................... + // eor X, X, x12, ror #5 // ......*.................................................. + // eor X, x16, x18, ror #63 // ......*.................................................. + // eor X, X, x19, ror #37 // .......*................................................. + // eor X, X, x17, ror #36 // .......*................................................. + // eor X, X, x20, ror #2 // ........*................................................ + // eor X, x23, x22, ror #50 // ........*................................................ + // eor X, X, x24, ror #34 // .........*............................................... + // eor X, X, x21, ror #26 // .........*............................................... + // eor X, X, x25, ror #15 // ..........*.............................................. + // eor X, X, X, ror #61 // ..........*.............................................. + // ror X, X, #62 // ...........*............................................. + // eor X, X, X, ror #57 // ...........*............................................. + // ror X, X, #58 // ............*............................................ + // eor X, X, X, ror #55 // ............*............................................ + // ror X, X, #56 // .............*........................................... + // eor X, X, X, ror #63 // .............*........................................... + // eor X, X, X, ror #63 // ..............*.......................................... + // str x7, [sp, #16] // ..............*.......................................... + // str x2, [sp, #24] // ...............*......................................... + // ldr x2, [sp, #24] // ...............*......................................... + // ldr x7, [sp, #16] // ................*........................................ + // eor x30, X, x1 // ................*........................................ + // eor x1, X, x11, ror #50 // .................*....................................... + // eor x11, X, x13, ror #46 // .................*....................................... + // eor x13, X, x18, ror #63 // ..................*...................................... + // eor x18, X, x24, ror #28 // ..................*...................................... + // eor x24, X, x20, ror #2 // ...................*..................................... + // eor x20, X, x4, ror #54 // ...................*..................................... + // eor x4, X, x6, ror #43 // ....................*.................................... + // eor x6, X, x17, ror #36 // ....................*.................................... + // eor x17, X, x9, ror #49 // .....................*................................... + // eor x9, X, x12, ror #3 // .....................*................................... + // eor x12, X, x3, ror #39 // ......................*.................................. + // eor x3, X, x16 // ......................*.................................. + // eor x16, X, x19, ror #37 // .......................*................................. + // eor x19, X, x14, ror #8 // .......................*................................. + // eor x14, X, x8, ror #56 // ........................*................................ + // eor x8, X, x22, ror #44 // ........................*................................ + // eor x22, X, x15, ror #62 // .........................*............................... + // eor x15, X, x23, ror #58 // .........................*............................... + // eor x23, X, x5, ror #25 // ..........................*.............................. + // eor x5, X, x21, ror #20 // ..........................*.............................. + // eor x21, X, x25, ror #9 // ...........................*............................. + // eor x25, X, x10, ror #23 // ...........................*............................. + // eor x10, X, x2, ror #61 // ............................*............................ + // eor x28, X, x7, ror #19 // ............................*............................ + // ldr x26, [sp, #(STACK_OFFSET_CONST)] // .............................*........................... + // ldr w27, [sp, #STACK_OFFSET_COUNT] // .............................*........................... + // ldr x26, [x26, w27, UXTW #3] // ..............................*.......................... + // add w27, w27, #1 // ..............................*.......................... + // str w27, [sp, #STACK_OFFSET_COUNT] // ...............................*......................... + // bic X, x8, x12, ror #47 // ...............................*......................... + // eor x2, X, x3, ror #39 // ................................*........................ + // bic X, x12, x17, ror #42 // ................................*........................ + // eor x7, X, x8, ror #25 // .................................*....................... + // bic X, x17, x22, ror #16 // .................................*....................... + // eor x12, X, x12, ror #58 // ..................................*...................... + // bic X, x22, x3, ror #31 // ..................................*...................... + // eor x17, X, x17, ror #47 // ...................................*..................... + // bic X, x3, x8, ror #56 // ...................................*..................... + // eor x22, X, x22, ror #23 // ....................................*.................... + // bic X, x9, x13, ror #19 // ....................................*.................... + // eor x3, X, x4, ror #24 // .....................................*................... + // bic X, x13, x18, ror #47 // .....................................*................... + // eor x8, X, x9, ror #2 // ......................................*.................. + // bic X, x18, x23, ror #10 // ......................................*.................. + // eor x13, X, x13, ror #57 // .......................................*................. + // bic X, x23, x4, ror #47 // .......................................*................. + // eor x18, X, x18, ror #57 // ........................................*................ + // bic X, x4, x9, ror #5 // ........................................*................ + // eor x23, X, x23, ror #52 // .........................................*............... + // bic X, x10, x14, ror #38 // .........................................*............... + // eor x4, X, x5, ror #47 // ..........................................*.............. + // bic X, x14, x19, ror #5 // ..........................................*.............. + // eor x9, X, x10, ror #43 // ...........................................*............. + // bic X, x19, x24, ror #41 // ...........................................*............. + // eor x14, X, x14, ror #46 // ............................................*............ + // bic X, x24, x5, ror #35 // ............................................*............ + // eor x19, X, x19, ror #12 // .............................................*........... + // bic X, x5, x10, ror #9 // .............................................*........... + // eor x24, X, x24, ror #44 // ..............................................*.......... + // bic X, x6, x15, ror #48 // ..............................................*.......... + // eor x5, X, x1, ror #41 // ...............................................*......... + // bic X, x15, x20, ror #2 // ...............................................*......... + // eor x10, X, x6, ror #50 // ................................................*........ + // bic X, x20, x25, ror #25 // ................................................*........ + // eor x15, X, x15, ror #27 // .................................................*....... + // bic X, x25, x1, ror #60 // .................................................*....... + // eor x20, X, x20, ror #21 // ..................................................*...... + // bic X, x1, x6, ror #57 // ..................................................*...... + // eor x25, X, x25, ror #53 // ...................................................*..... + // bic X, x28, x11, ror #63 // ...................................................*..... + // eor x1, X, x30, ror #21 // ....................................................*.... + // bic X, x11, x16, ror #42 // ....................................................*.... + // eor x6, X, x28, ror #41 // .....................................................*... + // bic X, x16, x21, ror #57 // .....................................................*... + // eor x11, X, x11, ror #35 // ......................................................*.. + // bic X, x21, x30, ror #50 // ......................................................*.. + // eor x16, X, x16, ror #43 // .......................................................*. + // bic X, x30, x28, ror #44 // .......................................................*. + // eor x21, X, x21, ror #30 // ........................................................* + // eor x1, x1, x26 // ........................................................* end_loop: cmp count, #(KECCAK_F1600_ROUNDS-1) ble loop - - final_rotate_store +final: + final_rotate + ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT + store_state +end_final: restore_gprs free_stack ret \ No newline at end of file